From ad6aae45838a5a05068956860dac3e41035b9dd8 Mon Sep 17 00:00:00 2001 From: yao Date: Tue, 26 Mar 2013 13:41:13 +0800 Subject: [PATCH] more fix of mismatch functions on CPU OCL --- modules/ocl/src/brute_force_matcher.cpp | 692 +++++--------------------- modules/ocl/src/haar.cpp | 4 +- modules/ocl/src/moments.cpp | 6 +- modules/ocl/src/opencl/brute_force_match.cl | 315 ++++-------- modules/ocl/src/opencl/haarobjectdetect.cl | 8 + modules/ocl/test/test_brute_force_matcher.cpp | 4 +- 6 files changed, 231 insertions(+), 798 deletions(-) diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp index 818f3c1..e61a9f6 100644 --- a/modules/ocl/src/brute_force_matcher.cpp +++ b/modules/ocl/src/brute_force_matcher.cpp @@ -44,6 +44,7 @@ //M*/ #include "precomp.hpp" + #include #include #include @@ -60,10 +61,11 @@ namespace cv } } -template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ > void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, int distType) { + assert(query.type() == CV_32F); cv::ocl::Context *ctx = query.clCxt; size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1}; size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1}; @@ -91,20 +93,21 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat std::string kernelName = "BruteForceMatch_UnrollMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); } } -template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ > void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const oclMat /*mask*/, const oclMat &/*bestTrainIdx*/, const oclMat & /*bestImgIdx*/, const oclMat & /*bestDistance*/, int /*distType*/) { } -template < int BLOCK_SIZE, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE/*, typename Mask*/ > void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, int distType) { + assert(query.type() == CV_32F); cv::ocl::Context *ctx = query.clCxt; size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1}; size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1}; @@ -130,21 +133,22 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, std::string kernelName = "BruteForceMatch_Match"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); } } -template < int BLOCK_SIZE, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE/*, typename Mask*/ > void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const oclMat /*mask*/, const oclMat &/*bestTrainIdx*/, const oclMat & /*bestImgIdx*/, const oclMat & /*bestDistance*/, int /*distType*/) { } //radius_matchUnrolledCached -template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ > void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType) { + assert(query.type() == CV_32F); cv::ocl::Context *ctx = query.clCxt; size_t globalSize[] = {(train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1}; size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1}; @@ -176,15 +180,16 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist std::string kernelName = "BruteForceMatch_RadiusUnrollMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); } } //radius_match -template < int BLOCK_SIZE, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE/*, typename Mask*/ > void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType) { + assert(query.type() == CV_32F); cv::ocl::Context *ctx = query.clCxt; size_t globalSize[] = {(train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1}; size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1}; @@ -214,263 +219,70 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c std::string kernelName = "BruteForceMatch_RadiusMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); - //float *dis = (float *)clEnqueueMapBuffer(ctx->impl->clCmdQueue, (cl_mem)distance.data, CL_TRUE, CL_MAP_READ, 0, 8, 0, NULL, NULL, NULL); - //printf("%f, %f\n", dis[0], dis[1]); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); } } -// with mask -template < typename T/*, typename Mask*/ > -void matchDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask, +static void matchDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &trainIdx, const oclMat &distance, int distType) { + const oclMat zeroMask; + const oclMat &tempMask = mask.data ? mask : zeroMask; if (query.cols <= 64) { - matchUnrolledCached<16, 64, T>(query, train, mask, trainIdx, distance, distType); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, T>(query, train, mask, trainIdx, distance, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream); - }*/ - else - { - match<16, T>(query, train, mask, trainIdx, distance, distType); - } -} - -// without mask -template < typename T/*, typename Mask*/ > -void matchDispatcher(const oclMat &query, const oclMat &train, const oclMat &trainIdx, const oclMat &distance, int distType) -{ - oclMat mask; - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, T>(query, train, mask, trainIdx, distance, distType); + matchUnrolledCached<16, 64>(query, train, tempMask, trainIdx, distance, distType); } else if (query.cols <= 128) { - matchUnrolledCached<16, 128, T>(query, train, mask, trainIdx, distance, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance); + matchUnrolledCached<16, 128>(query, train, tempMask, trainIdx, distance, distType); } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance); - }*/ else { - match<16, T>(query, train, mask, trainIdx, distance, distType); + match<16>(query, train, tempMask, trainIdx, distance, distType); } } -template < typename T/*, typename Mask*/ > -void matchDispatcher(const oclMat &query, const oclMat *trains, int n, const oclMat &mask, +static void matchDispatcher(const oclMat &query, const oclMat *trains, int n, const oclMat &mask, const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, int distType) { + const oclMat zeroMask; + const oclMat &tempMask = mask.data ? mask : zeroMask; if (query.cols <= 64) { - matchUnrolledCached<16, 64, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType); + matchUnrolledCached<16, 64>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType); } else if (query.cols <= 128) { - matchUnrolledCached<16, 128, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + matchUnrolledCached<16, 128>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType); } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - }*/ else { - match<16, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType); - } -} - -template < typename T/*, typename Mask*/ > -void matchDispatcher(const oclMat &query, const oclMat *trains, int n, const oclMat &trainIdx, - const oclMat &imgIdx, const oclMat &distance, int distType) -{ - oclMat mask; - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - }*/ - else - { - match<16, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType); + match<16>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType); } } //radius matchDispatcher -// with mask -template < typename T/*, typename Mask*/ > -void matchDispatcher(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask, +static void matchDispatcher(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask, const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType) { + const oclMat zeroMask; + const oclMat &tempMask = mask.data ? mask : zeroMask; if (query.cols <= 64) { - matchUnrolledCached<16, 64, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); + matchUnrolledCached<16, 64>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType); } else if (query.cols <= 128) { - matchUnrolledCached<16, 128, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + matchUnrolledCached<16, 128>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType); } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - }*/ - else - { - radius_match<16, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); - } -} - -// without mask -template < typename T/*, typename Mask*/ > -void matchDispatcher(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &trainIdx, - const oclMat &distance, const oclMat &nMatches, int distType) -{ - oclMat mask; - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - }*/ else { - radius_match<16, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); - } -} - -template < typename T/*, typename Mask*/ > -void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxDistance, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType) -{ - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - }*/ - else - { - match<16, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType); - } -} - -// without mask -template < typename T/*, typename Mask*/ > -void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxDistance, const oclMat &trainIdx, - const oclMat &distance, const oclMat &nMatches, int distType) -{ - oclMat mask; - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - }*/ - else - { - match<16, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType); + radius_match<16>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType); } } //knn match Dispatcher -template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ > void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, int distType) { @@ -501,11 +313,11 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl std::string kernelName = "BruteForceMatch_knnUnrollMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); } } -template < int BLOCK_SIZE, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE/*, typename Mask*/ > void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, int distType) { @@ -534,11 +346,11 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, std::string kernelName = "BruteForceMatch_knnMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); } } -template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ > void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType) { cv::ocl::Context *ctx = query.clCxt; @@ -567,11 +379,11 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat std::string kernelName = "BruteForceMatch_calcDistanceUnrolled"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); } } -template < int BLOCK_SIZE, typename T/*, typename Mask*/ > +template < int BLOCK_SIZE/*, typename Mask*/ > void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType) { cv::ocl::Context *ctx = query.clCxt; @@ -598,69 +410,43 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask std::string kernelName = "BruteForceMatch_calcDistance"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); } } /////////////////////////////////////////////////////////////////////////////// // Calc Distance dispatcher -template < typename T/*, typename Mask*/ > -void calcDistanceDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask, +static void calcDistanceDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &allDist, int distType) { if (query.cols <= 64) { - calcDistanceUnrolled<16, 64, T>(query, train, mask, allDist, distType); + calcDistanceUnrolled<16, 64>(query, train, mask, allDist, distType); } else if (query.cols <= 128) { - calcDistanceUnrolled<16, 128, T>(query, train, mask, allDist, distType); + calcDistanceUnrolled<16, 128>(query, train, mask, allDist, distType); } - /*else if (query.cols <= 256) - { - calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream); - } - else if (query.cols <= 512) - { - calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream); - } - else if (query.cols <= 1024) - { - calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream); - }*/ else { - calcDistance<16, T>(query, train, mask, allDist, distType); + calcDistance<16>(query, train, mask, allDist, distType); } } -template < typename T/*, typename Mask*/ > -void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &mask, +static void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &trainIdx, const oclMat &distance, int distType) { if (query.cols <= 64) { - knn_matchUnrolledCached<16, 64, T>(query, train, mask, trainIdx, distance, distType); + knn_matchUnrolledCached<16, 64>(query, train, mask, trainIdx, distance, distType); } else if (query.cols <= 128) { - knn_matchUnrolledCached<16, 128, T>(query, train, mask, trainIdx, distance, distType); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + knn_matchUnrolledCached<16, 128>(query, train, mask, trainIdx, distance, distType); } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - }*/ else { - knn_match<16, T>(query, train, mask, trainIdx, distance, distType); + knn_match<16>(query, train, mask, trainIdx, distance, distType); } } @@ -686,7 +472,7 @@ void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const o //args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols )); //args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, trainIdx.depth(), -1); } } @@ -695,206 +481,22 @@ static void findKnnMatchDispatcher(int k, const oclMat &trainIdx, const oclMat & findKnnMatch<256>(k, trainIdx, distance, allDist, distType); } -//with mask -template < typename T/*, typename Mask*/ > -void kmatchDispatcher(const oclMat &query, const oclMat &train, int k, const oclMat &mask, +static void kmatchDispatcher(const oclMat &query, const oclMat &train, int k, const oclMat &mask, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType) { + const oclMat zeroMask; + const oclMat &tempMask = mask.data ? mask : zeroMask; if (k == 2) { - match2Dispatcher(query, train, mask, trainIdx, distance, distType); + match2Dispatcher(query, train, tempMask, trainIdx, distance, distType); } else { - calcDistanceDispatcher(query, train, mask, allDist, distType); + calcDistanceDispatcher(query, train, tempMask, allDist, distType); findKnnMatchDispatcher(k, trainIdx, distance, allDist, distType); } } -//without mask -template < typename T/*, typename Mask*/ > -void kmatchDispatcher(const oclMat &query, const oclMat &train, int k, - const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType) -{ - oclMat mask; - if (k == 2) - { - match2Dispatcher(query, train, mask, trainIdx, distance, distType); - } - else - { - calcDistanceDispatcher(query, train, mask, allDist, distType); - findKnnMatchDispatcher(k, trainIdx, distance, allDist, distType); - } -} - - - -template -void ocl_matchL1_gpu(const oclMat &query, const oclMat &train, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance) -{ - int distType = 0; - if (mask.data) - { - matchDispatcher(query, train, mask, trainIdx, distance, distType); - } - else - { - matchDispatcher< T >(query, train, trainIdx, distance, distType); - } -} - -template -void ocl_matchL1_gpu(const oclMat &query, const oclMat &trains, const oclMat &masks, - const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance) -{ - int distType = 0; - - if (masks.data) - { - matchDispatcher(query, (const oclMat *)trains.ptr(), trains.cols, masks, trainIdx, imgIdx, distance, distType); - } - else - { - matchDispatcher(query, (const oclMat *)trains.ptr(), trains.cols, trainIdx, imgIdx, distance, distType); - } -} - -template -void ocl_matchL2_gpu(const oclMat &query, const oclMat &train, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance) -{ - int distType = 1; - if (mask.data) - { - matchDispatcher(query, train, mask, trainIdx, distance, distType); - } - else - { - matchDispatcher(query, train, trainIdx, distance, distType); - } -} - -template -void ocl_matchL2_gpu(const oclMat &query, const oclMat &trains, const oclMat &masks, - const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance) -{ - int distType = 1; - if (masks.data) - { - matchDispatcher(query, (const oclMat *)trains.ptr(), trains.cols, masks, trainIdx, imgIdx, distance, distType); - } - else - { - matchDispatcher(query, (const oclMat *)trains.ptr(), trains.cols, trainIdx, imgIdx, distance, distType); - } -} - -template -void ocl_matchHamming_gpu(const oclMat &query, const oclMat &train, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance) -{ - int distType = 2; - if (mask.data) - { - matchDispatcher(query, train, mask, trainIdx, distance, distType); - } - else - { - matchDispatcher< T >(query, train, trainIdx, distance, distType); - } -} - -template -void ocl_matchHamming_gpu(const oclMat &query, const oclMat &trains, const oclMat &masks, - const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance) -{ - int distType = 2; - if (masks.data) - { - matchDispatcher(query, (const oclMat *)trains.ptr(), trains.cols, masks, trainIdx, imgIdx, distance, distType); - } - else - { - matchDispatcher(query, (const oclMat *)trains.ptr(), trains.cols, trainIdx, imgIdx, distance, distType); - } -} - -// knn caller -template -void ocl_matchL1_gpu(const oclMat &query, const oclMat &train, int k, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist) -{ - int distType = 0; - - if (mask.data) - kmatchDispatcher(query, train, k, mask, trainIdx, distance, allDist, distType); - else - kmatchDispatcher(query, train, k, trainIdx, distance, allDist, distType); -} - -template -void ocl_matchL2_gpu(const oclMat &query, const oclMat &train, int k, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist) -{ - int distType = 1; - - if (mask.data) - kmatchDispatcher(query, train, k, mask, trainIdx, distance, allDist, distType); - else - kmatchDispatcher(query, train, k, trainIdx, distance, allDist, distType); -} - -template -void ocl_matchHamming_gpu(const oclMat &query, const oclMat &train, int k, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist) -{ - int distType = 2; - - if (mask.data) - kmatchDispatcher(query, train, k, mask, trainIdx, distance, allDist, distType); - else - kmatchDispatcher(query, train, k, trainIdx, distance, allDist, distType); -} - -//radius caller -template -void ocl_matchL1_gpu(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches) -{ - int distType = 0; - - if (mask.data) - matchDispatcher(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); - else - matchDispatcher(query, train, maxDistance, trainIdx, distance, nMatches, distType); -} - -template -void ocl_matchL2_gpu(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches) -{ - int distType = 1; - - if (mask.data) - matchDispatcher(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); - else - matchDispatcher(query, train, maxDistance, trainIdx, distance, nMatches, distType); -} - -template -void ocl_matchHamming_gpu(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask, - const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches) -{ - int distType = 2; - - if (mask.data) - matchDispatcher(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); - else - matchDispatcher(query, train, maxDistance, trainIdx, distance, nMatches, distType); -} - cv::ocl::BruteForceMatcher_OCL_base::BruteForceMatcher_OCL_base(DistType distType_) : distType(distType_) { } @@ -929,38 +531,28 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchSingle(const oclMat &query, const { if (query.empty() || train.empty()) return; + + // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int + int callType = query.depth(); + char cvFuncName[] = "singleMatch"; + if (callType != 5) + CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n"); - typedef void (*caller_t)(const oclMat & query, const oclMat & train, const oclMat & mask, - const oclMat & trainIdx, const oclMat & distance); - - static const caller_t callers[3][6] = + if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0 + || callType != 2 || callType != 4))) { - { - ocl_matchL1_gpu, 0/*ocl_matchL1_gpu*/, - ocl_matchL1_gpu, ocl_matchL1_gpu, - ocl_matchL1_gpu, ocl_matchL1_gpu - }, - { - 0/*ocl_matchL2_gpu*/, 0/*ocl_matchL2_gpu*/, - 0/*ocl_matchL2_gpu*/, 0/*ocl_matchL2_gpu*/, - 0/*ocl_matchL2_gpu*/, ocl_matchL2_gpu - }, - { - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/, - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/, - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/ - } - }; + CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + } CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(train.cols == query.cols && train.type() == query.type()); - const int nQuery = query.rows; - trainIdx.create(1, nQuery, CV_32S); - distance.create(1, nQuery, CV_32F); + trainIdx.create(1, query.rows, CV_32S); + distance.create(1, query.rows, CV_32F); - caller_t func = callers[distType][query.depth()]; - func(query, train, mask, trainIdx, distance); + matchDispatcher(query, train, mask, trainIdx, distance, distType); +exit: + return; } void cv::ocl::BruteForceMatcher_OCL_base::matchDownload(const oclMat &trainIdx, const oclMat &distance, vector &matches) @@ -1062,40 +654,27 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchCollection(const oclMat &query, c if (query.empty() || trainCollection.empty()) return; - typedef void (*caller_t)(const oclMat & query, const oclMat & trains, const oclMat & masks, - const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance); + // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int + int callType = query.depth(); + char cvFuncName[] = "matchCollection"; + if (callType != 5) + CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n"); - static const caller_t callers[3][6] = + if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0 + || callType != 2 || callType != 4))) { - { - ocl_matchL1_gpu, 0/*matchL1_gpu*/, - ocl_matchL1_gpu, ocl_matchL1_gpu, - ocl_matchL1_gpu, ocl_matchL1_gpu - }, - { - 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, - 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, - 0/*matchL2_gpu*/, ocl_matchL2_gpu - }, - { - ocl_matchHamming_gpu, 0/*matchHamming_gpu*/, - ocl_matchHamming_gpu, 0/*matchHamming_gpu*/, - ocl_matchHamming_gpu, 0/*matchHamming_gpu*/ - } - }; + CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + } CV_Assert(query.channels() == 1 && query.depth() < CV_64F); - const int nQuery = query.rows; - - trainIdx.create(1, nQuery, CV_32S); - imgIdx.create(1, nQuery, CV_32S); - distance.create(1, nQuery, CV_32F); - - caller_t func = callers[distType][query.depth()]; - CV_Assert(func != 0); + trainIdx.create(1, query.rows, CV_32S); + imgIdx.create(1, query.rows, CV_32S); + distance.create(1, query.rows, CV_32F); - func(query, trainCollection, masks, trainIdx, imgIdx, distance); + matchDispatcher(query, (const oclMat *)trainCollection.ptr(), trainCollection.cols, masks, trainIdx, imgIdx, distance, distType); +exit: + return; } void cv::ocl::BruteForceMatcher_OCL_base::matchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, vector &matches) @@ -1164,52 +743,39 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatchSingle(const oclMat &query, co if (query.empty() || train.empty()) return; - typedef void (*caller_t)(const oclMat & query, const oclMat & train, int k, const oclMat & mask, - const oclMat & trainIdx, const oclMat & distance, const oclMat & allDist); + // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int + int callType = query.depth(); - static const caller_t callers[3][6] = + char cvFuncName[] = "knnMatchSingle"; + if (callType != 5) + CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + + if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0 + || callType != 2 || callType != 4))) { - { - ocl_matchL1_gpu, 0/*ocl_matchL1_gpu*/, - ocl_matchL1_gpu, ocl_matchL1_gpu, - ocl_matchL1_gpu, ocl_matchL1_gpu - }, - { - 0/*ocl_matchL2_gpu*/, 0/*ocl_matchL2_gpu*/, - 0/*ocl_matchL2_gpu*/, 0/*ocl_matchL2_gpu*/, - 0/*ocl_matchL2_gpu*/, ocl_matchL2_gpu - }, - { - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/, - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/, - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/ - } - }; + CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + } CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(train.type() == query.type() && train.cols == query.cols); - const int nQuery = query.rows; - const int nTrain = train.rows; - if (k == 2) { - trainIdx.create(1, nQuery, CV_32SC2); - distance.create(1, nQuery, CV_32FC2); + trainIdx.create(1, query.rows, CV_32SC2); + distance.create(1, query.rows, CV_32FC2); } else { - trainIdx.create(nQuery, k, CV_32S); - distance.create(nQuery, k, CV_32F); - allDist.create(nQuery, nTrain, CV_32FC1); + trainIdx.create(query.rows, k, CV_32S); + distance.create(query.rows, k, CV_32F); + allDist.create(query.rows, train.rows, CV_32FC1); } trainIdx.setTo(Scalar::all(-1)); - caller_t func = callers[distType][query.depth()]; - CV_Assert(func != 0); - - func(query, train, k, mask, trainIdx, distance, allDist); + kmatchDispatcher(query, train, k, mask, trainIdx, distance, allDist, distType); +exit: + return; } void cv::ocl::BruteForceMatcher_OCL_base::knnMatchDownload(const oclMat &trainIdx, const oclMat &distance, vector< vector > &matches, bool compactResult) @@ -1394,8 +960,6 @@ namespace void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, vector< vector > &matches, int k, const vector &masks, bool compactResult) { - - if (k == 2) { oclMat trainCollection; @@ -1455,50 +1019,34 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query, if (query.empty() || train.empty()) return; - typedef void (*caller_t)(const oclMat & query, const oclMat & train, float maxDistance, const oclMat & mask, - const oclMat & trainIdx, const oclMat & distance, const oclMat & nMatches); + // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int + int callType = query.depth(); + char cvFuncName[] = "radiusMatchSingle"; + if (callType != 5) + CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n"); - //#if 0 - static const caller_t callers[3][6] = + if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0 + || callType != 2 || callType != 4))) { - { - ocl_matchL1_gpu, 0/*ocl_matchL1_gpu*/, - ocl_matchL1_gpu, ocl_matchL1_gpu, - ocl_matchL1_gpu, ocl_matchL1_gpu - }, - { - 0/*ocl_matchL2_gpu*/, 0/*ocl_matchL2_gpu*/, - 0/*ocl_matchL2_gpu*/, 0/*ocl_matchL2_gpu*/, - 0/*ocl_matchL2_gpu*/, ocl_matchL2_gpu - }, - { - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/, - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/, - ocl_matchHamming_gpu, 0/*ocl_matchHamming_gpu*/ - } - }; - //#endif - - const int nQuery = query.rows; - const int nTrain = train.rows; + CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + } CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(train.type() == query.type() && train.cols == query.cols); - CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size())); + CV_Assert(trainIdx.empty() || (trainIdx.rows == query.rows && trainIdx.size() == distance.size())); - nMatches.create(1, nQuery, CV_32SC1); + nMatches.create(1, query.rows, CV_32SC1); if (trainIdx.empty()) { - trainIdx.create(nQuery, std::max((nTrain / 100), 10), CV_32SC1); - distance.create(nQuery, std::max((nTrain / 100), 10), CV_32FC1); + trainIdx.create(query.rows, std::max((train.rows/ 100), 10), CV_32SC1); + distance.create(query.rows, std::max((train.rows/ 100), 10), CV_32FC1); } nMatches.setTo(Scalar::all(0)); - caller_t func = callers[distType][query.depth()]; - //CV_Assert(func != 0); - //func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); - func(query, train, maxDistance, mask, trainIdx, distance, nMatches); + matchDispatcher(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); +exit: + return; } void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchDownload(const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, @@ -1697,5 +1245,3 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, vecto radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks); radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult); } - - diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp index 4e0f5b8..1c727f0 100644 --- a/modules/ocl/src/haar.cpp +++ b/modules/ocl/src/haar.cpp @@ -953,8 +953,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS //int flag = 0; oclMat gimg1(gimg.rows, gimg.cols, CV_8UC1); - oclMat gsum(totalheight, gimg.cols + 1, CV_32SC1); - oclMat gsqsum(totalheight, gimg.cols + 1, CV_32FC1); + oclMat gsum(totalheight + 4, gimg.cols + 1, CV_32SC1); + oclMat gsqsum(totalheight + 4, gimg.cols + 1, CV_32FC1); //cl_mem cascadebuffer; cl_mem stagebuffer; diff --git a/modules/ocl/src/moments.cpp b/modules/ocl/src/moments.cpp index 9679a7b..8028ca5 100644 --- a/modules/ocl/src/moments.cpp +++ b/modules/ocl/src/moments.cpp @@ -106,7 +106,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom ) bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2; - if (!cv::ocl::Context::getContext()->impl->double_support && is_float) + if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE) && is_float) { CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!"); } @@ -146,7 +146,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom ) cv::Mat dst(dst_a); a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0; - if (!cv::ocl::Context::getContext()->impl->double_support) + if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE)) { for (int i = 0; i < contour->total; ++i) { @@ -161,7 +161,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom ) a12 += dst.at(8, i); a03 += dst.at(9, i); } - } + } else { a00 = cv::sum(dst.row(0))[0]; diff --git a/modules/ocl/src/opencl/brute_force_match.cl b/modules/ocl/src/opencl/brute_force_match.cl index 0730ac5..e76fb1d 100644 --- a/modules/ocl/src/opencl/brute_force_match.cl +++ b/modules/ocl/src/opencl/brute_force_match.cl @@ -5,19 +5,93 @@ int bit1Count(float x) { int c = 0; int ix = (int)x; - for (int i = 0 ; i < 32 ; i++) { c += ix & 0x1; ix >>= 1; } - return (float)c; } + +float reduce_block(__local float *s_query, + __local float *s_train, + int block_size, + int lidx, + int lidy, + int distType + ) +{ + /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to + sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ + float result = 0; + switch(distType) + { + case 0: + for (int j = 0 ; j < block_size ; j++) + { + result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); + } + break; + case 1: + for (int j = 0 ; j < block_size ; j++) + { + float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; + result += qr * qr; + } + break; + case 2: + for (int j = 0 ; j < block_size ; j++) + { + result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]); + } + break; + } + return result; +} + +float reduce_multi_block(__local float *s_query, + __local float *s_train, + int max_desc_len, + int block_size, + int block_index, + int lidx, + int lidy, + int distType + ) +{ + /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to + sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ + float result = 0; + switch(distType) + { + case 0: + for (int j = 0 ; j < block_size ; j++) + { + result += fabs(s_query[lidy * max_desc_len + block_index * block_size + j] - s_train[j * block_size + lidx]); + } + break; + case 1: + for (int j = 0 ; j < block_size ; j++) + { + float qr = s_query[lidy * max_desc_len + block_index * block_size + j] - s_train[j * block_size + lidx]; + result += qr * qr; + } + break; + case 2: + for (int j = 0 ; j < block_size ; j++) + { + //result += popcount((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]); + result += bit1Count((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]); + } + break; + } + return result; +} + /* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size local size: dim0 is block_size, dim1 is block_size. */ -__kernel void BruteForceMatch_UnrollMatch( +__kernel void BruteForceMatch_UnrollMatch_D5( __global float *query, __global float *train, //__global float *mask, @@ -42,7 +116,6 @@ __kernel void BruteForceMatch_UnrollMatch( __local float *s_train = sharebuffer + block_size * max_desc_len; int queryIdx = groupidx * block_size + lidy; - // load the query into local memory. for (int i = 0 ; i < max_desc_len / block_size; i ++) { @@ -55,11 +128,9 @@ __kernel void BruteForceMatch_UnrollMatch( // loopUnrolledCached to find the best trainIdx and best distance. volatile int imgIdx = 0; - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) { float result = 0; - for (int i = 0 ; i < max_desc_len / block_size ; i++) { //load a block_size * block_size block into local train. @@ -69,38 +140,7 @@ __kernel void BruteForceMatch_UnrollMatch( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]); - } - - break; - } + result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -116,8 +156,8 @@ __kernel void BruteForceMatch_UnrollMatch( } barrier(CLK_LOCAL_MEM_FENCE); - __local float *s_distance = (__local float *)(sharebuffer); - __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); + __local float *s_distance = (__local float*)(sharebuffer); + __local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); //find BestMatch s_distance += lidy * block_size; @@ -144,7 +184,7 @@ __kernel void BruteForceMatch_UnrollMatch( } } -__kernel void BruteForceMatch_Match( +__kernel void BruteForceMatch_Match_D5( __global float *query, __global float *train, //__global float *mask, @@ -177,7 +217,6 @@ __kernel void BruteForceMatch_Match( { //Dist dist; float result = 0; - for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) { const int loadx = lidx + i * block_size; @@ -193,38 +232,7 @@ __kernel void BruteForceMatch_Match( barrier(CLK_LOCAL_MEM_FENCE); - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]); - } - - break; - } + result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -270,7 +278,7 @@ __kernel void BruteForceMatch_Match( } //radius_unrollmatch -__kernel void BruteForceMatch_RadiusUnrollMatch( +__kernel void BruteForceMatch_RadiusUnrollMatch_D5( __global float *query, __global float *train, float maxDistance, @@ -303,7 +311,6 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( __local float *s_train = sharebuffer + block_size * block_size; float result = 0; - for (int i = 0 ; i < max_desc_len / block_size ; ++i) { //load a block_size * block_size block into local train. @@ -315,37 +322,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; ++j) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; ++j) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; ++j) - { - result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]); - } - - break; - } + result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -354,7 +331,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( { unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); - if (ind < bestTrainIdx_cols) + if(ind < bestTrainIdx_cols) { //bestImgIdx = imgIdx; bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; @@ -364,7 +341,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( } //radius_match -__kernel void BruteForceMatch_RadiusMatch( +__kernel void BruteForceMatch_RadiusMatch_D5( __global float *query, __global float *train, float maxDistance, @@ -396,7 +373,6 @@ __kernel void BruteForceMatch_RadiusMatch( __local float *s_train = sharebuffer + block_size * block_size; float result = 0; - for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i) { //load a block_size * block_size block into local train. @@ -408,46 +384,16 @@ __kernel void BruteForceMatch_RadiusMatch( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; ++j) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; ++j) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; ++j) - { - result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]); - } - - break; - } + result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/) { - unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); + unsigned int ind = atom_inc(nMatches + queryIdx); - if (ind < bestTrainIdx_cols) + if(ind < bestTrainIdx_cols) { //bestImgIdx = imgIdx; bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; @@ -457,7 +403,7 @@ __kernel void BruteForceMatch_RadiusMatch( } -__kernel void BruteForceMatch_knnUnrollMatch( +__kernel void BruteForceMatch_knnUnrollMatch_D5( __global float *query, __global float *train, //__global float *mask, @@ -496,11 +442,9 @@ __kernel void BruteForceMatch_knnUnrollMatch( //loopUnrolledCached volatile int imgIdx = 0; - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) { float result = 0; - for (int i = 0 ; i < max_desc_len / block_size ; i++) { const int loadX = lidx + i * block_size; @@ -511,38 +455,7 @@ __kernel void BruteForceMatch_knnUnrollMatch( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]); - } - - break; - } + result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -589,7 +502,6 @@ __kernel void BruteForceMatch_knnUnrollMatch( for (int i = 0 ; i < block_size ; i++) { float val = s_distance[i]; - if (val < bestDistance1) { bestDistance2 = bestDistance1; @@ -640,7 +552,7 @@ __kernel void BruteForceMatch_knnUnrollMatch( } } -__kernel void BruteForceMatch_knnMatch( +__kernel void BruteForceMatch_knnMatch_D5( __global float *query, __global float *train, //__global float *mask, @@ -673,8 +585,7 @@ __kernel void BruteForceMatch_knnMatch( for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) { float result = 0.0f; - - for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) + for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++) { const int loadx = lidx + i * block_size; //load query and train into local memory @@ -689,38 +600,7 @@ __kernel void BruteForceMatch_knnMatch( barrier(CLK_LOCAL_MEM_FENCE); - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]); - } - - break; - } + result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -767,7 +647,6 @@ __kernel void BruteForceMatch_knnMatch( for (int i = 0 ; i < block_size ; i++) { float val = s_distance[i]; - if (val < bestDistance1) { bestDistance2 = bestDistance1; @@ -818,7 +697,7 @@ __kernel void BruteForceMatch_knnMatch( } } -kernel void BruteForceMatch_calcDistanceUnrolled( +kernel void BruteForceMatch_calcDistanceUnrolled_D5( __global float *query, __global float *train, //__global float *mask, @@ -836,7 +715,7 @@ kernel void BruteForceMatch_calcDistanceUnrolled( /* Todo */ } -kernel void BruteForceMatch_calcDistance( +kernel void BruteForceMatch_calcDistance_D5( __global float *query, __global float *train, //__global float *mask, @@ -853,7 +732,7 @@ kernel void BruteForceMatch_calcDistance( /* Todo */ } -kernel void BruteForceMatch_findBestMatch( +kernel void BruteForceMatch_findBestMatch_D5( __global float *allDist, __global int *bestTrainIdx, __global float *bestDistance, diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl index 2fa0906..9e468b0 100644 --- a/modules/ocl/src/opencl/haarobjectdetect.cl +++ b/modules/ocl/src/opencl/haarobjectdetect.cl @@ -211,10 +211,14 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int4 data = *(__global int4*)&sum[glb_off]; int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2); +#if OFF lcldata[lcl_off] = data.x; lcldata[lcl_off+1] = data.y; lcldata[lcl_off+2] = data.z; lcldata[lcl_off+3] = data.w; +#else + vstore4(data, 0, &lcldata[lcl_off]); +#endif } lcloutindex[lcl_id] = 0; @@ -559,3 +563,7 @@ if(result) } } */ + + + + diff --git a/modules/ocl/test/test_brute_force_matcher.cpp b/modules/ocl/test/test_brute_force_matcher.cpp index bdf1f8a..424781f 100644 --- a/modules/ocl/test/test_brute_force_matcher.cpp +++ b/modules/ocl/test/test_brute_force_matcher.cpp @@ -110,7 +110,7 @@ namespace } }; - TEST_P(BruteForceMatcher, DISABLED_Match_Single) + TEST_P(BruteForceMatcher, Match_Single) { cv::ocl::BruteForceMatcher_OCL_base matcher(distType); @@ -130,7 +130,7 @@ namespace ASSERT_EQ(0, badCount); } - TEST_P(BruteForceMatcher, DISABLED_KnnMatch_2_Single) + TEST_P(BruteForceMatcher, KnnMatch_2_Single) { const int knn = 2; -- 2.7.4