From 6eefd276cf0b9f403bd963686772c5e1620bbb89 Mon Sep 17 00:00:00 2001 From: peng xiao Date: Fri, 12 Apr 2013 16:51:36 +0800 Subject: [PATCH] Further optimize bfmatcher by passing macros. --- modules/ocl/src/brute_force_matcher.cpp | 47 ++++++------ modules/ocl/src/opencl/brute_force_match.cl | 106 +++++++++------------------- 2 files changed, 60 insertions(+), 93 deletions(-) diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp index ee0989d..c314304 100644 --- a/modules/ocl/src/brute_force_matcher.cpp +++ b/modules/ocl/src/brute_force_matcher.cpp @@ -16,6 +16,7 @@ // // @Authors // Nathan, liujun@multicorewareinc.com +// Peng Xiao, pengxiao@outlook.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -61,6 +62,8 @@ namespace cv } } +static const int OPT_SIZE = 100; + template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ > void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, int distType) @@ -74,9 +77,9 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat int m_size = MAX_DESC_LEN; vector< pair > args; - static const int OPT_SIZE = 40; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size); + sprintf(opt, "-D distType=%d -D block_size=%d -D max_desc_len=%d", distType, block_size, m_size); + if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -90,7 +93,6 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&distType )); std::string kernelName = "BruteForceMatch_UnrollMatch"; @@ -116,9 +118,9 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, int block_size = BLOCK_SIZE; vector< pair > args; - static const int OPT_SIZE = 40; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D block_size=%d", block_size); + sprintf(opt, "-D distType=%d -D block_size=%d", distType, block_size); + if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -132,7 +134,6 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&distType )); std::string kernelName = "BruteForceMatch_Match"; @@ -160,9 +161,9 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist int m_size = MAX_DESC_LEN; vector< pair > args; - static const int OPT_SIZE = 40; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size); + sprintf(opt, "-D distType=%d -D block_size=%d -D max_desc_len=%d", distType, block_size, m_size); + if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -180,7 +181,6 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist args.push_back( make_pair( sizeof(cl_int), (void *)&trainIdx.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&trainIdx.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&distType )); std::string kernelName = "BruteForceMatch_RadiusUnrollMatch"; @@ -201,9 +201,9 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c int block_size = BLOCK_SIZE; vector< pair > args; - static const int OPT_SIZE = 40; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D block_size=%d", block_size); + sprintf(opt, "-D distType=%d -D block_size=%d", distType, block_size); + if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -221,7 +221,6 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c args.push_back( make_pair( sizeof(cl_int), (void *)&trainIdx.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&trainIdx.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&distType )); std::string kernelName = "BruteForceMatch_RadiusMatch"; @@ -300,9 +299,9 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl int m_size = MAX_DESC_LEN; vector< pair > args; - static const int OPT_SIZE = 40; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size); + sprintf(opt, "-D distType=%d -D block_size=%d -D max_desc_len=%d", distType, block_size, m_size); + if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -316,7 +315,6 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&distType )); std::string kernelName = "BruteForceMatch_knnUnrollMatch"; @@ -335,9 +333,9 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, int block_size = BLOCK_SIZE; vector< pair > args; - static const int OPT_SIZE = 40; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D block_size=%d", block_size); + sprintf(opt, "-D distType=%d -D block_size=%d", distType, block_size); + if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -351,7 +349,6 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&distType )); std::string kernelName = "BruteForceMatch_knnMatch"; @@ -370,6 +367,8 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat int m_size = MAX_DESC_LEN; vector< pair > args; + char opt [OPT_SIZE] = ""; + sprintf(opt, "-D distType=%d", distType); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -384,11 +383,10 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&distType )); std::string kernelName = "BruteForceMatch_calcDistanceUnrolled"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt); } } @@ -402,6 +400,8 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask int block_size = BLOCK_SIZE; vector< pair > args; + char opt [OPT_SIZE] = ""; + sprintf(opt, "-D distType=%d", distType); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -415,11 +415,10 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&distType )); std::string kernelName = "BruteForceMatch_calcDistance"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt); } } @@ -676,12 +675,14 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchCollection(const oclMat &query, c } CV_Assert(query.channels() == 1 && query.depth() < CV_64F); + const int nQuery = query.rows; ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx); ensureSizeIsEnough(1, nQuery, CV_32S, imgIdx); ensureSizeIsEnough(1, nQuery, CV_32F, distance); + matchDispatcher(query, (const oclMat *)trainCollection.ptr(), trainCollection.cols, masks, trainIdx, imgIdx, distance, distType); exit: return; @@ -771,6 +772,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatchSingle(const oclMat &query, co const int nQuery = query.rows; const int nTrain = train.rows; + if (k == 2) { ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx); @@ -1045,6 +1047,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query, const int nQuery = query.rows; const int nTrain = train.rows; + CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(train.type() == query.type() && train.cols == query.cols); CV_Assert(trainIdx.empty() || (trainIdx.rows == query.rows && trainIdx.size() == distance.size())); diff --git a/modules/ocl/src/opencl/brute_force_match.cl b/modules/ocl/src/opencl/brute_force_match.cl index 7821920..4e069ef 100644 --- a/modules/ocl/src/opencl/brute_force_match.cl +++ b/modules/ocl/src/opencl/brute_force_match.cl @@ -66,37 +66,30 @@ int bit1Count(float x) return (float)c; } +#ifndef distType +#define distType 0 +#endif + +#if (distType == 0) +#define DIST(x, y) fabs((x) - (y)) +#elif (distType == 1) +#define DIST(x, y) (((x) - (y)) * ((x) - (y))) +#elif (distType == 2) +#define DIST(x, y) bit1Count((uint)(x) ^ (uint)(y)) +#endif + + float reduce_block(__local float *s_query, __local float *s_train, int lidx, - int lidy, - int distType + int lidy ) { - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ float result = 0; - switch(distType) + #pragma unroll + for (int j = 0 ; j < block_size ; j++) { - case 0: - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - break; - case 1: - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - break; - case 2: - for (int j = 0 ; j < block_size ; j++) - { - result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]); - } - break; + result += DIST(s_query[lidy * block_size + j], s_train[j * block_size + lidx]); } return result; } @@ -105,35 +98,14 @@ float reduce_multi_block(__local float *s_query, __local float *s_train, int block_index, int lidx, - int lidy, - int distType + int lidy ) { - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ float result = 0; - switch(distType) + #pragma unroll + for (int j = 0 ; j < block_size ; j++) { - case 0: - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * max_desc_len + block_index * block_size + j] - s_train[j * block_size + lidx]); - } - break; - case 1: - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * max_desc_len + block_index * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - break; - case 2: - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - } - break; + result += DIST(s_query[lidy * max_desc_len + block_index * block_size + j], s_train[j * block_size + lidx]); } return result; } @@ -152,8 +124,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5( int query_cols, int train_rows, int train_cols, - int step, - int distType + int step ) { @@ -191,7 +162,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_multi_block(s_query, s_train, i, lidx, lidy, distType); + result += reduce_multi_block(s_query, s_train, i, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } @@ -247,8 +218,7 @@ __kernel void BruteForceMatch_Match_D5( int query_cols, int train_rows, int train_cols, - int step, - int distType + int step ) { const int lidx = get_local_id(0); @@ -283,7 +253,7 @@ __kernel void BruteForceMatch_Match_D5( barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } @@ -344,8 +314,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5( int train_cols, int bestTrainIdx_cols, int step, - int ostep, - int distType + int ostep ) { const int lidx = get_local_id(0); @@ -371,7 +340,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } @@ -405,8 +374,7 @@ __kernel void BruteForceMatch_RadiusMatch_D5( int train_cols, int bestTrainIdx_cols, int step, - int ostep, - int distType + int ostep ) { const int lidx = get_local_id(0); @@ -432,7 +400,7 @@ __kernel void BruteForceMatch_RadiusMatch_D5( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } @@ -462,8 +430,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( int query_cols, int train_rows, int train_cols, - int step, - int distType + int step ) { const int lidx = get_local_id(0); @@ -501,7 +468,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_multi_block(s_query, s_train, i, lidx, lidy, distType); + result += reduce_multi_block(s_query, s_train, i, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } @@ -609,8 +576,7 @@ __kernel void BruteForceMatch_knnMatch_D5( int query_cols, int train_rows, int train_cols, - int step, - int distType + int step ) { const int lidx = get_local_id(0); @@ -645,7 +611,7 @@ __kernel void BruteForceMatch_knnMatch_D5( barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } @@ -752,8 +718,7 @@ kernel void BruteForceMatch_calcDistanceUnrolled_D5( int query_cols, int train_rows, int train_cols, - int step, - int distType) + int step) { /* Todo */ } @@ -768,8 +733,7 @@ kernel void BruteForceMatch_calcDistance_D5( int query_cols, int train_rows, int train_cols, - int step, - int distType) + int step) { /* Todo */ } -- 2.7.4