m.create(rows, cols, type);
}
-
-template < int BLOCK_SIZE, int MAX_DESC_LEN >
-static bool ocl_matchUnrolledCached(InputArray _query, InputArray _train,
- const UMat &trainIdx, const UMat &distance, int distType)
+static bool ocl_matchSingle(InputArray query, InputArray train,
+ UMat &trainIdx, UMat &distance, int distType)
{
- int depth = _query.depth();
- cv::String opts;
- opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
- ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN );
- ocl::Kernel k("BruteForceMatch_UnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts);
- if(k.empty())
+ if (query.empty() || train.empty())
return false;
- size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
- size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
- const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
- if(globalSize[0] != 0)
- {
- UMat query = _query.getUMat(), train = _train.getUMat();
-
- int idx = 0;
- idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
- idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
- idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
- idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
- idx = k.set(idx, (void *)NULL, smemSize);
- idx = k.set(idx, query.rows);
- idx = k.set(idx, query.cols);
- idx = k.set(idx, train.rows);
- idx = k.set(idx, train.cols);
- idx = k.set(idx, (int)query.step);
-
- return k.run(2, globalSize, localSize, false);
- }
- return true;
-}
+ const int query_rows = query.rows();
+ const int query_cols = query.cols();
-template < int BLOCK_SIZE >
-static bool ocl_match(InputArray _query, InputArray _train,
- const UMat &trainIdx, const UMat &distance, int distType)
-{
- int depth = _query.depth();
- cv::String opts;
- opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
- ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE);
- ocl::Kernel k("BruteForceMatch_Match", ocl::features2d::brute_force_match_oclsrc, opts);
- if(k.empty())
- return false;
+ ensureSizeIsEnough(1, query_rows, CV_32S, trainIdx);
+ ensureSizeIsEnough(1, query_rows, CV_32F, distance);
- size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
- size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
- const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+ ocl::Device devDef = ocl::Device::getDefault();
- if(globalSize[0] != 0)
- {
- UMat query = _query.getUMat(), train = _train.getUMat();
-
- int idx = 0;
- idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
- idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
- idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
- idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
- idx = k.set(idx, (void *)NULL, smemSize);
- idx = k.set(idx, query.rows);
- idx = k.set(idx, query.cols);
- idx = k.set(idx, train.rows);
- idx = k.set(idx, train.cols);
- idx = k.set(idx, (int)query.step);
-
- return k.run(2, globalSize, localSize, false);
- }
- return true;
-}
+ UMat uquery = query.getUMat(), utrain = train.getUMat();
+ int kercn = 1;
+ if (devDef.isIntel() &&
+ (0 == (uquery.step % 4)) && (0 == (uquery.cols % 4)) && (0 == (uquery.offset % 4)) &&
+ (0 == (utrain.step % 4)) && (0 == (utrain.cols % 4)) && (0 == (utrain.offset % 4)))
+ kercn = 4;
-static bool ocl_matchDispatcher(InputArray query, InputArray train,
- const UMat &trainIdx, const UMat &distance, int distType)
-{
- int query_cols = query.size().width;
- bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU;
+ int block_size = 16;
+ int max_desc_len = 0;
+ bool is_cpu = devDef.type() == ocl::Device::TYPE_CPU;
if (query_cols <= 64)
- {
- if(!ocl_matchUnrolledCached<16, 64>(query, train, trainIdx, distance, distType)) return false;
- }
+ max_desc_len = 64 / kercn;
else if (query_cols <= 128 && !is_cpu)
- {
- if(!ocl_matchUnrolledCached<16, 128>(query, train, trainIdx, distance, distType)) return false;
- }
- else
- {
- if(!ocl_match<16>(query, train, trainIdx, distance, distType)) return false;
- }
- return true;
-}
+ max_desc_len = 128 / kercn;
-static bool ocl_matchSingle(InputArray query, InputArray train,
- UMat &trainIdx, UMat &distance, int dstType)
-{
- if (query.empty() || train.empty())
+ int depth = query.depth();
+ cv::String opts;
+ opts = cv::format("-D T=%s -D TN=%s -D kercn=%d %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
+ ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), kercn, depth == CV_32F ? "-D T_FLOAT" : "", distType, block_size, max_desc_len);
+ ocl::Kernel k("BruteForceMatch_Match", ocl::features2d::brute_force_match_oclsrc, opts);
+ if(k.empty())
return false;
- int query_rows = query.size().height;
-
- ensureSizeIsEnough(1, query_rows, CV_32S, trainIdx);
- ensureSizeIsEnough(1, query_rows, CV_32F, distance);
-
- return ocl_matchDispatcher(query, train, trainIdx, distance, dstType);
+ size_t globalSize[] = {(query.size().height + block_size - 1) / block_size * block_size, block_size};
+ size_t localSize[] = {block_size, block_size};
+
+ int idx = 0;
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(uquery));
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(utrain));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+ idx = k.set(idx, uquery.rows);
+ idx = k.set(idx, uquery.cols);
+ idx = k.set(idx, utrain.rows);
+ idx = k.set(idx, utrain.cols);
+ idx = k.set(idx, (int)(uquery.step / sizeof(float)));
+
+ return k.run(2, globalSize, localSize, false);
}
static bool ocl_matchConvert(const Mat &trainIdx, const Mat &distance, std::vector< std::vector<DMatch> > &matches)
return ocl_matchConvert(trainIdxCPU, distanceCPU, matches);
}
-template < int BLOCK_SIZE, int MAX_DESC_LEN >
-static bool ocl_knn_matchUnrolledCached(InputArray _query, InputArray _train,
- const UMat &trainIdx, const UMat &distance, int distType)
+static bool ocl_knnMatchSingle(InputArray query, InputArray train, UMat &trainIdx,
+ UMat &distance, int distType)
{
- int depth = _query.depth();
- cv::String opts;
- opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
- ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN );
- ocl::Kernel k("BruteForceMatch_knnUnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts);
- if(k.empty())
+ if (query.empty() || train.empty())
return false;
- size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
- size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
- const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
- if(globalSize[0] != 0)
- {
- UMat query = _query.getUMat(), train = _train.getUMat();
-
- int idx = 0;
- idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
- idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
- idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
- idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
- idx = k.set(idx, (void *)NULL, smemSize);
- idx = k.set(idx, query.rows);
- idx = k.set(idx, query.cols);
- idx = k.set(idx, train.rows);
- idx = k.set(idx, train.cols);
- idx = k.set(idx, (int)query.step);
-
- return k.run(2, globalSize, localSize, false);
- }
- return true;
-}
+ const int query_rows = query.rows();
+ const int query_cols = query.cols();
-template < int BLOCK_SIZE >
-static bool ocl_knn_match(InputArray _query, InputArray _train,
- const UMat &trainIdx, const UMat &distance, int distType)
-{
- int depth = _query.depth();
- cv::String opts;
- opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
- ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE);
- ocl::Kernel k("BruteForceMatch_knnMatch", ocl::features2d::brute_force_match_oclsrc, opts);
- if(k.empty())
- return false;
+ ensureSizeIsEnough(1, query_rows, CV_32SC2, trainIdx);
+ ensureSizeIsEnough(1, query_rows, CV_32FC2, distance);
- size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
- size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
- const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+ trainIdx.setTo(Scalar::all(-1));
- if(globalSize[0] != 0)
- {
- UMat query = _query.getUMat(), train = _train.getUMat();
-
- int idx = 0;
- idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
- idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
- idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
- idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
- idx = k.set(idx, (void*)NULL, smemSize);
- idx = k.set(idx, query.rows);
- idx = k.set(idx, query.cols);
- idx = k.set(idx, train.rows);
- idx = k.set(idx, train.cols);
- idx = k.set(idx, (int)query.step);
-
- return k.run(2, globalSize, localSize, false);
- }
- return true;
-}
+ ocl::Device devDef = ocl::Device::getDefault();
-static bool ocl_match2Dispatcher(InputArray query, InputArray train, const UMat &trainIdx, const UMat &distance, int distType)
-{
- bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU;
- if (query.size().width <= 64)
- {
- if(!ocl_knn_matchUnrolledCached<16, 64>(query, train, trainIdx, distance, distType))
- return false;
- }
- else if (query.size().width <= 128 && !is_cpu)
- {
- if(!ocl_knn_matchUnrolledCached<16, 128>(query, train, trainIdx, distance, distType))
- return false;
- }
- else
- {
- if(!ocl_knn_match<16>(query, train, trainIdx, distance, distType))
- return false;
- }
- return true;
-}
+ UMat uquery = query.getUMat(), utrain = train.getUMat();
+ int kercn = 1;
+ if (devDef.isIntel() &&
+ (0 == (uquery.step % 4)) && (0 == (uquery.cols % 4)) && (0 == (uquery.offset % 4)) &&
+ (0 == (utrain.step % 4)) && (0 == (utrain.cols % 4)) && (0 == (utrain.offset % 4)))
+ kercn = 4;
-static bool ocl_kmatchDispatcher(InputArray query, InputArray train, const UMat &trainIdx,
- const UMat &distance, int distType)
-{
- return ocl_match2Dispatcher(query, train, trainIdx, distance, distType);
-}
+ int block_size = 16;
+ int max_desc_len = 0;
+ bool is_cpu = devDef.type() == ocl::Device::TYPE_CPU;
+ if (query_cols <= 64)
+ max_desc_len = 64 / kercn;
+ else if (query_cols <= 128 && !is_cpu)
+ max_desc_len = 128 / kercn;
-static bool ocl_knnMatchSingle(InputArray query, InputArray train, UMat &trainIdx,
- UMat &distance, int dstType)
-{
- if (query.empty() || train.empty())
+ int depth = query.depth();
+ cv::String opts;
+ opts = cv::format("-D T=%s -D TN=%s -D kercn=%d %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
+ ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), kercn, depth == CV_32F ? "-D T_FLOAT" : "", distType, block_size, max_desc_len);
+ ocl::Kernel k("BruteForceMatch_knnMatch", ocl::features2d::brute_force_match_oclsrc, opts);
+ if(k.empty())
return false;
- const int nQuery = query.size().height;
-
- ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
- ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
-
- trainIdx.setTo(Scalar::all(-1));
-
- return ocl_kmatchDispatcher(query, train, trainIdx, distance, dstType);
+ size_t globalSize[] = {(query_rows + block_size - 1) / block_size * block_size, block_size};
+ size_t localSize[] = {block_size, block_size};
+
+ int idx = 0;
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(uquery));
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(utrain));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+ idx = k.set(idx, uquery.rows);
+ idx = k.set(idx, uquery.cols);
+ idx = k.set(idx, utrain.rows);
+ idx = k.set(idx, utrain.cols);
+ idx = k.set(idx, (int)(uquery.step / sizeof(float)));
+
+ return k.run(2, globalSize, localSize, false);
}
static bool ocl_knnMatchConvert(const Mat &trainIdx, const Mat &distance, std::vector< std::vector<DMatch> > &matches, bool compactResult)
Mat trainIdxCPU = trainIdx.getMat(ACCESS_READ);
Mat distanceCPU = distance.getMat(ACCESS_READ);
- if (ocl_knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult) )
- return true;
- return false;
-}
-
-template < int BLOCK_SIZE, int MAX_DESC_LEN >
-static bool ocl_matchUnrolledCached(InputArray _query, InputArray _train, float maxDistance,
- const UMat &trainIdx, const UMat &distance, const UMat &nMatches, int distType)
-{
- int depth = _query.depth();
- cv::String opts;
- opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
- ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN);
- ocl::Kernel k("BruteForceMatch_RadiusUnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts);
- if(k.empty())
- return false;
-
- size_t globalSize[] = {(_train.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
- size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
- const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
- if(globalSize[0] != 0)
- {
- UMat query = _query.getUMat(), train = _train.getUMat();
-
- int idx = 0;
- idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
- idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
- idx = k.set(idx, maxDistance);
- idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
- idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
- idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(nMatches));
- idx = k.set(idx, (void*)NULL, smemSize);
- idx = k.set(idx, query.rows);
- idx = k.set(idx, query.cols);
- idx = k.set(idx, train.rows);
- idx = k.set(idx, train.cols);
- idx = k.set(idx, trainIdx.cols);
- idx = k.set(idx, (int)query.step);
- idx = k.set(idx, (int)trainIdx.step);
-
- return k.run(2, globalSize, localSize, false);
- }
- return true;
-}
-
-//radius_match
-template < int BLOCK_SIZE >
-static bool ocl_radius_match(InputArray _query, InputArray _train, float maxDistance,
- const UMat &trainIdx, const UMat &distance, const UMat &nMatches, int distType)
-{
- int depth = _query.depth();
- cv::String opts;
- opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d", ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE);
- ocl::Kernel k("BruteForceMatch_RadiusMatch", ocl::features2d::brute_force_match_oclsrc, opts);
- if(k.empty())
- return false;
-
- size_t globalSize[] = {(_train.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
- size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
- const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
- if(globalSize[0] != 0)
- {
- UMat query = _query.getUMat(), train = _train.getUMat();
-
- int idx = 0;
- idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
- idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
- idx = k.set(idx, maxDistance);
- idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
- idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
- idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(nMatches));
- idx = k.set(idx, (void*)NULL, smemSize);
- idx = k.set(idx, query.rows);
- idx = k.set(idx, query.cols);
- idx = k.set(idx, train.rows);
- idx = k.set(idx, train.cols);
- idx = k.set(idx, trainIdx.cols);
- idx = k.set(idx, (int)query.step);
- idx = k.set(idx, (int)trainIdx.step);
-
- return k.run(2, globalSize, localSize, false);
- }
- return true;
-}
-
-static bool ocl_rmatchDispatcher(InputArray query, InputArray train,
- UMat &trainIdx, UMat &distance, UMat &nMatches, float maxDistance, int distType)
-{
- bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU;
- int query_cols = query.size().width;
- if (query_cols <= 64)
- {
- if(!ocl_matchUnrolledCached<16, 64>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false;
- }
- else if (query_cols <= 128 && !is_cpu)
- {
- if(!ocl_matchUnrolledCached<16, 128>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false;
- }
- else
- {
- if(!ocl_radius_match<16>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false;
- }
- return true;
+ return ocl_knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult);
}
-
static bool ocl_radiusMatchSingle(InputArray query, InputArray train,
UMat &trainIdx, UMat &distance, UMat &nMatches, float maxDistance, int distType)
{
if (query.empty() || train.empty())
return false;
- const int nQuery = query.size().height;
- const int nTrain = train.size().height;
+ const int query_rows = query.rows();
+ const int train_rows = train.rows();
- ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
+ ensureSizeIsEnough(1, query_rows, CV_32SC1, nMatches);
if (trainIdx.empty())
{
- ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx);
- ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance);
+ ensureSizeIsEnough(query_rows, std::max((train_rows / 100), 10), CV_32SC1, trainIdx);
+ ensureSizeIsEnough(query_rows, std::max((train_rows / 100), 10), CV_32FC1, distance);
}
nMatches.setTo(Scalar::all(0));
- return ocl_rmatchDispatcher(query, train, trainIdx, distance, nMatches, maxDistance, distType);
+ ocl::Device devDef = ocl::Device::getDefault();
+ UMat uquery = query.getUMat(), utrain = train.getUMat();
+ int kercn = 1;
+ if (devDef.isIntel() &&
+ (0 == (uquery.step % 4)) && (0 == (uquery.cols % 4)) && (0 == (uquery.offset % 4)) &&
+ (0 == (utrain.step % 4)) && (0 == (utrain.cols % 4)) && (0 == (utrain.offset % 4)))
+ kercn = 4;
+
+ int block_size = 16;
+ int depth = query.depth();
+ cv::String opts;
+ opts = cv::format("-D T=%s -D TN=%s -D kercn=%d %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
+ ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), kercn, depth == CV_32F ? "-D T_FLOAT" : "", distType, block_size);
+ ocl::Kernel k("BruteForceMatch_RadiusMatch", ocl::features2d::brute_force_match_oclsrc, opts);
+ if (k.empty())
+ return false;
+
+ size_t globalSize[] = {(train_rows + block_size - 1) / block_size * block_size, (query_rows + block_size - 1) / block_size * block_size, 1};
+ size_t localSize[] = {block_size, block_size, 1};
+
+ int idx = 0;
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(uquery));
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(utrain));
+ idx = k.set(idx, maxDistance);
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(nMatches));
+ idx = k.set(idx, uquery.rows);
+ idx = k.set(idx, uquery.cols);
+ idx = k.set(idx, utrain.rows);
+ idx = k.set(idx, utrain.cols);
+ idx = k.set(idx, trainIdx.cols);
+ idx = k.set(idx, (int)(uquery.step / sizeof(float)));
+ idx = k.set(idx, (int)(trainIdx.step / sizeof(int)));
+
+ return k.run(2, globalSize, localSize, false);
}
static bool ocl_radiusMatchConvert(const Mat &trainIdx, const Mat &distance, const Mat &_nMatches,
#define MAX_DESC_LEN 64
#endif
+#define BLOCK_SIZE_ODD (BLOCK_SIZE + 1)
+#ifndef SHARED_MEM_SZ
+# if (BLOCK_SIZE < MAX_DESC_LEN)
+# define SHARED_MEM_SZ (kercn * (BLOCK_SIZE * MAX_DESC_LEN + BLOCK_SIZE * BLOCK_SIZE))
+# else
+# define SHARED_MEM_SZ (kercn * 2 * BLOCK_SIZE_ODD * BLOCK_SIZE)
+# endif
+#endif
+
#ifndef DIST_TYPE
#define DIST_TYPE 2
#endif
// dirty fix for non-template support
-#if (DIST_TYPE == 2) // L1Dist
+#if (DIST_TYPE == 2) // L1Dist
# ifdef T_FLOAT
-# define DIST(x, y) fabs((x) - (y))
- typedef float value_type;
typedef float result_type;
+# if (8 == kercn)
+ typedef float8 value_type;
+# define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}
+# elif (4 == kercn)
+ typedef float4 value_type;
+# define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}
+# else
+ typedef float value_type;
+# define DIST(x, y) result += fabs((x) - (y))
+# endif
# else
-# define DIST(x, y) abs((x) - (y))
- typedef int value_type;
typedef int result_type;
+# if (8 == kercn)
+ typedef int8 value_type;
+# define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}
+# elif (4 == kercn)
+ typedef int4 value_type;
+# define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}
+# else
+ typedef int value_type;
+# define DIST(x, y) result += abs((x) - (y))
+# endif
# endif
-#define DIST_RES(x) (x)
+# define DIST_RES(x) (x)
#elif (DIST_TYPE == 4) // L2Dist
-#define DIST(x, y) (((x) - (y)) * ((x) - (y)))
-typedef float value_type;
-typedef float result_type;
-#define DIST_RES(x) sqrt(x)
+ typedef float result_type;
+# if (8 == kercn)
+ typedef float8 value_type;
+# define DIST(x, y) {value_type d = ((x) - (y)); result += dot(d.s0123, d.s0123) + dot(d.s4567, d.s4567);}
+# elif (4 == kercn)
+ typedef float4 value_type;
+# define DIST(x, y) {value_type d = ((x) - (y)); result += dot(d, d);}
+# else
+ typedef float value_type;
+# define DIST(x, y) {value_type d = ((x) - (y)); result = mad(d, d, result);}
+# endif
+# define DIST_RES(x) sqrt(x)
#elif (DIST_TYPE == 6) // Hamming
-//http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-inline int bit1Count(int v)
-{
- v = v - ((v >> 1) & 0x55555555); // reuse input as temporary
- v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // temp
- return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; // count
-}
-#define DIST(x, y) bit1Count( (x) ^ (y) )
-typedef int value_type;
-typedef int result_type;
-#define DIST_RES(x) (x)
+# if (8 == kercn)
+ typedef int8 value_type;
+# elif (4 == kercn)
+ typedef int4 value_type;
+# else
+ typedef int value_type;
+# endif
+ typedef int result_type;
+# define DIST(x, y) result += popcount( (x) ^ (y) )
+# define DIST_RES(x) (x)
#endif
inline result_type reduce_block(
#pragma unroll
for (int j = 0 ; j < BLOCK_SIZE ; j++)
{
- result += DIST(
- s_query[lidy * BLOCK_SIZE + j],
- s_train[j * BLOCK_SIZE + lidx]);
+ DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);
}
return DIST_RES(result);
}
#pragma unroll
for (int j = 0 ; j < BLOCK_SIZE ; j++)
{
- result += DIST(
- s_query[lidy * BLOCK_SIZE + j],
- s_train[j * BLOCK_SIZE + lidx]);
+ DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);
}
- return (result);
+ return result;
}
inline result_type reduce_multi_block(
#pragma unroll
for (int j = 0 ; j < BLOCK_SIZE ; j++)
{
- result += DIST(
- s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j],
- s_train[j * BLOCK_SIZE + lidx]);
+ DIST(s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);
}
return result;
}
-/* 2dim launch, global size: dim0 is (query rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, dim1 is BLOCK_SIZE
-local size: dim0 is BLOCK_SIZE, dim1 is BLOCK_SIZE.
-*/
-__kernel void BruteForceMatch_UnrollMatch(
+__kernel void BruteForceMatch_Match(
__global T *query,
__global T *train,
- //__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
- __local float *sharebuffer,
int query_rows,
int query_cols,
int train_rows,
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
+ const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);
+ const int queryOffset = min(queryIdx, query_rows - 1) * step;
+ __global TN *query_vec = (__global TN *)(query + queryOffset);
+ query_cols /= kercn;
+
+ __local float sharebuffer[SHARED_MEM_SZ];
__local value_type *s_query = (__local value_type *)sharebuffer;
- __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
- int queryIdx = groupidx * BLOCK_SIZE + lidy;
+#if 0 < MAX_DESC_LEN
+ __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
// load the query into local memory.
#pragma unroll
- for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
+ for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)
{
- int loadx = lidx + i * BLOCK_SIZE;
- s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+ const int loadx = mad24(BLOCK_SIZE, i, lidx);
+ s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;
}
+#else
+ __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
+#endif
float myBestDistance = MAX_FLOAT;
int myBestTrainIdx = -1;
for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)
{
result_type result = 0;
+
+ const int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;
+ __global TN *train_vec = (__global TN *)(train + trainOffset);
+#if 0 < MAX_DESC_LEN
#pragma unroll
- for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
+ for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)
{
//load a BLOCK_SIZE * BLOCK_SIZE block into local train.
- const int loadx = lidx + i * BLOCK_SIZE;
- s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+ const int loadx = mad24(BLOCK_SIZE, i, lidx);
+ s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;
//synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
}
-
- result = DIST_RES(result);
-
- int trainIdx = t * BLOCK_SIZE + lidx;
-
- if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
- {
- myBestDistance = result;
- myBestTrainIdx = trainIdx;
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
- __local float *s_distance = (__local float*)(sharebuffer);
- __local int* s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
-
- //find BestMatch
- s_distance += lidy * BLOCK_SIZE;
- s_trainIdx += lidy * BLOCK_SIZE;
- s_distance[lidx] = myBestDistance;
- s_trainIdx[lidx] = myBestTrainIdx;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- //reduce -- now all reduce implement in each threads.
- #pragma unroll
- for (int k = 0 ; k < BLOCK_SIZE; k++)
- {
- if (myBestDistance > s_distance[k])
+#else
+ for (int i = 0, endq = (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endq; i++)
{
- myBestDistance = s_distance[k];
- myBestTrainIdx = s_trainIdx[k];
- }
- }
-
- if (queryIdx < query_rows && lidx == 0)
- {
- bestTrainIdx[queryIdx] = myBestTrainIdx;
- bestDistance[queryIdx] = myBestDistance;
- }
-}
-
-__kernel void BruteForceMatch_Match(
- __global T *query,
- __global T *train,
- //__global float *mask,
- __global int *bestTrainIdx,
- __global float *bestDistance,
- __local float *sharebuffer,
- int query_rows,
- int query_cols,
- int train_rows,
- int train_cols,
- int step
-)
-{
- const int lidx = get_local_id(0);
- const int lidy = get_local_id(1);
- const int groupidx = get_group_id(0);
-
- const int queryIdx = groupidx * BLOCK_SIZE + lidy;
-
- float myBestDistance = MAX_FLOAT;
- int myBestTrainIdx = -1;
-
- __local value_type *s_query = (__local value_type *)sharebuffer;
- __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
-
- // loop
- for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
- {
- result_type result = 0;
- for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; i++)
- {
- const int loadx = lidx + i * BLOCK_SIZE;
+ const int loadx = mad24(i, BLOCK_SIZE, lidx);
//load query and train into local memory
- s_query[lidy * BLOCK_SIZE + lidx] = 0;
- s_train[lidx * BLOCK_SIZE + lidy] = 0;
+ s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = 0;
+ s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = 0;
if (loadx < query_cols)
{
- s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
- s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+ s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = query_vec[loadx];
+ s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = train_vec[loadx];
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
}
-
+#endif
result = DIST_RES(result);
- const int trainIdx = t * BLOCK_SIZE + lidx;
+ const int trainIdx = mad24(BLOCK_SIZE, t, lidx);
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
{
barrier(CLK_LOCAL_MEM_FENCE);
__local float *s_distance = (__local float *)sharebuffer;
- __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
+ __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);
//findBestMatch
- s_distance += lidy * BLOCK_SIZE;
- s_trainIdx += lidy * BLOCK_SIZE;
+ s_distance += lidy * BLOCK_SIZE_ODD;
+ s_trainIdx += lidy * BLOCK_SIZE_ODD;
s_distance[lidx] = myBestDistance;
s_trainIdx[lidx] = myBestTrainIdx;
barrier(CLK_LOCAL_MEM_FENCE);
//reduce -- now all reduce implement in each threads.
+ #pragma unroll
for (int k = 0 ; k < BLOCK_SIZE; k++)
{
if (myBestDistance > s_distance[k])
}
}
-//radius_unrollmatch
-__kernel void BruteForceMatch_RadiusUnrollMatch(
- __global T *query,
- __global T *train,
- float maxDistance,
- //__global float *mask,
- __global int *bestTrainIdx,
- __global float *bestDistance,
- __global int *nMatches,
- __local float *sharebuffer,
- int query_rows,
- int query_cols,
- int train_rows,
- int train_cols,
- int bestTrainIdx_cols,
- int step,
- int ostep
-)
-{
- const int lidx = get_local_id(0);
- const int lidy = get_local_id(1);
- const int groupidx = get_group_id(0);
- const int groupidy = get_group_id(1);
-
- const int queryIdx = groupidy * BLOCK_SIZE + lidy;
- const int trainIdx = groupidx * BLOCK_SIZE + lidx;
-
- __local value_type *s_query = (__local value_type *)sharebuffer;
- __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
-
- result_type result = 0;
- for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; ++i)
- {
- //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
- const int loadx = lidx + i * BLOCK_SIZE;
-
- s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
- s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
-
- //synchronize to make sure each elem for reduceIteration in share memory is written already.
- barrier(CLK_LOCAL_MEM_FENCE);
-
- result += reduce_block(s_query, s_train, lidx, lidy);
-
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- if (queryIdx < query_rows && trainIdx < train_rows &&
- convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
- {
- int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
-
- if(ind < bestTrainIdx_cols)
- {
- bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
- bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
- }
- }
-}
-
//radius_match
__kernel void BruteForceMatch_RadiusMatch(
__global T *query,
__global T *train,
float maxDistance,
- //__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
__global int *nMatches,
- __local float *sharebuffer,
int query_rows,
int query_cols,
int train_rows,
const int groupidx = get_group_id(0);
const int groupidy = get_group_id(1);
- const int queryIdx = groupidy * BLOCK_SIZE + lidy;
- const int trainIdx = groupidx * BLOCK_SIZE + lidx;
+ const int queryIdx = mad24(BLOCK_SIZE, groupidy, lidy);
+ const int queryOffset = min(queryIdx, query_rows - 1) * step;
+ __global TN *query_vec = (__global TN *)(query + queryOffset);
+
+ const int trainIdx = mad24(BLOCK_SIZE, groupidx, lidx);
+ const int trainOffset = min(mad24(BLOCK_SIZE, groupidx, lidy), train_rows - 1) * step;
+ __global TN *train_vec = (__global TN *)(train + trainOffset);
+ query_cols /= kercn;
+
+ __local float sharebuffer[SHARED_MEM_SZ];
__local value_type *s_query = (__local value_type *)sharebuffer;
- __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
+ __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
result_type result = 0;
for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)
{
//load a BLOCK_SIZE * BLOCK_SIZE block into local train.
- const int loadx = lidx + i * BLOCK_SIZE;
+ const int loadx = mad24(BLOCK_SIZE, i, lidx);
- s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
- s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+ s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = 0;
+ s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = 0;
+
+ if (loadx < query_cols)
+ {
+ s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = query_vec[loadx];
+ s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = train_vec[loadx];
+ }
//synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
}
-
- if (queryIdx < query_rows && trainIdx < train_rows &&
- convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
+ if (queryIdx < query_rows && trainIdx < train_rows && convert_float(result) < maxDistance)
{
int ind = atom_inc(nMatches + queryIdx);
if(ind < bestTrainIdx_cols)
{
- bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
- bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
+ bestTrainIdx[mad24(queryIdx, ostep, ind)] = trainIdx;
+ bestDistance[mad24(queryIdx, ostep, ind)] = result;
}
}
}
-
-__kernel void BruteForceMatch_knnUnrollMatch(
+__kernel void BruteForceMatch_knnMatch(
__global T *query,
__global T *train,
- //__global float *mask,
__global int2 *bestTrainIdx,
__global float2 *bestDistance,
- __local float *sharebuffer,
int query_rows,
int query_cols,
int train_rows,
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
- const int queryIdx = groupidx * BLOCK_SIZE + lidy;
+ const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);
+ const int queryOffset = min(queryIdx, query_rows - 1) * step;
+ __global TN *query_vec = (__global TN *)(query + queryOffset);
+ query_cols /= kercn;
+
+ __local float sharebuffer[SHARED_MEM_SZ];
__local value_type *s_query = (__local value_type *)sharebuffer;
- __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
+#if 0 < MAX_DESC_LEN
+ __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
// load the query into local memory.
+ #pragma unroll
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
{
- int loadx = lidx + i * BLOCK_SIZE;
- s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+ int loadx = mad24(BLOCK_SIZE, i, lidx);
+ s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;
}
+#else
+ __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
+#endif
float myBestDistance1 = MAX_FLOAT;
float myBestDistance2 = MAX_FLOAT;
int myBestTrainIdx1 = -1;
int myBestTrainIdx2 = -1;
- //loopUnrolledCached
- for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
+ for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt ; t++)
{
result_type result = 0;
+
+ int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;
+ __global TN *train_vec = (__global TN *)(train + trainOffset);
+#if 0 < MAX_DESC_LEN
+ #pragma unroll
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
{
//load a BLOCK_SIZE * BLOCK_SIZE block into local train.
- const int loadx = lidx + i * BLOCK_SIZE;
- s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+ const int loadx = mad24(BLOCK_SIZE, i, lidx);
+ s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;
//synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
}
-
- result = DIST_RES(result);
-
- const int trainIdx = t * BLOCK_SIZE + lidx;
-
- if (queryIdx < query_rows && trainIdx < train_rows)
- {
- if (result < myBestDistance1)
- {
- myBestDistance2 = myBestDistance1;
- myBestTrainIdx2 = myBestTrainIdx1;
- myBestDistance1 = result;
- myBestTrainIdx1 = trainIdx;
- }
- else if (result < myBestDistance2)
- {
- myBestDistance2 = result;
- myBestTrainIdx2 = trainIdx;
- }
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- __local float *s_distance = (local float *)sharebuffer;
- __local int *s_trainIdx = (local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
-
- // find BestMatch
- s_distance += lidy * BLOCK_SIZE;
- s_trainIdx += lidy * BLOCK_SIZE;
-
- s_distance[lidx] = myBestDistance1;
- s_trainIdx[lidx] = myBestTrainIdx1;
-
- float bestDistance1 = MAX_FLOAT;
- float bestDistance2 = MAX_FLOAT;
- int bestTrainIdx1 = -1;
- int bestTrainIdx2 = -1;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (lidx == 0)
- {
- for (int i = 0 ; i < BLOCK_SIZE ; i++)
- {
- float val = s_distance[i];
- if (val < bestDistance1)
- {
- bestDistance2 = bestDistance1;
- bestTrainIdx2 = bestTrainIdx1;
-
- bestDistance1 = val;
- bestTrainIdx1 = s_trainIdx[i];
- }
- else if (val < bestDistance2)
- {
- bestDistance2 = val;
- bestTrainIdx2 = s_trainIdx[i];
- }
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- s_distance[lidx] = myBestDistance2;
- s_trainIdx[lidx] = myBestTrainIdx2;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (lidx == 0)
- {
- for (int i = 0 ; i < BLOCK_SIZE ; i++)
- {
- float val = s_distance[i];
-
- if (val < bestDistance2)
- {
- bestDistance2 = val;
- bestTrainIdx2 = s_trainIdx[i];
- }
- }
- }
-
- myBestDistance1 = bestDistance1;
- myBestDistance2 = bestDistance2;
-
- myBestTrainIdx1 = bestTrainIdx1;
- myBestTrainIdx2 = bestTrainIdx2;
-
- if (queryIdx < query_rows && lidx == 0)
- {
- bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
- bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
- }
-}
-
-__kernel void BruteForceMatch_knnMatch(
- __global T *query,
- __global T *train,
- //__global float *mask,
- __global int2 *bestTrainIdx,
- __global float2 *bestDistance,
- __local float *sharebuffer,
- int query_rows,
- int query_cols,
- int train_rows,
- int train_cols,
- int step
-)
-{
- const int lidx = get_local_id(0);
- const int lidy = get_local_id(1);
- const int groupidx = get_group_id(0);
-
- const int queryIdx = groupidx * BLOCK_SIZE + lidy;
- __local value_type *s_query = (__local value_type *)sharebuffer;
- __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
-
- float myBestDistance1 = MAX_FLOAT;
- float myBestDistance2 = MAX_FLOAT;
- int myBestTrainIdx1 = -1;
- int myBestTrainIdx2 = -1;
-
- //loop
- for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
- {
- result_type result = 0.0f;
- for (int i = 0 ; i < (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE ; i++)
+#else
+ for (int i = 0, endq = (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE; i < endq ; i++)
{
- const int loadx = lidx + i * BLOCK_SIZE;
+ const int loadx = mad24(BLOCK_SIZE, i, lidx);
//load query and train into local memory
- s_query[lidy * BLOCK_SIZE + lidx] = 0;
- s_train[lidx * BLOCK_SIZE + lidy] = 0;
+ s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = 0;
+ s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = 0;
if (loadx < query_cols)
{
- s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
- s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+ s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = query_vec[loadx];
+ s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = train_vec[loadx];
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
}
-
+#endif
result = DIST_RES(result);
- const int trainIdx = t * BLOCK_SIZE + lidx;
+ const int trainIdx = mad24(BLOCK_SIZE, t, lidx);
- if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
+ if (queryIdx < query_rows && trainIdx < train_rows)
{
if (result < myBestDistance1)
{
barrier(CLK_LOCAL_MEM_FENCE);
__local float *s_distance = (__local float *)sharebuffer;
- __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
-
- //findBestMatch
- s_distance += lidy * BLOCK_SIZE;
- s_trainIdx += lidy * BLOCK_SIZE;
+ __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);
+ // find BestMatch
+ s_distance += lidy * BLOCK_SIZE_ODD;
+ s_trainIdx += lidy * BLOCK_SIZE_ODD;
s_distance[lidx] = myBestDistance1;
s_trainIdx[lidx] = myBestTrainIdx1;
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
}
-}
-
-kernel void BruteForceMatch_calcDistanceUnrolled(
- __global T *query,
- __global T *train,
- //__global float *mask,
- __global float *allDist,
- __local float *sharebuffer,
- int query_rows,
- int query_cols,
- int train_rows,
- int train_cols,
- int step)
-{
- /* Todo */
-}
-
-kernel void BruteForceMatch_calcDistance(
- __global T *query,
- __global T *train,
- //__global float *mask,
- __global float *allDist,
- __local float *sharebuffer,
- int query_rows,
- int query_cols,
- int train_rows,
- int train_cols,
- int step)
-{
- /* Todo */
-}
-
-kernel void BruteForceMatch_findBestMatch(
- __global float *allDist,
- __global int *bestTrainIdx,
- __global float *bestDistance,
- int k
-)
-{
- /* Todo */
-}
+}
\ No newline at end of file