#include "precomp.hpp"
#include <limits>
+#include "opencl_kernels.hpp"
#if defined(HAVE_EIGEN) && EIGEN_WORLD_VERSION == 2
#include <Eigen/Array>
return mask;
}
+//////////////////////////////////////////////////////////////////ocl functions for BFMatcher ///////////////////////////////////////////////////////////////
+
+static void ensureSizeIsEnough(int rows, int cols, int type, UMat &m)
+{
+ if (m.type() == type && m.rows >= rows && m.cols >= cols)
+ m = m(Rect(0, 0, cols, rows));
+ else
+ m.create(rows, cols, type);
+}
+
+
+template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
+static bool ocl_matchUnrolledCached(InputArray _query, InputArray _train,
+ const UMat &trainIdx, const UMat &distance, int distType)
+{
+ int depth = _query.depth();
+ cv::String opts;
+ opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
+ ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN );
+ ocl::Kernel k("BruteForceMatch_UnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts);
+ if(k.empty())
+ return false;
+
+ size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
+ size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
+ const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+ if(globalSize[0] != 0)
+ {
+ UMat query = _query.getUMat(), train = _train.getUMat();
+
+ int idx = 0;
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+ idx = k.set(idx, (void *)NULL, smemSize);
+ idx = k.set(idx, query.rows);
+ idx = k.set(idx, query.cols);
+ idx = k.set(idx, train.rows);
+ idx = k.set(idx, train.cols);
+ idx = k.set(idx, (int)query.step);
+
+ return k.run(2, globalSize, localSize, false);
+ }
+ return true;
+}
+
+template < int BLOCK_SIZE/*, typename Mask*/ >
+static bool ocl_match(InputArray _query, InputArray _train,
+ const UMat &trainIdx, const UMat &distance, int distType)
+{
+ int depth = _query.depth();
+ cv::String opts;
+ opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
+ ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE);
+ ocl::Kernel k("BruteForceMatch_Match", ocl::features2d::brute_force_match_oclsrc, opts);
+ if(k.empty())
+ return false;
+
+ size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
+ size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
+ const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+ if(globalSize[0] != 0)
+ {
+ UMat query = _query.getUMat(), train = _train.getUMat();
+
+ int idx = 0;
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+ idx = k.set(idx, (void *)NULL, smemSize);
+ idx = k.set(idx, query.rows);
+ idx = k.set(idx, query.cols);
+ idx = k.set(idx, train.rows);
+ idx = k.set(idx, train.cols);
+ idx = k.set(idx, (int)query.step);
+
+ return k.run(2, globalSize, localSize, false);
+ }
+ return true;
+}
+
+static bool ocl_matchDispatcher(InputArray query, InputArray train,
+ const UMat &trainIdx, const UMat &distance, int distType)
+{
+ int query_cols = query.size().width;
+ bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU;
+ if (query_cols <= 64)
+ {
+ if(!ocl_matchUnrolledCached<16, 64>(query, train, trainIdx, distance, distType)) return false;
+ }
+ else if (query_cols <= 128 && !is_cpu)
+ {
+ if(!ocl_matchUnrolledCached<16, 128>(query, train, trainIdx, distance, distType)) return false;
+ }
+ else
+ {
+ if(!ocl_match<16>(query, train, trainIdx, distance, distType)) return false;
+ }
+ return true;
+}
+
+static bool ocl_matchSingle(InputArray query, InputArray train,
+ UMat &trainIdx, UMat &distance, int dstType)
+{
+ if (query.empty() || train.empty())
+ return false;
+
+ int query_rows = query.size().height;
+
+ ensureSizeIsEnough(1, query_rows, CV_32S, trainIdx);
+ ensureSizeIsEnough(1, query_rows, CV_32F, distance);
+
+ return ocl_matchDispatcher(query, train, trainIdx, distance, dstType);
+}
+
+static bool ocl_matchConvert(const Mat &trainIdx, const Mat &distance, std::vector< std::vector<DMatch> > &matches)
+{
+ if (trainIdx.empty() || distance.empty())
+ return false;
+
+ if( (trainIdx.type() != CV_32SC1) || (distance.type() != CV_32FC1 || distance.cols != trainIdx.cols) )
+ return false;
+
+ const int nQuery = trainIdx.cols;
+
+ matches.clear();
+ matches.reserve(nQuery);
+
+ const int *trainIdx_ptr = trainIdx.ptr<int>();
+ const float *distance_ptr = distance.ptr<float>();
+ for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr)
+ {
+ int trainIndex = *trainIdx_ptr;
+
+ if (trainIndex == -1)
+ continue;
+
+ float dst = *distance_ptr;
+
+ DMatch m(queryIdx, trainIndex, 0, dst);
+
+ std::vector<DMatch> temp;
+ temp.push_back(m);
+ matches.push_back(temp);
+ }
+ return true;
+}
+
+static bool ocl_matchDownload(const UMat &trainIdx, const UMat &distance, std::vector< std::vector<DMatch> > &matches)
+{
+ if (trainIdx.empty() || distance.empty())
+ return false;
+
+ Mat trainIdxCPU = trainIdx.getMat(ACCESS_READ);
+ Mat distanceCPU = distance.getMat(ACCESS_READ);
+
+ return ocl_matchConvert(trainIdxCPU, distanceCPU, matches);
+}
+
+template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
+static bool ocl_knn_matchUnrolledCached(InputArray _query, InputArray _train,
+ const UMat &trainIdx, const UMat &distance, int distType)
+{
+ int depth = _query.depth();
+ cv::String opts;
+ opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
+ ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN );
+ ocl::Kernel k("BruteForceMatch_knnUnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts);
+ if(k.empty())
+ return false;
+
+ size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
+ size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
+ const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+ if(globalSize[0] != 0)
+ {
+ UMat query = _query.getUMat(), train = _train.getUMat();
+
+ int idx = 0;
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+ idx = k.set(idx, (void *)NULL, smemSize);
+ idx = k.set(idx, query.rows);
+ idx = k.set(idx, query.cols);
+ idx = k.set(idx, train.rows);
+ idx = k.set(idx, train.cols);
+ idx = k.set(idx, (int)query.step);
+
+ return k.run(2, globalSize, localSize, false);
+ }
+ return true;
+}
+
+template < int BLOCK_SIZE/*, typename Mask*/ >
+static bool ocl_knn_match(InputArray _query, InputArray _train,
+ const UMat &trainIdx, const UMat &distance, int distType)
+{
+ int depth = _query.depth();
+ cv::String opts;
+ opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
+ ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE);
+ ocl::Kernel k("BruteForceMatch_knnMatch", ocl::features2d::brute_force_match_oclsrc, opts);
+ if(k.empty())
+ return false;
+
+ size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
+ size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
+ const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+ if(globalSize[0] != 0)
+ {
+ UMat query = _query.getUMat(), train = _train.getUMat();
+
+ int idx = 0;
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+ idx = k.set(idx, (void*)NULL, smemSize);
+ idx = k.set(idx, query.rows);
+ idx = k.set(idx, query.cols);
+ idx = k.set(idx, train.rows);
+ idx = k.set(idx, train.cols);
+ idx = k.set(idx, (int)query.step);
+
+ return k.run(2, globalSize, localSize, false);
+ }
+ return true;
+}
+
+static bool ocl_match2Dispatcher(InputArray query, InputArray train, const UMat &trainIdx, const UMat &distance, int distType)
+{
+ bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU;
+ if (query.size().width <= 64)
+ {
+ if(!ocl_knn_matchUnrolledCached<16, 64>(query, train, trainIdx, distance, distType))
+ return false;
+ }
+ else if (query.size().width <= 128 && !is_cpu)
+ {
+ if(!ocl_knn_matchUnrolledCached<16, 128>(query, train, trainIdx, distance, distType))
+ return false;
+ }
+ else
+ {
+ if(!ocl_knn_match<16>(query, train, trainIdx, distance, distType))
+ return false;
+ }
+ return true;
+}
+
+template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
+static bool ocl_calcDistanceUnrolled(InputArray _query, InputArray _train, const UMat &allDist, int distType)
+{
+ int depth = _query.depth();
+ cv::String opts;
+ opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
+ ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN);
+ ocl::Kernel k("BruteForceMatch_calcDistanceUnrolled", ocl::features2d::brute_force_match_oclsrc, opts);
+ if(k.empty())
+ return false;
+
+ size_t globalSize[] = {(_query.size().width + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
+ size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
+ const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+ if(globalSize[0] != 0)
+ {
+ UMat query = _query.getUMat(), train = _train.getUMat();
+
+ int idx = 0;
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(allDist));
+ idx = k.set(idx, (void*)NULL, smemSize);
+ idx = k.set(idx, query.rows);
+ idx = k.set(idx, query.cols);
+ idx = k.set(idx, train.rows);
+ idx = k.set(idx, train.cols);
+ idx = k.set(idx, (int)query.step);
+
+ k.run(2, globalSize, localSize, false);
+ }
+ return false;// TODO in KERNEL
+}
+
+template < int BLOCK_SIZE/*, typename Mask*/ >
+static bool ocl_calcDistance(InputArray _query, InputArray _train, const UMat &allDist, int distType)
+{
+ int depth = _query.depth();
+ cv::String opts;
+ opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
+ ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE);
+ ocl::Kernel k("BruteForceMatch_calcDistance", ocl::features2d::brute_force_match_oclsrc, opts);
+ if(k.empty())
+ return false;
+
+ size_t globalSize[] = {(_query.size().width + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
+ size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
+ const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+ if(globalSize[0] != 0)
+ {
+ UMat query = _query.getUMat(), train = _train.getUMat();
+
+ int idx = 0;
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(allDist));
+ idx = k.set(idx, (void*)NULL, smemSize);
+ idx = k.set(idx, query.rows);
+ idx = k.set(idx, query.cols);
+ idx = k.set(idx, train.rows);
+ idx = k.set(idx, train.cols);
+ idx = k.set(idx, (int)query.step);
+
+ k.run(2, globalSize, localSize, false);
+ }
+ return false;// TODO in KERNEL
+}
+
+static bool ocl_calcDistanceDispatcher(InputArray query, InputArray train, const UMat &allDist, int distType)
+{
+ if (query.size().width <= 64)
+ {
+ if(!ocl_calcDistanceUnrolled<16, 64>(query, train, allDist, distType)) return false;
+ }
+ else if (query.size().width <= 128)
+ {
+ if(!ocl_calcDistanceUnrolled<16, 128>(query, train, allDist, distType)) return false;
+ }
+ else
+ {
+ if(!ocl_calcDistance<16>(query, train, allDist, distType)) return false;
+ }
+ return true;
+}
+
+template <int BLOCK_SIZE>
+static bool ocl_findKnnMatch(int k, const UMat &trainIdx, const UMat &distance, const UMat &allDist, int /*distType*/)
+{
+ return false;// TODO in KERNEL
+
+ std::vector<ocl::Kernel> kernels;
+ for (int i = 0; i < k; ++i)
+ {
+ ocl::Kernel kernel("BruteForceMatch_findBestMatch", ocl::features2d::brute_force_match_oclsrc);
+ if(kernel.empty())
+ return false;
+ kernels.push_back(kernel);
+ }
+
+ size_t globalSize[] = {trainIdx.rows * BLOCK_SIZE, 1, 1};
+ size_t localSize[] = {BLOCK_SIZE, 1, 1};
+ int block_size = BLOCK_SIZE;
+
+ for (int i = 0; i < k; ++i)
+ {
+ int idx = 0;
+ idx = kernels[i].set(idx, ocl::KernelArg::PtrReadOnly(allDist));
+ idx = kernels[i].set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+ idx = kernels[i].set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+ idx = kernels[i].set(idx, i);
+ idx = kernels[i].set(idx, block_size);
+// idx = kernels[i].set(idx, train.rows);
+// idx = kernels[i].set(idx, train.cols);
+// idx = kernels[i].set(idx, query.step);
+
+ if(!kernels[i].run(2, globalSize, localSize, false))
+ return false;
+ }
+ return true;
+}
+
+static bool ocl_findKnnMatchDispatcher(int k, const UMat &trainIdx, const UMat &distance, const UMat &allDist, int distType)
+{
+ return ocl_findKnnMatch<256>(k, trainIdx, distance, allDist, distType);
+}
+
+static bool ocl_kmatchDispatcher(InputArray query, InputArray train, int k, const UMat &trainIdx,
+ const UMat &distance, const UMat &allDist, int distType)
+{
+ if(k == 2)
+ {
+ if( !ocl_match2Dispatcher(query, train, trainIdx, distance, distType) ) return false;
+ }
+ else
+ {
+ if( !ocl_calcDistanceDispatcher(query, train, allDist, distType) ) return false;
+ if( !ocl_findKnnMatchDispatcher(k, trainIdx, distance, allDist, distType) ) return false;
+ }
+ return true;
+}
+
+static bool ocl_knnMatchSingle(InputArray query, InputArray train, UMat &trainIdx,
+ UMat &distance, UMat &allDist, int k, int dstType)
+{
+ if (query.empty() || train.empty())
+ return false;
+
+ const int nQuery = query.size().height;
+ const int nTrain = train.size().height;
+
+ if (k == 2)
+ {
+ ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
+ ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
+ }
+ else
+ {
+ ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx);
+ ensureSizeIsEnough(nQuery, k, CV_32F, distance);
+ ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);
+ }
+
+ trainIdx.setTo(Scalar::all(-1));
+
+ return ocl_kmatchDispatcher(query, train, k, trainIdx, distance, allDist, dstType);
+}
+
+static bool ocl_knnMatchConvert(const Mat &trainIdx, const Mat &distance, std::vector< std::vector<DMatch> > &matches, bool compactResult)
+{
+ if (trainIdx.empty() || distance.empty())
+ return false;
+
+ if(trainIdx.type() != CV_32SC2 && trainIdx.type() != CV_32SC1) return false;
+ if(distance.type() != CV_32FC2 && distance.type() != CV_32FC1)return false;
+ if(distance.size() != trainIdx.size()) return false;
+ if(!trainIdx.isContinuous() || !distance.isContinuous()) return false;
+
+ const int nQuery = trainIdx.type() == CV_32SC2 ? trainIdx.cols : trainIdx.rows;
+ const int k = trainIdx.type() == CV_32SC2 ? 2 : trainIdx.cols;
+
+ matches.clear();
+ matches.reserve(nQuery);
+
+ const int *trainIdx_ptr = trainIdx.ptr<int>();
+ const float *distance_ptr = distance.ptr<float>();
+
+ for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
+ {
+ matches.push_back(std::vector<DMatch>());
+ std::vector<DMatch> &curMatches = matches.back();
+ curMatches.reserve(k);
+
+ for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)
+ {
+ int trainIndex = *trainIdx_ptr;
+
+ if (trainIndex != -1)
+ {
+ float dst = *distance_ptr;
+
+ DMatch m(queryIdx, trainIndex, 0, dst);
+
+ curMatches.push_back(m);
+ }
+ }
+
+ if (compactResult && curMatches.empty())
+ matches.pop_back();
+ }
+ return true;
+}
+
+static bool ocl_knnMatchDownload(const UMat &trainIdx, const UMat &distance, std::vector< std::vector<DMatch> > &matches, bool compactResult)
+{
+ if (trainIdx.empty() || distance.empty())
+ return false;
+
+ Mat trainIdxCPU = trainIdx.getMat(ACCESS_READ);
+ Mat distanceCPU = distance.getMat(ACCESS_READ);
+
+ if (ocl_knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult) )
+ return true;
+ return false;
+}
+
+template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
+static bool ocl_matchUnrolledCached(InputArray _query, InputArray _train, float maxDistance,
+ const UMat &trainIdx, const UMat &distance, const UMat &nMatches, int distType)
+{
+ int depth = _query.depth();
+ cv::String opts;
+ opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
+ ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN);
+ ocl::Kernel k("BruteForceMatch_RadiusUnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts);
+ if(k.empty())
+ return false;
+
+ size_t globalSize[] = {(_train.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
+ size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
+ const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+ if(globalSize[0] != 0)
+ {
+ UMat query = _query.getUMat(), train = _train.getUMat();
+
+ int idx = 0;
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
+ idx = k.set(idx, maxDistance);
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(nMatches));
+ idx = k.set(idx, (void*)NULL, smemSize);
+ idx = k.set(idx, query.rows);
+ idx = k.set(idx, query.cols);
+ idx = k.set(idx, train.rows);
+ idx = k.set(idx, train.cols);
+ idx = k.set(idx, trainIdx.cols);
+ idx = k.set(idx, (int)query.step);
+ idx = k.set(idx, (int)trainIdx.step);
+
+ return k.run(2, globalSize, localSize, false);
+ }
+ return true;
+}
+
+//radius_match
+template < int BLOCK_SIZE/*, typename Mask*/ >
+static bool ocl_radius_match(InputArray _query, InputArray _train, float maxDistance,
+ const UMat &trainIdx, const UMat &distance, const UMat &nMatches, int distType)
+{
+ int depth = _query.depth();
+ cv::String opts;
+ opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d", ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE);
+ ocl::Kernel k("BruteForceMatch_RadiusMatch", ocl::features2d::brute_force_match_oclsrc, opts);
+ if(k.empty())
+ return false;
+
+ size_t globalSize[] = {(_train.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
+ size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
+ const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+ if(globalSize[0] != 0)
+ {
+ UMat query = _query.getUMat(), train = _train.getUMat();
+
+ int idx = 0;
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
+ idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
+ idx = k.set(idx, maxDistance);
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+ idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(nMatches));
+ idx = k.set(idx, (void*)NULL, smemSize);
+ idx = k.set(idx, query.rows);
+ idx = k.set(idx, query.cols);
+ idx = k.set(idx, train.rows);
+ idx = k.set(idx, train.cols);
+ idx = k.set(idx, trainIdx.cols);
+ idx = k.set(idx, (int)query.step);
+ idx = k.set(idx, (int)trainIdx.step);
+
+ return k.run(2, globalSize, localSize, false);
+ }
+ return true;
+}
+
+static bool ocl_rmatchDispatcher(InputArray query, InputArray train,
+ UMat &trainIdx, UMat &distance, UMat &nMatches, float maxDistance, int distType)
+{
+ bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU;
+ int query_cols = query.size().width;
+ if (query_cols <= 64)
+ {
+ if(!ocl_matchUnrolledCached<16, 64>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false;
+ }
+ else if (query_cols <= 128 && !is_cpu)
+ {
+ if(!ocl_matchUnrolledCached<16, 128>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false;
+ }
+ else
+ {
+ if(!ocl_radius_match<16>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false;
+ }
+ return true;
+}
+
+
+static bool ocl_radiusMatchSingle(InputArray query, InputArray train,
+ UMat &trainIdx, UMat &distance, UMat &nMatches, float maxDistance, int distType)
+{
+ if (query.empty() || train.empty())
+ return false;
+
+ const int nQuery = query.size().height;
+ const int nTrain = train.size().height;
+
+ ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
+
+ if (trainIdx.empty())
+ {
+ ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx);
+ ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance);
+ }
+
+ nMatches.setTo(Scalar::all(0));
+
+ return ocl_rmatchDispatcher(query, train, trainIdx, distance, nMatches, maxDistance, distType);
+}
+
+static bool ocl_radiusMatchConvert(const Mat &trainIdx, const Mat &distance, const Mat &_nMatches,
+ std::vector< std::vector<DMatch> > &matches, bool compactResult)
+{
+ if (trainIdx.empty() || distance.empty() || _nMatches.empty())
+ return false;
+
+ if( (trainIdx.type() != CV_32SC1) ||
+ (distance.type() != CV_32FC1 || distance.size() != trainIdx.size()) ||
+ (_nMatches.type() != CV_32SC1 || _nMatches.cols != trainIdx.rows) )
+ return false;
+
+ const int nQuery = trainIdx.rows;
+
+ matches.clear();
+ matches.reserve(nQuery);
+
+ const int *nMatches_ptr = _nMatches.ptr<int>();
+
+ for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
+ {
+ const int *trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
+ const float *distance_ptr = distance.ptr<float>(queryIdx);
+
+ const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
+
+ if (nMatches == 0)
+ {
+ if (!compactResult)
+ matches.push_back(std::vector<DMatch>());
+ continue;
+ }
+
+ matches.push_back(std::vector<DMatch>(nMatches));
+ std::vector<DMatch> &curMatches = matches.back();
+
+ for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr)
+ {
+ int trainIndex = *trainIdx_ptr;
+
+ float dst = *distance_ptr;
+
+ DMatch m(queryIdx, trainIndex, 0, dst);
+
+ curMatches[i] = m;
+ }
+
+ std::sort(curMatches.begin(), curMatches.end());
+ }
+ return true;
+}
+
+static bool ocl_radiusMatchDownload(const UMat &trainIdx, const UMat &distance, const UMat &nMatches,
+ std::vector< std::vector<DMatch> > &matches, bool compactResult)
+{
+ if (trainIdx.empty() || distance.empty() || nMatches.empty())
+ return false;
+
+ Mat trainIdxCPU = trainIdx.getMat(ACCESS_READ);
+ Mat distanceCPU = distance.getMat(ACCESS_READ);
+ Mat nMatchesCPU = nMatches.getMat(ACCESS_READ);
+
+ return ocl_radiusMatchConvert(trainIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult);
+}
+
/****************************************************************************************\
* DescriptorMatcher *
\****************************************************************************************/
DescriptorMatcher::~DescriptorMatcher()
{}
-void DescriptorMatcher::add( const std::vector<Mat>& descriptors )
+void DescriptorMatcher::add( InputArrayOfArrays _descriptors )
{
- trainDescCollection.insert( trainDescCollection.end(), descriptors.begin(), descriptors.end() );
+ if(_descriptors.isUMatVector())
+ {
+ std::vector<UMat> descriptors;
+ _descriptors.getUMatVector(descriptors);
+ utrainDescCollection.insert( utrainDescCollection.end(), descriptors.begin(), descriptors.end() );
+ }
+ else if(_descriptors.isUMat())
+ {
+ std::vector<UMat> descriptors = std::vector<UMat>(1, _descriptors.getUMat());
+ utrainDescCollection.insert( utrainDescCollection.end(), descriptors.begin(), descriptors.end() );
+ }
+ else if(_descriptors.isMatVector())
+ {
+ std::vector<Mat> descriptors;
+ _descriptors.getMatVector(descriptors);
+ trainDescCollection.insert( trainDescCollection.end(), descriptors.begin(), descriptors.end() );
+ }
+ else if(_descriptors.isMat())
+ {
+ std::vector<Mat> descriptors = std::vector<Mat>(1, _descriptors.getMat());
+ trainDescCollection.insert( trainDescCollection.end(), descriptors.begin(), descriptors.end() );
+ }
+ else
+ CV_Assert( _descriptors.isUMat() || _descriptors.isUMatVector() || _descriptors.isMat() || _descriptors.isMatVector() );
}
const std::vector<Mat>& DescriptorMatcher::getTrainDescriptors() const
void DescriptorMatcher::clear()
{
+ utrainDescCollection.clear();
trainDescCollection.clear();
}
bool DescriptorMatcher::empty() const
{
- return trainDescCollection.empty();
+ return trainDescCollection.empty() && utrainDescCollection.empty();
}
void DescriptorMatcher::train()
{}
-void DescriptorMatcher::match( const Mat& queryDescriptors, const Mat& trainDescriptors, std::vector<DMatch>& matches, const Mat& mask ) const
+void DescriptorMatcher::match( InputArray queryDescriptors, InputArray trainDescriptors,
+ std::vector<DMatch>& matches, InputArray mask ) const
{
Ptr<DescriptorMatcher> tempMatcher = clone(true);
- tempMatcher->add( std::vector<Mat>(1, trainDescriptors) );
- tempMatcher->match( queryDescriptors, matches, std::vector<Mat>(1, mask) );
+ tempMatcher->add(trainDescriptors);
+ tempMatcher->match( queryDescriptors, matches, std::vector<Mat>(1, mask.getMat()) );
}
-void DescriptorMatcher::knnMatch( const Mat& queryDescriptors, const Mat& trainDescriptors, std::vector<std::vector<DMatch> >& matches, int knn,
- const Mat& mask, bool compactResult ) const
+void DescriptorMatcher::knnMatch( InputArray queryDescriptors, InputArray trainDescriptors,
+ std::vector<std::vector<DMatch> >& matches, int knn,
+ InputArray mask, bool compactResult ) const
{
Ptr<DescriptorMatcher> tempMatcher = clone(true);
- tempMatcher->add( std::vector<Mat>(1, trainDescriptors) );
- tempMatcher->knnMatch( queryDescriptors, matches, knn, std::vector<Mat>(1, mask), compactResult );
+ tempMatcher->add(trainDescriptors);
+ tempMatcher->knnMatch( queryDescriptors, matches, knn, std::vector<Mat>(1, mask.getMat()), compactResult );
}
-void DescriptorMatcher::radiusMatch( const Mat& queryDescriptors, const Mat& trainDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
- const Mat& mask, bool compactResult ) const
+void DescriptorMatcher::radiusMatch( InputArray queryDescriptors, InputArray trainDescriptors,
+ std::vector<std::vector<DMatch> >& matches, float maxDistance, InputArray mask,
+ bool compactResult ) const
{
Ptr<DescriptorMatcher> tempMatcher = clone(true);
- tempMatcher->add( std::vector<Mat>(1, trainDescriptors) );
- tempMatcher->radiusMatch( queryDescriptors, matches, maxDistance, std::vector<Mat>(1, mask), compactResult );
+ tempMatcher->add(trainDescriptors);
+ tempMatcher->radiusMatch( queryDescriptors, matches, maxDistance, std::vector<Mat>(1, mask.getMat()), compactResult );
}
-void DescriptorMatcher::match( const Mat& queryDescriptors, std::vector<DMatch>& matches, const std::vector<Mat>& masks )
+void DescriptorMatcher::match( InputArray queryDescriptors, std::vector<DMatch>& matches, const std::vector<Mat>& masks )
{
std::vector<std::vector<DMatch> > knnMatches;
knnMatch( queryDescriptors, knnMatches, 1, masks, true /*compactResult*/ );
if( isMaskSupported() && !masks.empty() )
{
// Check masks
- size_t imageCount = trainDescCollection.size();
+ size_t imageCount = std::max(trainDescCollection.size(), utrainDescCollection.size() );
CV_Assert( masks.size() == imageCount );
for( size_t i = 0; i < imageCount; i++ )
{
- if( !masks[i].empty() && !trainDescCollection[i].empty() )
+ if( !masks[i].empty() && (!trainDescCollection[i].empty() || !utrainDescCollection[i].empty() ) )
{
+ int rows = trainDescCollection[i].empty() ? utrainDescCollection[i].rows : trainDescCollection[i].rows;
CV_Assert( masks[i].rows == queryDescriptorsCount &&
- masks[i].cols == trainDescCollection[i].rows &&
- masks[i].type() == CV_8UC1 );
+ (masks[i].cols == rows || masks[i].cols == rows) &&
+ masks[i].type() == CV_8UC1 );
}
}
}
}
-void DescriptorMatcher::knnMatch( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, int knn,
+void DescriptorMatcher::knnMatch( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int knn,
const std::vector<Mat>& masks, bool compactResult )
{
- matches.clear();
if( empty() || queryDescriptors.empty() )
return;
CV_Assert( knn > 0 );
- checkMasks( masks, queryDescriptors.rows );
+ checkMasks( masks, queryDescriptors.size().height );
train();
knnMatchImpl( queryDescriptors, matches, knn, masks, compactResult );
}
-void DescriptorMatcher::radiusMatch( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+void DescriptorMatcher::radiusMatch( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
const std::vector<Mat>& masks, bool compactResult )
{
matches.clear();
CV_Assert( maxDistance > std::numeric_limits<float>::epsilon() );
- checkMasks( masks, queryDescriptors.rows );
+ checkMasks( masks, queryDescriptors.size().height );
train();
radiusMatchImpl( queryDescriptors, matches, maxDistance, masks, compactResult );
}
-///////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////// BruteForceMatcher /////////////////////////////////////////////////
BFMatcher::BFMatcher( int _normType, bool _crossCheck )
{
return matcher;
}
+bool BFMatcher::ocl_match(InputArray query, InputArray _train, std::vector< std::vector<DMatch> > &matches, int dstType)
+{
+ UMat trainIdx, distance;
+ if(!ocl_matchSingle(query, _train, trainIdx, distance, dstType)) return false;
+ if(!ocl_matchDownload(trainIdx, distance, matches)) return false;
+ return true;
+}
+
+bool BFMatcher::ocl_knnMatch(InputArray query, InputArray _train, std::vector< std::vector<DMatch> > &matches, int k, int dstType, bool compactResult)
+{
+ UMat trainIdx, distance, allDist;
+ if (!ocl_knnMatchSingle(query, _train, trainIdx, distance, allDist, k, dstType)) return false;
+ if( !ocl_knnMatchDownload(trainIdx, distance, matches, compactResult) ) return false;
+ return true;
+}
-void BFMatcher::knnMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, int knn,
- const std::vector<Mat>& masks, bool compactResult )
+void BFMatcher::knnMatchImpl( InputArray _queryDescriptors, std::vector<std::vector<DMatch> >& matches, int knn,
+ InputArrayOfArrays _masks, bool compactResult )
{
+ int trainDescType = trainDescCollection.empty() ? utrainDescCollection[0].type() : trainDescCollection[0].type();
+ CV_Assert( _queryDescriptors.type() == trainDescType );
+
const int IMGIDX_SHIFT = 18;
const int IMGIDX_ONE = (1 << IMGIDX_SHIFT);
- if( queryDescriptors.empty() || trainDescCollection.empty() )
+ if( _queryDescriptors.empty() || (trainDescCollection.empty() && utrainDescCollection.empty()))
{
matches.clear();
return;
}
- CV_Assert( queryDescriptors.type() == trainDescCollection[0].type() );
+
+ std::vector<Mat> masks;
+ _masks.getMatVector(masks);
+
+ if(!trainDescCollection.empty() && !utrainDescCollection.empty())
+ {
+ for(int i = 0; i < (int)utrainDescCollection.size(); i++)
+ {
+ Mat tempMat;
+ utrainDescCollection[i].copyTo(tempMat);
+ trainDescCollection.push_back(tempMat);
+ }
+ utrainDescCollection.clear();
+ }
+
+ int trainDescVectorSize = trainDescCollection.empty() ? (int)utrainDescCollection.size() : (int)trainDescCollection.size();
+ Size trainDescSize = trainDescCollection.empty() ? utrainDescCollection[0].size() : trainDescCollection[0].size();
+
+ if ( ocl::useOpenCL() && _queryDescriptors.isUMat() && _queryDescriptors.dims()<=2 && trainDescVectorSize == 1 &&
+ _queryDescriptors.type() == CV_32FC1 &&
+ trainDescSize.width == _queryDescriptors.size().width && masks.size() == 1 && masks[0].total() == 0 )
+ {
+ if(knn == 1)
+ {
+ if(trainDescCollection.empty())
+ {
+ if(ocl_match(_queryDescriptors, utrainDescCollection[0], matches, normType))
+ return;
+ }
+ else
+ {
+ if(ocl_match(_queryDescriptors, trainDescCollection[0], matches, normType))
+ return;
+ }
+ }
+ else
+ {
+ if(trainDescCollection.empty())
+ {
+ if(ocl_knnMatch(_queryDescriptors, utrainDescCollection[0], matches, knn, normType, compactResult) )
+ return;
+ }
+ else
+ {
+ if(ocl_knnMatch(_queryDescriptors, trainDescCollection[0], matches, knn, normType, compactResult) )
+ return;
+ }
+ }
+ }
+
+ Mat queryDescriptors = _queryDescriptors.getMat();
+ if(trainDescCollection.empty() && !utrainDescCollection.empty())
+ {
+ for(int i = 0; i < (int)utrainDescCollection.size(); i++)
+ {
+ Mat tempMat;
+ utrainDescCollection[i].copyTo(tempMat);
+ trainDescCollection.push_back(tempMat);
+ }
+ utrainDescCollection.clear();
+ }
matches.reserve(queryDescriptors.rows);
}
}
+bool BFMatcher::ocl_radiusMatch(InputArray query, InputArray _train, std::vector< std::vector<DMatch> > &matches,
+ float maxDistance, int dstType, bool compactResult)
+{
+ UMat trainIdx, distance, nMatches;
+ if(!ocl_radiusMatchSingle(query, _train, trainIdx, distance, nMatches, maxDistance, dstType)) return false;
+ if(!ocl_radiusMatchDownload(trainIdx, distance, nMatches, matches, compactResult)) return false;
+ return true;
+}
-void BFMatcher::radiusMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches,
- float maxDistance, const std::vector<Mat>& masks, bool compactResult )
+void BFMatcher::radiusMatchImpl( InputArray _queryDescriptors, std::vector<std::vector<DMatch> >& matches,
+ float maxDistance, InputArrayOfArrays _masks, bool compactResult )
{
- if( queryDescriptors.empty() || trainDescCollection.empty() )
+ int trainDescType = trainDescCollection.empty() ? utrainDescCollection[0].type() : trainDescCollection[0].type();
+ CV_Assert( _queryDescriptors.type() == trainDescType );
+
+ if( _queryDescriptors.empty() || (trainDescCollection.empty() && utrainDescCollection.empty()))
{
matches.clear();
return;
}
- CV_Assert( queryDescriptors.type() == trainDescCollection[0].type() );
+
+ std::vector<Mat> masks;
+ _masks.getMatVector(masks);
+
+ if(!trainDescCollection.empty() && !utrainDescCollection.empty())
+ {
+ for(int i = 0; i < (int)utrainDescCollection.size(); i++)
+ {
+ Mat tempMat;
+ utrainDescCollection[i].copyTo(tempMat);
+ trainDescCollection.push_back(tempMat);
+ }
+ utrainDescCollection.clear();
+ }
+
+ int trainDescVectorSize = trainDescCollection.empty() ? (int)utrainDescCollection.size() : (int)trainDescCollection.size();
+ Size trainDescSize = trainDescCollection.empty() ? utrainDescCollection[0].size() : trainDescCollection[0].size();
+
+ if ( ocl::useOpenCL() && _queryDescriptors.isUMat() && _queryDescriptors.dims()<=2 && trainDescVectorSize == 1 &&
+ _queryDescriptors.type() == CV_32FC1 &&
+ trainDescSize.width == _queryDescriptors.size().width && masks.size() == 1 && masks[0].total() == 0 )
+ {
+ if(trainDescCollection.empty())
+ {
+ if(ocl_radiusMatch(_queryDescriptors, utrainDescCollection[0], matches, maxDistance, normType, compactResult) )
+ return;
+ }
+ else
+ {
+ if(ocl_radiusMatch(_queryDescriptors, trainDescCollection[0], matches, maxDistance, normType, compactResult) )
+ return;
+ }
+ }
+
+ Mat queryDescriptors = _queryDescriptors.getMat();
+ if(trainDescCollection.empty() && !utrainDescCollection.empty())
+ {
+ for(int i = 0; i < (int)utrainDescCollection.size(); i++)
+ {
+ Mat tempMat;
+ utrainDescCollection[i].copyTo(tempMat);
+ trainDescCollection.push_back(tempMat);
+ }
+ utrainDescCollection.clear();
+ }
matches.resize(queryDescriptors.rows);
Mat dist, distf;
}
}
-void FlannBasedMatcher::knnMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, int knn,
- const std::vector<Mat>& /*masks*/, bool /*compactResult*/ )
+void FlannBasedMatcher::knnMatchImpl( InputArray _queryDescriptors, std::vector<std::vector<DMatch> >& matches, int knn,
+ InputArrayOfArrays /*masks*/, bool /*compactResult*/ )
{
+ Mat queryDescriptors = _queryDescriptors.getMat();
Mat indices( queryDescriptors.rows, knn, CV_32SC1 );
Mat dists( queryDescriptors.rows, knn, CV_32FC1);
flannIndex->knnSearch( queryDescriptors, indices, dists, knn, *searchParams );
convertToDMatches( mergedDescriptors, indices, dists, matches );
}
-void FlannBasedMatcher::radiusMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
- const std::vector<Mat>& /*masks*/, bool /*compactResult*/ )
+void FlannBasedMatcher::radiusMatchImpl( InputArray _queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+ InputArrayOfArrays /*masks*/, bool /*compactResult*/ )
{
+ Mat queryDescriptors = _queryDescriptors.getMat();
const int count = mergedDescriptors.size(); // TODO do count as param?
Mat indices( queryDescriptors.rows, count, CV_32SC1, Scalar::all(-1) );
Mat dists( queryDescriptors.rows, count, CV_32FC1, Scalar::all(-1) );
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+// Nathan, liujun@multicorewareinc.com
+// Peng Xiao, pengxiao@outlook.com
+// Baichuan Su, baichuan@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
+#define MAX_FLOAT 3.40282e+038f
+
+#ifndef T
+#define T float
+#endif
+
+#ifndef BLOCK_SIZE
+#define BLOCK_SIZE 16
+#endif
+#ifndef MAX_DESC_LEN
+#define MAX_DESC_LEN 64
+#endif
+
+#ifndef DIST_TYPE
+#define DIST_TYPE 2
+#endif
+
+// dirty fix for non-template support
+#if (DIST_TYPE == 2) // L1Dist
+# ifdef T_FLOAT
+# define DIST(x, y) fabs((x) - (y))
+ typedef float value_type;
+ typedef float result_type;
+# else
+# define DIST(x, y) abs((x) - (y))
+ typedef int value_type;
+ typedef int result_type;
+# endif
+#define DIST_RES(x) (x)
+#elif (DIST_TYPE == 4) // L2Dist
+#define DIST(x, y) (((x) - (y)) * ((x) - (y)))
+typedef float value_type;
+typedef float result_type;
+#define DIST_RES(x) sqrt(x)
+#elif (DIST_TYPE == 6) // Hamming
+//http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+inline int bit1Count(int v)
+{
+ v = v - ((v >> 1) & 0x55555555); // reuse input as temporary
+ v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // temp
+ return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; // count
+}
+#define DIST(x, y) bit1Count( (x) ^ (y) )
+typedef int value_type;
+typedef int result_type;
+#define DIST_RES(x) (x)
+#endif
+
+inline result_type reduce_block(
+ __local value_type *s_query,
+ __local value_type *s_train,
+ int lidx,
+ int lidy
+ )
+{
+ result_type result = 0;
+ #pragma unroll
+ for (int j = 0 ; j < BLOCK_SIZE ; j++)
+ {
+ result += DIST(
+ s_query[lidy * BLOCK_SIZE + j],
+ s_train[j * BLOCK_SIZE + lidx]);
+ }
+ return DIST_RES(result);
+}
+
+inline result_type reduce_block_match(
+ __local value_type *s_query,
+ __local value_type *s_train,
+ int lidx,
+ int lidy
+ )
+{
+ result_type result = 0;
+ #pragma unroll
+ for (int j = 0 ; j < BLOCK_SIZE ; j++)
+ {
+ result += DIST(
+ s_query[lidy * BLOCK_SIZE + j],
+ s_train[j * BLOCK_SIZE + lidx]);
+ }
+ return (result);
+}
+
+inline result_type reduce_multi_block(
+ __local value_type *s_query,
+ __local value_type *s_train,
+ int block_index,
+ int lidx,
+ int lidy
+ )
+{
+ result_type result = 0;
+ #pragma unroll
+ for (int j = 0 ; j < BLOCK_SIZE ; j++)
+ {
+ result += DIST(
+ s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j],
+ s_train[j * BLOCK_SIZE + lidx]);
+ }
+ return result;
+}
+
+/* 2dim launch, global size: dim0 is (query rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, dim1 is BLOCK_SIZE
+local size: dim0 is BLOCK_SIZE, dim1 is BLOCK_SIZE.
+*/
+__kernel void BruteForceMatch_UnrollMatch(
+ __global T *query,
+ __global T *train,
+ //__global float *mask,
+ __global int *bestTrainIdx,
+ __global float *bestDistance,
+ __local float *sharebuffer,
+ int query_rows,
+ int query_cols,
+ int train_rows,
+ int train_cols,
+ int step
+)
+{
+ const int lidx = get_local_id(0);
+ const int lidy = get_local_id(1);
+ const int groupidx = get_group_id(0);
+
+ __local value_type *s_query = (__local value_type *)sharebuffer;
+ __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
+
+ int queryIdx = groupidx * BLOCK_SIZE + lidy;
+ // load the query into local memory.
+ #pragma unroll
+ for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
+ {
+ int loadx = lidx + i * BLOCK_SIZE;
+ s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+ }
+
+ float myBestDistance = MAX_FLOAT;
+ int myBestTrainIdx = -1;
+
+ // loopUnrolledCached to find the best trainIdx and best distance.
+ for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)
+ {
+ result_type result = 0;
+ #pragma unroll
+ for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
+ {
+ //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
+ const int loadx = lidx + i * BLOCK_SIZE;
+ s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+
+ //synchronize to make sure each elem for reduceIteration in share memory is written already.
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ result += reduce_multi_block(s_query, s_train, i, lidx, lidy);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ result = DIST_RES(result);
+
+ int trainIdx = t * BLOCK_SIZE + lidx;
+
+ if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
+ {
+ myBestDistance = result;
+ myBestTrainIdx = trainIdx;
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ __local float *s_distance = (__local float*)(sharebuffer);
+ __local int* s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
+
+ //find BestMatch
+ s_distance += lidy * BLOCK_SIZE;
+ s_trainIdx += lidy * BLOCK_SIZE;
+ s_distance[lidx] = myBestDistance;
+ s_trainIdx[lidx] = myBestTrainIdx;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ //reduce -- now all reduce implement in each threads.
+ #pragma unroll
+ for (int k = 0 ; k < BLOCK_SIZE; k++)
+ {
+ if (myBestDistance > s_distance[k])
+ {
+ myBestDistance = s_distance[k];
+ myBestTrainIdx = s_trainIdx[k];
+ }
+ }
+
+ if (queryIdx < query_rows && lidx == 0)
+ {
+ bestTrainIdx[queryIdx] = myBestTrainIdx;
+ bestDistance[queryIdx] = myBestDistance;
+ }
+}
+
+__kernel void BruteForceMatch_Match(
+ __global T *query,
+ __global T *train,
+ //__global float *mask,
+ __global int *bestTrainIdx,
+ __global float *bestDistance,
+ __local float *sharebuffer,
+ int query_rows,
+ int query_cols,
+ int train_rows,
+ int train_cols,
+ int step
+)
+{
+ const int lidx = get_local_id(0);
+ const int lidy = get_local_id(1);
+ const int groupidx = get_group_id(0);
+
+ const int queryIdx = groupidx * BLOCK_SIZE + lidy;
+
+ float myBestDistance = MAX_FLOAT;
+ int myBestTrainIdx = -1;
+
+ __local value_type *s_query = (__local value_type *)sharebuffer;
+ __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
+
+ // loop
+ for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
+ {
+ result_type result = 0;
+ for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; i++)
+ {
+ const int loadx = lidx + i * BLOCK_SIZE;
+ //load query and train into local memory
+ s_query[lidy * BLOCK_SIZE + lidx] = 0;
+ s_train[lidx * BLOCK_SIZE + lidy] = 0;
+
+ if (loadx < query_cols)
+ {
+ s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
+ s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ result += reduce_block_match(s_query, s_train, lidx, lidy);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ result = DIST_RES(result);
+
+ const int trainIdx = t * BLOCK_SIZE + lidx;
+
+ if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
+ {
+ myBestDistance = result;
+ myBestTrainIdx = trainIdx;
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ __local float *s_distance = (__local float *)sharebuffer;
+ __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
+
+ //findBestMatch
+ s_distance += lidy * BLOCK_SIZE;
+ s_trainIdx += lidy * BLOCK_SIZE;
+ s_distance[lidx] = myBestDistance;
+ s_trainIdx[lidx] = myBestTrainIdx;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ //reduce -- now all reduce implement in each threads.
+ for (int k = 0 ; k < BLOCK_SIZE; k++)
+ {
+ if (myBestDistance > s_distance[k])
+ {
+ myBestDistance = s_distance[k];
+ myBestTrainIdx = s_trainIdx[k];
+ }
+ }
+
+ if (queryIdx < query_rows && lidx == 0)
+ {
+ bestTrainIdx[queryIdx] = myBestTrainIdx;
+ bestDistance[queryIdx] = myBestDistance;
+ }
+}
+
+//radius_unrollmatch
+__kernel void BruteForceMatch_RadiusUnrollMatch(
+ __global T *query,
+ __global T *train,
+ float maxDistance,
+ //__global float *mask,
+ __global int *bestTrainIdx,
+ __global float *bestDistance,
+ __global int *nMatches,
+ __local float *sharebuffer,
+ int query_rows,
+ int query_cols,
+ int train_rows,
+ int train_cols,
+ int bestTrainIdx_cols,
+ int step,
+ int ostep
+)
+{
+ const int lidx = get_local_id(0);
+ const int lidy = get_local_id(1);
+ const int groupidx = get_group_id(0);
+ const int groupidy = get_group_id(1);
+
+ const int queryIdx = groupidy * BLOCK_SIZE + lidy;
+ const int trainIdx = groupidx * BLOCK_SIZE + lidx;
+
+ __local value_type *s_query = (__local value_type *)sharebuffer;
+ __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
+
+ result_type result = 0;
+ for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; ++i)
+ {
+ //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
+ const int loadx = lidx + i * BLOCK_SIZE;
+
+ s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+ s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+
+ //synchronize to make sure each elem for reduceIteration in share memory is written already.
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ result += reduce_block(s_query, s_train, lidx, lidy);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ if (queryIdx < query_rows && trainIdx < train_rows &&
+ convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
+ {
+ int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
+
+ if(ind < bestTrainIdx_cols)
+ {
+ bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
+ bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
+ }
+ }
+}
+
+//radius_match
+__kernel void BruteForceMatch_RadiusMatch(
+ __global T *query,
+ __global T *train,
+ float maxDistance,
+ //__global float *mask,
+ __global int *bestTrainIdx,
+ __global float *bestDistance,
+ __global int *nMatches,
+ __local float *sharebuffer,
+ int query_rows,
+ int query_cols,
+ int train_rows,
+ int train_cols,
+ int bestTrainIdx_cols,
+ int step,
+ int ostep
+)
+{
+ const int lidx = get_local_id(0);
+ const int lidy = get_local_id(1);
+ const int groupidx = get_group_id(0);
+ const int groupidy = get_group_id(1);
+
+ const int queryIdx = groupidy * BLOCK_SIZE + lidy;
+ const int trainIdx = groupidx * BLOCK_SIZE + lidx;
+
+ __local value_type *s_query = (__local value_type *)sharebuffer;
+ __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
+
+ result_type result = 0;
+ for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)
+ {
+ //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
+ const int loadx = lidx + i * BLOCK_SIZE;
+
+ s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+ s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+
+ //synchronize to make sure each elem for reduceIteration in share memory is written already.
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ result += reduce_block(s_query, s_train, lidx, lidy);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ if (queryIdx < query_rows && trainIdx < train_rows &&
+ convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
+ {
+ int ind = atom_inc(nMatches + queryIdx);
+
+ if(ind < bestTrainIdx_cols)
+ {
+ bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
+ bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
+ }
+ }
+}
+
+
+__kernel void BruteForceMatch_knnUnrollMatch(
+ __global T *query,
+ __global T *train,
+ //__global float *mask,
+ __global int2 *bestTrainIdx,
+ __global float2 *bestDistance,
+ __local float *sharebuffer,
+ int query_rows,
+ int query_cols,
+ int train_rows,
+ int train_cols,
+ int step
+)
+{
+ const int lidx = get_local_id(0);
+ const int lidy = get_local_id(1);
+ const int groupidx = get_group_id(0);
+
+ const int queryIdx = groupidx * BLOCK_SIZE + lidy;
+ __local value_type *s_query = (__local value_type *)sharebuffer;
+ __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
+
+ // load the query into local memory.
+ for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
+ {
+ int loadx = lidx + i * BLOCK_SIZE;
+ s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+ }
+
+ float myBestDistance1 = MAX_FLOAT;
+ float myBestDistance2 = MAX_FLOAT;
+ int myBestTrainIdx1 = -1;
+ int myBestTrainIdx2 = -1;
+
+ //loopUnrolledCached
+ for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
+ {
+ result_type result = 0;
+ for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
+ {
+ //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
+ const int loadx = lidx + i * BLOCK_SIZE;
+ s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+
+ //synchronize to make sure each elem for reduceIteration in share memory is written already.
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ result += reduce_multi_block(s_query, s_train, i, lidx, lidy);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ result = DIST_RES(result);
+
+ const int trainIdx = t * BLOCK_SIZE + lidx;
+
+ if (queryIdx < query_rows && trainIdx < train_rows)
+ {
+ if (result < myBestDistance1)
+ {
+ myBestDistance2 = myBestDistance1;
+ myBestTrainIdx2 = myBestTrainIdx1;
+ myBestDistance1 = result;
+ myBestTrainIdx1 = trainIdx;
+ }
+ else if (result < myBestDistance2)
+ {
+ myBestDistance2 = result;
+ myBestTrainIdx2 = trainIdx;
+ }
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ __local float *s_distance = (local float *)sharebuffer;
+ __local int *s_trainIdx = (local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
+
+ // find BestMatch
+ s_distance += lidy * BLOCK_SIZE;
+ s_trainIdx += lidy * BLOCK_SIZE;
+
+ s_distance[lidx] = myBestDistance1;
+ s_trainIdx[lidx] = myBestTrainIdx1;
+
+ float bestDistance1 = MAX_FLOAT;
+ float bestDistance2 = MAX_FLOAT;
+ int bestTrainIdx1 = -1;
+ int bestTrainIdx2 = -1;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (lidx == 0)
+ {
+ for (int i = 0 ; i < BLOCK_SIZE ; i++)
+ {
+ float val = s_distance[i];
+ if (val < bestDistance1)
+ {
+ bestDistance2 = bestDistance1;
+ bestTrainIdx2 = bestTrainIdx1;
+
+ bestDistance1 = val;
+ bestTrainIdx1 = s_trainIdx[i];
+ }
+ else if (val < bestDistance2)
+ {
+ bestDistance2 = val;
+ bestTrainIdx2 = s_trainIdx[i];
+ }
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ s_distance[lidx] = myBestDistance2;
+ s_trainIdx[lidx] = myBestTrainIdx2;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (lidx == 0)
+ {
+ for (int i = 0 ; i < BLOCK_SIZE ; i++)
+ {
+ float val = s_distance[i];
+
+ if (val < bestDistance2)
+ {
+ bestDistance2 = val;
+ bestTrainIdx2 = s_trainIdx[i];
+ }
+ }
+ }
+
+ myBestDistance1 = bestDistance1;
+ myBestDistance2 = bestDistance2;
+
+ myBestTrainIdx1 = bestTrainIdx1;
+ myBestTrainIdx2 = bestTrainIdx2;
+
+ if (queryIdx < query_rows && lidx == 0)
+ {
+ bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
+ bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
+ }
+}
+
+__kernel void BruteForceMatch_knnMatch(
+ __global T *query,
+ __global T *train,
+ //__global float *mask,
+ __global int2 *bestTrainIdx,
+ __global float2 *bestDistance,
+ __local float *sharebuffer,
+ int query_rows,
+ int query_cols,
+ int train_rows,
+ int train_cols,
+ int step
+)
+{
+ const int lidx = get_local_id(0);
+ const int lidy = get_local_id(1);
+ const int groupidx = get_group_id(0);
+
+ const int queryIdx = groupidx * BLOCK_SIZE + lidy;
+ __local value_type *s_query = (__local value_type *)sharebuffer;
+ __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
+
+ float myBestDistance1 = MAX_FLOAT;
+ float myBestDistance2 = MAX_FLOAT;
+ int myBestTrainIdx1 = -1;
+ int myBestTrainIdx2 = -1;
+
+ //loop
+ for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
+ {
+ result_type result = 0.0f;
+ for (int i = 0 ; i < (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE ; i++)
+ {
+ const int loadx = lidx + i * BLOCK_SIZE;
+ //load query and train into local memory
+ s_query[lidy * BLOCK_SIZE + lidx] = 0;
+ s_train[lidx * BLOCK_SIZE + lidy] = 0;
+
+ if (loadx < query_cols)
+ {
+ s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
+ s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ result += reduce_block_match(s_query, s_train, lidx, lidy);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ result = DIST_RES(result);
+
+ const int trainIdx = t * BLOCK_SIZE + lidx;
+
+ if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
+ {
+ if (result < myBestDistance1)
+ {
+ myBestDistance2 = myBestDistance1;
+ myBestTrainIdx2 = myBestTrainIdx1;
+ myBestDistance1 = result;
+ myBestTrainIdx1 = trainIdx;
+ }
+ else if (result < myBestDistance2)
+ {
+ myBestDistance2 = result;
+ myBestTrainIdx2 = trainIdx;
+ }
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ __local float *s_distance = (__local float *)sharebuffer;
+ __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
+
+ //findBestMatch
+ s_distance += lidy * BLOCK_SIZE;
+ s_trainIdx += lidy * BLOCK_SIZE;
+
+ s_distance[lidx] = myBestDistance1;
+ s_trainIdx[lidx] = myBestTrainIdx1;
+
+ float bestDistance1 = MAX_FLOAT;
+ float bestDistance2 = MAX_FLOAT;
+ int bestTrainIdx1 = -1;
+ int bestTrainIdx2 = -1;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (lidx == 0)
+ {
+ for (int i = 0 ; i < BLOCK_SIZE ; i++)
+ {
+ float val = s_distance[i];
+ if (val < bestDistance1)
+ {
+ bestDistance2 = bestDistance1;
+ bestTrainIdx2 = bestTrainIdx1;
+
+ bestDistance1 = val;
+ bestTrainIdx1 = s_trainIdx[i];
+ }
+ else if (val < bestDistance2)
+ {
+ bestDistance2 = val;
+ bestTrainIdx2 = s_trainIdx[i];
+ }
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ s_distance[lidx] = myBestDistance2;
+ s_trainIdx[lidx] = myBestTrainIdx2;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (lidx == 0)
+ {
+ for (int i = 0 ; i < BLOCK_SIZE ; i++)
+ {
+ float val = s_distance[i];
+
+ if (val < bestDistance2)
+ {
+ bestDistance2 = val;
+ bestTrainIdx2 = s_trainIdx[i];
+ }
+ }
+ }
+
+ myBestDistance1 = bestDistance1;
+ myBestDistance2 = bestDistance2;
+
+ myBestTrainIdx1 = bestTrainIdx1;
+ myBestTrainIdx2 = bestTrainIdx2;
+
+ if (queryIdx < query_rows && lidx == 0)
+ {
+ bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
+ bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
+ }
+}
+
+kernel void BruteForceMatch_calcDistanceUnrolled(
+ __global T *query,
+ __global T *train,
+ //__global float *mask,
+ __global float *allDist,
+ __local float *sharebuffer,
+ int query_rows,
+ int query_cols,
+ int train_rows,
+ int train_cols,
+ int step)
+{
+ /* Todo */
+}
+
+kernel void BruteForceMatch_calcDistance(
+ __global T *query,
+ __global T *train,
+ //__global float *mask,
+ __global float *allDist,
+ __local float *sharebuffer,
+ int query_rows,
+ int query_cols,
+ int train_rows,
+ int train_cols,
+ int step)
+{
+ /* Todo */
+}
+
+kernel void BruteForceMatch_findBestMatch(
+ __global float *allDist,
+ __global int *bestTrainIdx,
+ __global float *bestDistance,
+ int k
+)
+{
+ /* Todo */
+}