From ca5689e0dbb2c91f3a9d3072074786f0dc66d054 Mon Sep 17 00:00:00 2001 From: Konstantin Matskevich Date: Fri, 27 Dec 2013 13:04:02 +0400 Subject: [PATCH] BFMatcher match radiusMatch --- modules/core/include/opencv2/core/mat.hpp | 3 +- modules/core/include/opencv2/core/mat.inl.hpp | 2 +- modules/core/src/matrix.cpp | 36 + modules/features2d/include/opencv2/features2d.hpp | 51 +- modules/features2d/src/matchers.cpp | 911 ++++++++++++++++++++- modules/features2d/src/opencl/brute_force_match.cl | 789 ++++++++++++++++++ modules/features2d/src/precomp.hpp | 1 + .../test/ocl/test_brute_force_matcher.cpp | 213 +++++ 8 files changed, 1945 insertions(+), 61 deletions(-) create mode 100644 modules/features2d/src/opencl/brute_force_match.cl create mode 100644 modules/features2d/test/ocl/test_brute_force_matcher.cpp diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp index a7a4f1b..6c09efd 100644 --- a/modules/core/include/opencv2/core/mat.hpp +++ b/modules/core/include/opencv2/core/mat.hpp @@ -113,6 +113,7 @@ public: virtual Mat getMat(int idx=-1) const; virtual UMat getUMat(int idx=-1) const; virtual void getMatVector(std::vector& mv) const; + virtual void getUMatVector(std::vector& umv) const; virtual cuda::GpuMat getGpuMat() const; virtual ogl::Buffer getOGlBuffer() const; void* getObj() const; @@ -134,7 +135,7 @@ public: virtual size_t step(int i=-1) const; bool isMat() const; bool isUMat() const; - bool isMatVectot() const; + bool isMatVector() const; bool isUMatVector() const; bool isMatx(); diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp index ed0d18d..fd7b061 100644 --- a/modules/core/include/opencv2/core/mat.inl.hpp +++ b/modules/core/include/opencv2/core/mat.inl.hpp @@ -110,7 +110,7 @@ inline _InputArray::~_InputArray() {} inline bool _InputArray::isMat() const { return kind() == _InputArray::MAT; } inline bool _InputArray::isUMat() const { return kind() == _InputArray::UMAT; } -inline bool _InputArray::isMatVectot() const { return kind() == _InputArray::STD_VECTOR_MAT; } +inline bool _InputArray::isMatVector() const { return kind() == _InputArray::STD_VECTOR_MAT; } inline bool _InputArray::isUMatVector() const { return kind() == _InputArray::STD_VECTOR_UMAT; } inline bool _InputArray::isMatx() { return kind() == _InputArray::MATX; } diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index 2d97319..48576cc 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -1324,6 +1324,42 @@ void _InputArray::getMatVector(std::vector& mv) const CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type"); } +void _InputArray::getUMatVector(std::vector& umv) const +{ + int k = kind(); + int accessFlags = flags & ACCESS_MASK; + + if( k == NONE ) + { + umv.clear(); + return; + } + + if( k == STD_VECTOR_MAT ) + { + const std::vector& v = *(const std::vector*)obj; + size_t i, n = v.size(); + umv.resize(n); + + for( i = 0; i < n; i++ ) + umv[i] = v[i].getUMat(accessFlags); + return; + } + + if( k == STD_VECTOR_UMAT ) + { + const std::vector& v = *(const std::vector*)obj; + size_t i, n = v.size(); + umv.resize(n); + + for( i = 0; i < n; i++ ) + umv[i] = v[i]; + return; + } + + CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type"); +} + cuda::GpuMat _InputArray::getGpuMat() const { int k = kind(); diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp index 6aad4be..5936d93 100644 
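Illustrative sketch, not part of the patch: the _InputArray::getUMatVector() accessor added in matrix.cpp above lets callers hand either a std::vector<Mat> or a std::vector<UMat> through InputArrayOfArrays and always get UMats back, which is what the matcher code later in this patch relies on. A minimal usage sketch, assuming an OpenCV build with this change applied (the helper printDescriptorSizes is hypothetical, invented only for illustration):

#include <opencv2/core.hpp>
#include <cstdio>
#include <vector>

// Hypothetical helper: accepts either vector<Mat> or vector<UMat> transparently.
static void printDescriptorSizes(cv::InputArrayOfArrays descriptors)
{
    std::vector<cv::UMat> umats;
    descriptors.getUMatVector(umats); // handles both STD_VECTOR_MAT and STD_VECTOR_UMAT kinds
    for (size_t i = 0; i < umats.size(); i++)
        std::printf("descriptor set %d: %d x %d\n", (int)i, umats[i].rows, umats[i].cols);
}

int main()
{
    std::vector<cv::Mat> descriptors(2);
    descriptors[0] = cv::Mat::zeros(100, 32, CV_32F);
    descriptors[1] = cv::Mat::zeros(200, 32, CV_32F);
    printDescriptorSizes(descriptors); // the Mats are promoted to UMats on the fly
    return 0;
}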
--- a/modules/features2d/include/opencv2/features2d.hpp +++ b/modules/features2d/include/opencv2/features2d.hpp @@ -998,7 +998,7 @@ public: * Add descriptors to train descriptor collection. * descriptors Descriptors to add. Each descriptors[i] is a descriptors set from one image. */ - CV_WRAP virtual void add( const std::vector& descriptors ); + CV_WRAP virtual void add( InputArray descriptors ); /* * Get train descriptors collection. */ @@ -1034,29 +1034,29 @@ public: * Method train() is run in this methods. */ // Find one best match for each query descriptor (if mask is empty). - CV_WRAP void match( const Mat& queryDescriptors, const Mat& trainDescriptors, - CV_OUT std::vector& matches, const Mat& mask=Mat() ) const; + CV_WRAP void match( InputArray queryDescriptors, InputArray trainDescriptors, + CV_OUT std::vector& matches, InputArray mask=Mat() ) const; // Find k best matches for each query descriptor (in increasing order of distances). // compactResult is used when mask is not empty. If compactResult is false matches // vector will have the same size as queryDescriptors rows. If compactResult is true // matches vector will not contain matches for fully masked out query descriptors. - CV_WRAP void knnMatch( const Mat& queryDescriptors, const Mat& trainDescriptors, + CV_WRAP void knnMatch( InputArray queryDescriptors, InputArray trainDescriptors, CV_OUT std::vector >& matches, int k, - const Mat& mask=Mat(), bool compactResult=false ) const; + InputArray mask=Mat(), bool compactResult=false ) const; // Find best matches for each query descriptor which have distance less than // maxDistance (in increasing order of distances). - void radiusMatch( const Mat& queryDescriptors, const Mat& trainDescriptors, + void radiusMatch( InputArray queryDescriptors, InputArray trainDescriptors, std::vector >& matches, float maxDistance, - const Mat& mask=Mat(), bool compactResult=false ) const; + InputArray mask=Mat(), bool compactResult=false ) const; /* * Group of methods to match descriptors from one image to image set. * See description of similar methods for matching image pair above. */ - CV_WRAP void match( const Mat& queryDescriptors, CV_OUT std::vector& matches, + CV_WRAP void match( InputArray queryDescriptors, CV_OUT std::vector& matches, const std::vector& masks=std::vector() ); - CV_WRAP void knnMatch( const Mat& queryDescriptors, CV_OUT std::vector >& matches, int k, + CV_WRAP void knnMatch( InputArray queryDescriptors, CV_OUT std::vector >& matches, int k, const std::vector& masks=std::vector(), bool compactResult=false ); - void radiusMatch( const Mat& queryDescriptors, std::vector >& matches, float maxDistance, + void radiusMatch( InputArray queryDescriptors, std::vector >& matches, float maxDistance, const std::vector& masks=std::vector(), bool compactResult=false ); // Reads matcher object from a file node @@ -1101,10 +1101,10 @@ protected: // In fact the matching is implemented only by the following two methods. These methods suppose // that the class object has been trained already. Public match methods call these methods // after calling train(). 
- virtual void knnMatchImpl( const Mat& queryDescriptors, std::vector >& matches, int k, - const std::vector& masks=std::vector(), bool compactResult=false ) = 0; - virtual void radiusMatchImpl( const Mat& queryDescriptors, std::vector >& matches, float maxDistance, - const std::vector& masks=std::vector(), bool compactResult=false ) = 0; + virtual void knnMatchImpl( InputArray queryDescriptors, std::vector >& matches, int k, + InputArrayOfArrays masks=std::vector(), bool compactResult=false ) = 0; + virtual void radiusMatchImpl( InputArray queryDescriptors, std::vector >& matches, float maxDistance, + InputArrayOfArrays masks=std::vector(), bool compactResult=false ) = 0; static bool isPossibleMatch( const Mat& mask, int queryIdx, int trainIdx ); static bool isMaskedOut( const std::vector& masks, int queryIdx ); @@ -1114,6 +1114,7 @@ protected: // Collection of descriptors from train images. std::vector trainDescCollection; + std::vector utrainDescCollection; }; /* @@ -1137,10 +1138,16 @@ public: AlgorithmInfo* info() const; protected: - virtual void knnMatchImpl( const Mat& queryDescriptors, std::vector >& matches, int k, - const std::vector& masks=std::vector(), bool compactResult=false ); - virtual void radiusMatchImpl( const Mat& queryDescriptors, std::vector >& matches, float maxDistance, - const std::vector& masks=std::vector(), bool compactResult=false ); + virtual void knnMatchImpl( InputArray queryDescriptors, std::vector >& matches, int k, + InputArrayOfArrays masks=std::vector(), bool compactResult=false ); + virtual void radiusMatchImpl( InputArray queryDescriptors, std::vector >& matches, float maxDistance, + InputArrayOfArrays masks=std::vector(), bool compactResult=false ); + + bool ocl_knnMatch(InputArray query, InputArray train, std::vector< std::vector > &matches, + int k, int dstType, bool compactResult=false); + bool ocl_radiusMatch(InputArray query, InputArray train, std::vector< std::vector > &matches, + float maxDistance, int dstType, bool compactResult=false); + bool ocl_match(InputArray query, InputArray train, std::vector< std::vector > &matches, int dstType); int normType; bool crossCheck; @@ -1175,10 +1182,10 @@ protected: const Mat& indices, const Mat& distances, std::vector >& matches ); - virtual void knnMatchImpl( const Mat& queryDescriptors, std::vector >& matches, int k, - const std::vector& masks=std::vector(), bool compactResult=false ); - virtual void radiusMatchImpl( const Mat& queryDescriptors, std::vector >& matches, float maxDistance, - const std::vector& masks=std::vector(), bool compactResult=false ); + virtual void knnMatchImpl( InputArray queryDescriptors, std::vector >& matches, int k, + InputArrayOfArrays masks=std::vector(), bool compactResult=false ); + virtual void radiusMatchImpl( InputArray queryDescriptors, std::vector >& matches, float maxDistance, + InputArrayOfArrays masks=std::vector(), bool compactResult=false ); Ptr indexParams; Ptr searchParams; diff --git a/modules/features2d/src/matchers.cpp b/modules/features2d/src/matchers.cpp index 087c6a7..5b692b9 100644 --- a/modules/features2d/src/matchers.cpp +++ b/modules/features2d/src/matchers.cpp @@ -41,6 +41,7 @@ #include "precomp.hpp" #include +#include "opencl_kernels.hpp" #if defined(HAVE_EIGEN) && EIGEN_WORLD_VERSION == 2 #include @@ -68,6 +69,680 @@ Mat windowedMatchingMask( const std::vector& keypoints1, const std::ve return mask; } +//////////////////////////////////////////////////////////////////ocl functions for BFMatcher 
/////////////////////////////////////////////////////////////// + +static void ensureSizeIsEnough(int rows, int cols, int type, UMat &m) +{ + if (m.type() == type && m.rows >= rows && m.cols >= cols) + m = m(Rect(0, 0, cols, rows)); + else + m.create(rows, cols, type); +} + + +template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ > +static bool ocl_matchUnrolledCached(InputArray _query, InputArray _train, + const UMat &trainIdx, const UMat &distance, int distType) +{ + int depth = _query.depth(); + cv::String opts; + opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d", + ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN ); + ocl::Kernel k("BruteForceMatch_UnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts); + if(k.empty()) + return false; + + size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1}; + size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1}; + const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + if(globalSize[0] != 0) + { + UMat query = _query.getUMat(), train = _train.getUMat(); + + int idx = 0; + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query)); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance)); + idx = k.set(idx, (void *)NULL, smemSize); + idx = k.set(idx, query.rows); + idx = k.set(idx, query.cols); + idx = k.set(idx, train.rows); + idx = k.set(idx, train.cols); + idx = k.set(idx, (int)query.step); + + return k.run(2, globalSize, localSize, false); + } + return true; +} + +template < int BLOCK_SIZE/*, typename Mask*/ > +static bool ocl_match(InputArray _query, InputArray _train, + const UMat &trainIdx, const UMat &distance, int distType) +{ + int depth = _query.depth(); + cv::String opts; + opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d", + ocl::typeToStr(depth), depth == CV_32F ? 
"-D T_FLOAT" : "", distType, (int)BLOCK_SIZE); + ocl::Kernel k("BruteForceMatch_Match", ocl::features2d::brute_force_match_oclsrc, opts); + if(k.empty()) + return false; + + size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1}; + size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1}; + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + if(globalSize[0] != 0) + { + UMat query = _query.getUMat(), train = _train.getUMat(); + + int idx = 0; + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query)); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance)); + idx = k.set(idx, (void *)NULL, smemSize); + idx = k.set(idx, query.rows); + idx = k.set(idx, query.cols); + idx = k.set(idx, train.rows); + idx = k.set(idx, train.cols); + idx = k.set(idx, (int)query.step); + + return k.run(2, globalSize, localSize, false); + } + return true; +} + +static bool ocl_matchDispatcher(InputArray query, InputArray train, + const UMat &trainIdx, const UMat &distance, int distType) +{ + int query_cols = query.size().width; + bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU; + if (query_cols <= 64) + { + if(!ocl_matchUnrolledCached<16, 64>(query, train, trainIdx, distance, distType)) return false; + } + else if (query_cols <= 128 && !is_cpu) + { + if(!ocl_matchUnrolledCached<16, 128>(query, train, trainIdx, distance, distType)) return false; + } + else + { + if(!ocl_match<16>(query, train, trainIdx, distance, distType)) return false; + } + return true; +} + +static bool ocl_matchSingle(InputArray query, InputArray train, + UMat &trainIdx, UMat &distance, int dstType) +{ + if (query.empty() || train.empty()) + return false; + + int query_rows = query.size().height; + + ensureSizeIsEnough(1, query_rows, CV_32S, trainIdx); + ensureSizeIsEnough(1, query_rows, CV_32F, distance); + + return ocl_matchDispatcher(query, train, trainIdx, distance, dstType); +} + +static bool ocl_matchConvert(const Mat &trainIdx, const Mat &distance, std::vector< std::vector > &matches) +{ + if (trainIdx.empty() || distance.empty()) + return false; + + if( (trainIdx.type() != CV_32SC1) || (distance.type() != CV_32FC1 || distance.cols != trainIdx.cols) ) + return false; + + const int nQuery = trainIdx.cols; + + matches.clear(); + matches.reserve(nQuery); + + const int *trainIdx_ptr = trainIdx.ptr(); + const float *distance_ptr = distance.ptr(); + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr) + { + int trainIndex = *trainIdx_ptr; + + if (trainIndex == -1) + continue; + + float dst = *distance_ptr; + + DMatch m(queryIdx, trainIndex, 0, dst); + + std::vector temp; + temp.push_back(m); + matches.push_back(temp); + } + return true; +} + +static bool ocl_matchDownload(const UMat &trainIdx, const UMat &distance, std::vector< std::vector > &matches) +{ + if (trainIdx.empty() || distance.empty()) + return false; + + Mat trainIdxCPU = trainIdx.getMat(ACCESS_READ); + Mat distanceCPU = distance.getMat(ACCESS_READ); + + return ocl_matchConvert(trainIdxCPU, distanceCPU, matches); +} + +template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ > +static bool ocl_knn_matchUnrolledCached(InputArray _query, InputArray _train, + const UMat &trainIdx, const UMat &distance, int distType) +{ + int depth = _query.depth(); + cv::String opts; + opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D 
MAX_DESC_LEN=%d", + ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN ); + ocl::Kernel k("BruteForceMatch_knnUnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts); + if(k.empty()) + return false; + + size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1}; + size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1}; + const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + if(globalSize[0] != 0) + { + UMat query = _query.getUMat(), train = _train.getUMat(); + + int idx = 0; + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query)); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance)); + idx = k.set(idx, (void *)NULL, smemSize); + idx = k.set(idx, query.rows); + idx = k.set(idx, query.cols); + idx = k.set(idx, train.rows); + idx = k.set(idx, train.cols); + idx = k.set(idx, (int)query.step); + + return k.run(2, globalSize, localSize, false); + } + return true; +} + +template < int BLOCK_SIZE/*, typename Mask*/ > +static bool ocl_knn_match(InputArray _query, InputArray _train, + const UMat &trainIdx, const UMat &distance, int distType) +{ + int depth = _query.depth(); + cv::String opts; + opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d", + ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE); + ocl::Kernel k("BruteForceMatch_knnMatch", ocl::features2d::brute_force_match_oclsrc, opts); + if(k.empty()) + return false; + + size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1}; + size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1}; + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + if(globalSize[0] != 0) + { + UMat query = _query.getUMat(), train = _train.getUMat(); + + int idx = 0; + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query)); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance)); + idx = k.set(idx, (void*)NULL, smemSize); + idx = k.set(idx, query.rows); + idx = k.set(idx, query.cols); + idx = k.set(idx, train.rows); + idx = k.set(idx, train.cols); + idx = k.set(idx, (int)query.step); + + return k.run(2, globalSize, localSize, false); + } + return true; +} + +static bool ocl_match2Dispatcher(InputArray query, InputArray train, const UMat &trainIdx, const UMat &distance, int distType) +{ + bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU; + if (query.size().width <= 64) + { + if(!ocl_knn_matchUnrolledCached<16, 64>(query, train, trainIdx, distance, distType)) + return false; + } + else if (query.size().width <= 128 && !is_cpu) + { + if(!ocl_knn_matchUnrolledCached<16, 128>(query, train, trainIdx, distance, distType)) + return false; + } + else + { + if(!ocl_knn_match<16>(query, train, trainIdx, distance, distType)) + return false; + } + return true; +} + +template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ > +static bool ocl_calcDistanceUnrolled(InputArray _query, InputArray _train, const UMat &allDist, int distType) +{ + int depth = _query.depth(); + cv::String opts; + opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d", + ocl::typeToStr(depth), depth == CV_32F ? 
"-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN); + ocl::Kernel k("BruteForceMatch_calcDistanceUnrolled", ocl::features2d::brute_force_match_oclsrc, opts); + if(k.empty()) + return false; + + size_t globalSize[] = {(_query.size().width + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1}; + size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1}; + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + if(globalSize[0] != 0) + { + UMat query = _query.getUMat(), train = _train.getUMat(); + + int idx = 0; + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query)); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(allDist)); + idx = k.set(idx, (void*)NULL, smemSize); + idx = k.set(idx, query.rows); + idx = k.set(idx, query.cols); + idx = k.set(idx, train.rows); + idx = k.set(idx, train.cols); + idx = k.set(idx, (int)query.step); + + k.run(2, globalSize, localSize, false); + } + return false;// TODO in KERNEL +} + +template < int BLOCK_SIZE/*, typename Mask*/ > +static bool ocl_calcDistance(InputArray _query, InputArray _train, const UMat &allDist, int distType) +{ + int depth = _query.depth(); + cv::String opts; + opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d", + ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE); + ocl::Kernel k("BruteForceMatch_calcDistance", ocl::features2d::brute_force_match_oclsrc, opts); + if(k.empty()) + return false; + + size_t globalSize[] = {(_query.size().width + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1}; + size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1}; + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + if(globalSize[0] != 0) + { + UMat query = _query.getUMat(), train = _train.getUMat(); + + int idx = 0; + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query)); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(allDist)); + idx = k.set(idx, (void*)NULL, smemSize); + idx = k.set(idx, query.rows); + idx = k.set(idx, query.cols); + idx = k.set(idx, train.rows); + idx = k.set(idx, train.cols); + idx = k.set(idx, (int)query.step); + + k.run(2, globalSize, localSize, false); + } + return false;// TODO in KERNEL +} + +static bool ocl_calcDistanceDispatcher(InputArray query, InputArray train, const UMat &allDist, int distType) +{ + if (query.size().width <= 64) + { + if(!ocl_calcDistanceUnrolled<16, 64>(query, train, allDist, distType)) return false; + } + else if (query.size().width <= 128) + { + if(!ocl_calcDistanceUnrolled<16, 128>(query, train, allDist, distType)) return false; + } + else + { + if(!ocl_calcDistance<16>(query, train, allDist, distType)) return false; + } + return true; +} + +template +static bool ocl_findKnnMatch(int k, const UMat &trainIdx, const UMat &distance, const UMat &allDist, int /*distType*/) +{ + return false;// TODO in KERNEL + + std::vector kernels; + for (int i = 0; i < k; ++i) + { + ocl::Kernel kernel("BruteForceMatch_findBestMatch", ocl::features2d::brute_force_match_oclsrc); + if(kernel.empty()) + return false; + kernels.push_back(kernel); + } + + size_t globalSize[] = {trainIdx.rows * BLOCK_SIZE, 1, 1}; + size_t localSize[] = {BLOCK_SIZE, 1, 1}; + int block_size = BLOCK_SIZE; + + for (int i = 0; i < k; ++i) + { + int idx = 0; + idx = kernels[i].set(idx, ocl::KernelArg::PtrReadOnly(allDist)); + idx = kernels[i].set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx)); + idx = kernels[i].set(idx, 
ocl::KernelArg::PtrWriteOnly(distance)); + idx = kernels[i].set(idx, i); + idx = kernels[i].set(idx, block_size); +// idx = kernels[i].set(idx, train.rows); +// idx = kernels[i].set(idx, train.cols); +// idx = kernels[i].set(idx, query.step); + + if(!kernels[i].run(2, globalSize, localSize, false)) + return false; + } + return true; +} + +static bool ocl_findKnnMatchDispatcher(int k, const UMat &trainIdx, const UMat &distance, const UMat &allDist, int distType) +{ + return ocl_findKnnMatch<256>(k, trainIdx, distance, allDist, distType); +} + +static bool ocl_kmatchDispatcher(InputArray query, InputArray train, int k, const UMat &trainIdx, + const UMat &distance, const UMat &allDist, int distType) +{ + if(k == 2) + { + if( !ocl_match2Dispatcher(query, train, trainIdx, distance, distType) ) return false; + } + else + { + if( !ocl_calcDistanceDispatcher(query, train, allDist, distType) ) return false; + if( !ocl_findKnnMatchDispatcher(k, trainIdx, distance, allDist, distType) ) return false; + } + return true; +} + +static bool ocl_knnMatchSingle(InputArray query, InputArray train, UMat &trainIdx, + UMat &distance, UMat &allDist, int k, int dstType) +{ + if (query.empty() || train.empty()) + return false; + + const int nQuery = query.size().height; + const int nTrain = train.size().height; + + if (k == 2) + { + ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx); + ensureSizeIsEnough(1, nQuery, CV_32FC2, distance); + } + else + { + ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx); + ensureSizeIsEnough(nQuery, k, CV_32F, distance); + ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist); + } + + trainIdx.setTo(Scalar::all(-1)); + + return ocl_kmatchDispatcher(query, train, k, trainIdx, distance, allDist, dstType); +} + +static bool ocl_knnMatchConvert(const Mat &trainIdx, const Mat &distance, std::vector< std::vector > &matches, bool compactResult) +{ + if (trainIdx.empty() || distance.empty()) + return false; + + if(trainIdx.type() != CV_32SC2 && trainIdx.type() != CV_32SC1) return false; + if(distance.type() != CV_32FC2 && distance.type() != CV_32FC1)return false; + if(distance.size() != trainIdx.size()) return false; + if(!trainIdx.isContinuous() || !distance.isContinuous()) return false; + + const int nQuery = trainIdx.type() == CV_32SC2 ? trainIdx.cols : trainIdx.rows; + const int k = trainIdx.type() == CV_32SC2 ? 
2 : trainIdx.cols; + + matches.clear(); + matches.reserve(nQuery); + + const int *trainIdx_ptr = trainIdx.ptr(); + const float *distance_ptr = distance.ptr(); + + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) + { + matches.push_back(std::vector()); + std::vector &curMatches = matches.back(); + curMatches.reserve(k); + + for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr) + { + int trainIndex = *trainIdx_ptr; + + if (trainIndex != -1) + { + float dst = *distance_ptr; + + DMatch m(queryIdx, trainIndex, 0, dst); + + curMatches.push_back(m); + } + } + + if (compactResult && curMatches.empty()) + matches.pop_back(); + } + return true; +} + +static bool ocl_knnMatchDownload(const UMat &trainIdx, const UMat &distance, std::vector< std::vector > &matches, bool compactResult) +{ + if (trainIdx.empty() || distance.empty()) + return false; + + Mat trainIdxCPU = trainIdx.getMat(ACCESS_READ); + Mat distanceCPU = distance.getMat(ACCESS_READ); + + if (ocl_knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult) ) + return true; + return false; +} + +template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ > +static bool ocl_matchUnrolledCached(InputArray _query, InputArray _train, float maxDistance, + const UMat &trainIdx, const UMat &distance, const UMat &nMatches, int distType) +{ + int depth = _query.depth(); + cv::String opts; + opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d", + ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN); + ocl::Kernel k("BruteForceMatch_RadiusUnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts); + if(k.empty()) + return false; + + size_t globalSize[] = {(_train.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1}; + size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1}; + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + if(globalSize[0] != 0) + { + UMat query = _query.getUMat(), train = _train.getUMat(); + + int idx = 0; + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query)); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train)); + idx = k.set(idx, maxDistance); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(nMatches)); + idx = k.set(idx, (void*)NULL, smemSize); + idx = k.set(idx, query.rows); + idx = k.set(idx, query.cols); + idx = k.set(idx, train.rows); + idx = k.set(idx, train.cols); + idx = k.set(idx, trainIdx.cols); + idx = k.set(idx, (int)query.step); + idx = k.set(idx, (int)trainIdx.step); + + return k.run(2, globalSize, localSize, false); + } + return true; +} + +//radius_match +template < int BLOCK_SIZE/*, typename Mask*/ > +static bool ocl_radius_match(InputArray _query, InputArray _train, float maxDistance, + const UMat &trainIdx, const UMat &distance, const UMat &nMatches, int distType) +{ + int depth = _query.depth(); + cv::String opts; + opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d", ocl::typeToStr(depth), depth == CV_32F ? 
"-D T_FLOAT" : "", distType, (int)BLOCK_SIZE); + ocl::Kernel k("BruteForceMatch_RadiusMatch", ocl::features2d::brute_force_match_oclsrc, opts); + if(k.empty()) + return false; + + size_t globalSize[] = {(_train.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1}; + size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1}; + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + if(globalSize[0] != 0) + { + UMat query = _query.getUMat(), train = _train.getUMat(); + + int idx = 0; + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query)); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train)); + idx = k.set(idx, maxDistance); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(nMatches)); + idx = k.set(idx, (void*)NULL, smemSize); + idx = k.set(idx, query.rows); + idx = k.set(idx, query.cols); + idx = k.set(idx, train.rows); + idx = k.set(idx, train.cols); + idx = k.set(idx, trainIdx.cols); + idx = k.set(idx, (int)query.step); + idx = k.set(idx, (int)trainIdx.step); + + return k.run(2, globalSize, localSize, false); + } + return true; +} + +static bool ocl_rmatchDispatcher(InputArray query, InputArray train, + UMat &trainIdx, UMat &distance, UMat &nMatches, float maxDistance, int distType) +{ + bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU; + int query_cols = query.size().width; + if (query_cols <= 64) + { + if(!ocl_matchUnrolledCached<16, 64>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false; + } + else if (query_cols <= 128 && !is_cpu) + { + if(!ocl_matchUnrolledCached<16, 128>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false; + } + else + { + if(!ocl_radius_match<16>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false; + } + return true; +} + + +static bool ocl_radiusMatchSingle(InputArray query, InputArray train, + UMat &trainIdx, UMat &distance, UMat &nMatches, float maxDistance, int distType) +{ + if (query.empty() || train.empty()) + return false; + + const int nQuery = query.size().height; + const int nTrain = train.size().height; + + ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches); + + if (trainIdx.empty()) + { + ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx); + ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance); + } + + nMatches.setTo(Scalar::all(0)); + + return ocl_rmatchDispatcher(query, train, trainIdx, distance, nMatches, maxDistance, distType); +} + +static bool ocl_radiusMatchConvert(const Mat &trainIdx, const Mat &distance, const Mat &_nMatches, + std::vector< std::vector > &matches, bool compactResult) +{ + if (trainIdx.empty() || distance.empty() || _nMatches.empty()) + return false; + + if( (trainIdx.type() != CV_32SC1) || + (distance.type() != CV_32FC1 || distance.size() != trainIdx.size()) || + (_nMatches.type() != CV_32SC1 || _nMatches.cols != trainIdx.rows) ) + return false; + + const int nQuery = trainIdx.rows; + + matches.clear(); + matches.reserve(nQuery); + + const int *nMatches_ptr = _nMatches.ptr(); + + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) + { + const int *trainIdx_ptr = trainIdx.ptr(queryIdx); + const float *distance_ptr = distance.ptr(queryIdx); + + const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols); + + if (nMatches == 0) + { + if (!compactResult) + 
matches.push_back(std::vector()); + continue; + } + + matches.push_back(std::vector(nMatches)); + std::vector &curMatches = matches.back(); + + for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr) + { + int trainIndex = *trainIdx_ptr; + + float dst = *distance_ptr; + + DMatch m(queryIdx, trainIndex, 0, dst); + + curMatches[i] = m; + } + + std::sort(curMatches.begin(), curMatches.end()); + } + return true; +} + +static bool ocl_radiusMatchDownload(const UMat &trainIdx, const UMat &distance, const UMat &nMatches, + std::vector< std::vector > &matches, bool compactResult) +{ + if (trainIdx.empty() || distance.empty() || nMatches.empty()) + return false; + + Mat trainIdxCPU = trainIdx.getMat(ACCESS_READ); + Mat distanceCPU = distance.getMat(ACCESS_READ); + Mat nMatchesCPU = nMatches.getMat(ACCESS_READ); + + return ocl_radiusMatchConvert(trainIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult); +} + /****************************************************************************************\ * DescriptorMatcher * \****************************************************************************************/ @@ -190,9 +865,32 @@ static void convertMatches( const std::vector >& knnMatches, DescriptorMatcher::~DescriptorMatcher() {} -void DescriptorMatcher::add( const std::vector& descriptors ) +void DescriptorMatcher::add( InputArrayOfArrays _descriptors ) { - trainDescCollection.insert( trainDescCollection.end(), descriptors.begin(), descriptors.end() ); + if(_descriptors.isUMatVector()) + { + std::vector descriptors; + _descriptors.getUMatVector(descriptors); + utrainDescCollection.insert( utrainDescCollection.end(), descriptors.begin(), descriptors.end() ); + } + else if(_descriptors.isUMat()) + { + std::vector descriptors = std::vector(1, _descriptors.getUMat()); + utrainDescCollection.insert( utrainDescCollection.end(), descriptors.begin(), descriptors.end() ); + } + else if(_descriptors.isMatVector()) + { + std::vector descriptors; + _descriptors.getMatVector(descriptors); + trainDescCollection.insert( trainDescCollection.end(), descriptors.begin(), descriptors.end() ); + } + else if(_descriptors.isMat()) + { + std::vector descriptors = std::vector(1, _descriptors.getMat()); + trainDescCollection.insert( trainDescCollection.end(), descriptors.begin(), descriptors.end() ); + } + else + CV_Assert( _descriptors.isUMat() || _descriptors.isUMatVector() || _descriptors.isMat() || _descriptors.isMatVector() ); } const std::vector& DescriptorMatcher::getTrainDescriptors() const @@ -202,41 +900,45 @@ const std::vector& DescriptorMatcher::getTrainDescriptors() const void DescriptorMatcher::clear() { + utrainDescCollection.clear(); trainDescCollection.clear(); } bool DescriptorMatcher::empty() const { - return trainDescCollection.empty(); + return trainDescCollection.empty() && utrainDescCollection.empty(); } void DescriptorMatcher::train() {} -void DescriptorMatcher::match( const Mat& queryDescriptors, const Mat& trainDescriptors, std::vector& matches, const Mat& mask ) const +void DescriptorMatcher::match( InputArray queryDescriptors, InputArray trainDescriptors, + std::vector& matches, InputArray mask ) const { Ptr tempMatcher = clone(true); - tempMatcher->add( std::vector(1, trainDescriptors) ); - tempMatcher->match( queryDescriptors, matches, std::vector(1, mask) ); + tempMatcher->add(trainDescriptors); + tempMatcher->match( queryDescriptors, matches, std::vector(1, mask.getMat()) ); } -void DescriptorMatcher::knnMatch( const Mat& queryDescriptors, const Mat& trainDescriptors, 
std::vector >& matches, int knn, - const Mat& mask, bool compactResult ) const +void DescriptorMatcher::knnMatch( InputArray queryDescriptors, InputArray trainDescriptors, + std::vector >& matches, int knn, + InputArray mask, bool compactResult ) const { Ptr tempMatcher = clone(true); - tempMatcher->add( std::vector(1, trainDescriptors) ); - tempMatcher->knnMatch( queryDescriptors, matches, knn, std::vector(1, mask), compactResult ); + tempMatcher->add(trainDescriptors); + tempMatcher->knnMatch( queryDescriptors, matches, knn, std::vector(1, mask.getMat()), compactResult ); } -void DescriptorMatcher::radiusMatch( const Mat& queryDescriptors, const Mat& trainDescriptors, std::vector >& matches, float maxDistance, - const Mat& mask, bool compactResult ) const +void DescriptorMatcher::radiusMatch( InputArray queryDescriptors, InputArray trainDescriptors, + std::vector >& matches, float maxDistance, InputArray mask, + bool compactResult ) const { Ptr tempMatcher = clone(true); - tempMatcher->add( std::vector(1, trainDescriptors) ); - tempMatcher->radiusMatch( queryDescriptors, matches, maxDistance, std::vector(1, mask), compactResult ); + tempMatcher->add(trainDescriptors); + tempMatcher->radiusMatch( queryDescriptors, matches, maxDistance, std::vector(1, mask.getMat()), compactResult ); } -void DescriptorMatcher::match( const Mat& queryDescriptors, std::vector& matches, const std::vector& masks ) +void DescriptorMatcher::match( InputArray queryDescriptors, std::vector& matches, const std::vector& masks ) { std::vector > knnMatches; knnMatch( queryDescriptors, knnMatches, 1, masks, true /*compactResult*/ ); @@ -248,36 +950,36 @@ void DescriptorMatcher::checkMasks( const std::vector& masks, int queryDesc if( isMaskSupported() && !masks.empty() ) { // Check masks - size_t imageCount = trainDescCollection.size(); + size_t imageCount = std::max(trainDescCollection.size(), utrainDescCollection.size() ); CV_Assert( masks.size() == imageCount ); for( size_t i = 0; i < imageCount; i++ ) { - if( !masks[i].empty() && !trainDescCollection[i].empty() ) + if( !masks[i].empty() && (!trainDescCollection[i].empty() || !utrainDescCollection[i].empty() ) ) { + int rows = trainDescCollection[i].empty() ? 
utrainDescCollection[i].rows : trainDescCollection[i].rows; CV_Assert( masks[i].rows == queryDescriptorsCount && - masks[i].cols == trainDescCollection[i].rows && - masks[i].type() == CV_8UC1 ); + (masks[i].cols == rows || masks[i].cols == rows) && + masks[i].type() == CV_8UC1 ); } } } } -void DescriptorMatcher::knnMatch( const Mat& queryDescriptors, std::vector >& matches, int knn, +void DescriptorMatcher::knnMatch( InputArray queryDescriptors, std::vector >& matches, int knn, const std::vector& masks, bool compactResult ) { - matches.clear(); if( empty() || queryDescriptors.empty() ) return; CV_Assert( knn > 0 ); - checkMasks( masks, queryDescriptors.rows ); + checkMasks( masks, queryDescriptors.size().height ); train(); knnMatchImpl( queryDescriptors, matches, knn, masks, compactResult ); } -void DescriptorMatcher::radiusMatch( const Mat& queryDescriptors, std::vector >& matches, float maxDistance, +void DescriptorMatcher::radiusMatch( InputArray queryDescriptors, std::vector >& matches, float maxDistance, const std::vector& masks, bool compactResult ) { matches.clear(); @@ -286,7 +988,7 @@ void DescriptorMatcher::radiusMatch( const Mat& queryDescriptors, std::vector std::numeric_limits::epsilon() ); - checkMasks( masks, queryDescriptors.rows ); + checkMasks( masks, queryDescriptors.size().height ); train(); radiusMatchImpl( queryDescriptors, matches, maxDistance, masks, compactResult ); @@ -316,7 +1018,7 @@ bool DescriptorMatcher::isMaskedOut( const std::vector& masks, int queryIdx } -/////////////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////// BruteForceMatcher ///////////////////////////////////////////////// BFMatcher::BFMatcher( int _normType, bool _crossCheck ) { @@ -336,19 +1038,97 @@ Ptr BFMatcher::clone( bool emptyTrainData ) const return matcher; } +bool BFMatcher::ocl_match(InputArray query, InputArray _train, std::vector< std::vector > &matches, int dstType) +{ + UMat trainIdx, distance; + if(!ocl_matchSingle(query, _train, trainIdx, distance, dstType)) return false; + if(!ocl_matchDownload(trainIdx, distance, matches)) return false; + return true; +} + +bool BFMatcher::ocl_knnMatch(InputArray query, InputArray _train, std::vector< std::vector > &matches, int k, int dstType, bool compactResult) +{ + UMat trainIdx, distance, allDist; + if (!ocl_knnMatchSingle(query, _train, trainIdx, distance, allDist, k, dstType)) return false; + if( !ocl_knnMatchDownload(trainIdx, distance, matches, compactResult) ) return false; + return true; +} -void BFMatcher::knnMatchImpl( const Mat& queryDescriptors, std::vector >& matches, int knn, - const std::vector& masks, bool compactResult ) +void BFMatcher::knnMatchImpl( InputArray _queryDescriptors, std::vector >& matches, int knn, + InputArrayOfArrays _masks, bool compactResult ) { + int trainDescType = trainDescCollection.empty() ? 
utrainDescCollection[0].type() : trainDescCollection[0].type(); + CV_Assert( _queryDescriptors.type() == trainDescType ); + const int IMGIDX_SHIFT = 18; const int IMGIDX_ONE = (1 << IMGIDX_SHIFT); - if( queryDescriptors.empty() || trainDescCollection.empty() ) + if( _queryDescriptors.empty() || (trainDescCollection.empty() && utrainDescCollection.empty())) { matches.clear(); return; } - CV_Assert( queryDescriptors.type() == trainDescCollection[0].type() ); + + std::vector masks; + _masks.getMatVector(masks); + + if(!trainDescCollection.empty() && !utrainDescCollection.empty()) + { + for(int i = 0; i < (int)utrainDescCollection.size(); i++) + { + Mat tempMat; + utrainDescCollection[i].copyTo(tempMat); + trainDescCollection.push_back(tempMat); + } + utrainDescCollection.clear(); + } + + int trainDescVectorSize = trainDescCollection.empty() ? (int)utrainDescCollection.size() : (int)trainDescCollection.size(); + Size trainDescSize = trainDescCollection.empty() ? utrainDescCollection[0].size() : trainDescCollection[0].size(); + + if ( ocl::useOpenCL() && _queryDescriptors.isUMat() && _queryDescriptors.dims()<=2 && trainDescVectorSize == 1 && + _queryDescriptors.type() == CV_32FC1 && + trainDescSize.width == _queryDescriptors.size().width && masks.size() == 1 && masks[0].total() == 0 ) + { + if(knn == 1) + { + if(trainDescCollection.empty()) + { + if(ocl_match(_queryDescriptors, utrainDescCollection[0], matches, normType)) + return; + } + else + { + if(ocl_match(_queryDescriptors, trainDescCollection[0], matches, normType)) + return; + } + } + else + { + if(trainDescCollection.empty()) + { + if(ocl_knnMatch(_queryDescriptors, utrainDescCollection[0], matches, knn, normType, compactResult) ) + return; + } + else + { + if(ocl_knnMatch(_queryDescriptors, trainDescCollection[0], matches, knn, normType, compactResult) ) + return; + } + } + } + + Mat queryDescriptors = _queryDescriptors.getMat(); + if(trainDescCollection.empty() && !utrainDescCollection.empty()) + { + for(int i = 0; i < (int)utrainDescCollection.size(); i++) + { + Mat tempMat; + utrainDescCollection[i].copyTo(tempMat); + trainDescCollection.push_back(tempMat); + } + utrainDescCollection.clear(); + } matches.reserve(queryDescriptors.rows); @@ -397,16 +1177,71 @@ void BFMatcher::knnMatchImpl( const Mat& queryDescriptors, std::vector > &matches, + float maxDistance, int dstType, bool compactResult) +{ + UMat trainIdx, distance, nMatches; + if(!ocl_radiusMatchSingle(query, _train, trainIdx, distance, nMatches, maxDistance, dstType)) return false; + if(!ocl_radiusMatchDownload(trainIdx, distance, nMatches, matches, compactResult)) return false; + return true; +} -void BFMatcher::radiusMatchImpl( const Mat& queryDescriptors, std::vector >& matches, - float maxDistance, const std::vector& masks, bool compactResult ) +void BFMatcher::radiusMatchImpl( InputArray _queryDescriptors, std::vector >& matches, + float maxDistance, InputArrayOfArrays _masks, bool compactResult ) { - if( queryDescriptors.empty() || trainDescCollection.empty() ) + int trainDescType = trainDescCollection.empty() ? 
utrainDescCollection[0].type() : trainDescCollection[0].type(); + CV_Assert( _queryDescriptors.type() == trainDescType ); + + if( _queryDescriptors.empty() || (trainDescCollection.empty() && utrainDescCollection.empty())) { matches.clear(); return; } - CV_Assert( queryDescriptors.type() == trainDescCollection[0].type() ); + + std::vector masks; + _masks.getMatVector(masks); + + if(!trainDescCollection.empty() && !utrainDescCollection.empty()) + { + for(int i = 0; i < (int)utrainDescCollection.size(); i++) + { + Mat tempMat; + utrainDescCollection[i].copyTo(tempMat); + trainDescCollection.push_back(tempMat); + } + utrainDescCollection.clear(); + } + + int trainDescVectorSize = trainDescCollection.empty() ? (int)utrainDescCollection.size() : (int)trainDescCollection.size(); + Size trainDescSize = trainDescCollection.empty() ? utrainDescCollection[0].size() : trainDescCollection[0].size(); + + if ( ocl::useOpenCL() && _queryDescriptors.isUMat() && _queryDescriptors.dims()<=2 && trainDescVectorSize == 1 && + _queryDescriptors.type() == CV_32FC1 && + trainDescSize.width == _queryDescriptors.size().width && masks.size() == 1 && masks[0].total() == 0 ) + { + if(trainDescCollection.empty()) + { + if(ocl_radiusMatch(_queryDescriptors, utrainDescCollection[0], matches, maxDistance, normType, compactResult) ) + return; + } + else + { + if(ocl_radiusMatch(_queryDescriptors, trainDescCollection[0], matches, maxDistance, normType, compactResult) ) + return; + } + } + + Mat queryDescriptors = _queryDescriptors.getMat(); + if(trainDescCollection.empty() && !utrainDescCollection.empty()) + { + for(int i = 0; i < (int)utrainDescCollection.size(); i++) + { + Mat tempMat; + utrainDescCollection[i].copyTo(tempMat); + trainDescCollection.push_back(tempMat); + } + utrainDescCollection.clear(); + } matches.resize(queryDescriptors.rows); Mat dist, distf; @@ -763,9 +1598,10 @@ void FlannBasedMatcher::convertToDMatches( const DescriptorCollection& collectio } } -void FlannBasedMatcher::knnMatchImpl( const Mat& queryDescriptors, std::vector >& matches, int knn, - const std::vector& /*masks*/, bool /*compactResult*/ ) +void FlannBasedMatcher::knnMatchImpl( InputArray _queryDescriptors, std::vector >& matches, int knn, + InputArrayOfArrays /*masks*/, bool /*compactResult*/ ) { + Mat queryDescriptors = _queryDescriptors.getMat(); Mat indices( queryDescriptors.rows, knn, CV_32SC1 ); Mat dists( queryDescriptors.rows, knn, CV_32FC1); flannIndex->knnSearch( queryDescriptors, indices, dists, knn, *searchParams ); @@ -773,9 +1609,10 @@ void FlannBasedMatcher::knnMatchImpl( const Mat& queryDescriptors, std::vector >& matches, float maxDistance, - const std::vector& /*masks*/, bool /*compactResult*/ ) +void FlannBasedMatcher::radiusMatchImpl( InputArray _queryDescriptors, std::vector >& matches, float maxDistance, + InputArrayOfArrays /*masks*/, bool /*compactResult*/ ) { + Mat queryDescriptors = _queryDescriptors.getMat(); const int count = mergedDescriptors.size(); // TODO do count as param? 
Mat indices( queryDescriptors.rows, count, CV_32SC1, Scalar::all(-1) ); Mat dists( queryDescriptors.rows, count, CV_32FC1, Scalar::all(-1) ); diff --git a/modules/features2d/src/opencl/brute_force_match.cl b/modules/features2d/src/opencl/brute_force_match.cl new file mode 100644 index 0000000..e2757e1 --- /dev/null +++ b/modules/features2d/src/opencl/brute_force_match.cl @@ -0,0 +1,789 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Nathan, liujun@multicorewareinc.com +// Peng Xiao, pengxiao@outlook.com +// Baichuan Su, baichuan@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
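Illustrative sketch, not part of the patch: the DIST_TYPE dispatch defined below assumes the host side passes BFMatcher's normType straight through as -D DIST_TYPE=..., so the kernel's branch values must line up with OpenCV's norm constants (NORM_L1 = 2 for L1Dist, NORM_L2 = 4 for L2Dist, NORM_HAMMING = 6 for Hamming). A minimal host-side check of that assumption:

#include <opencv2/core.hpp>
#include <cassert>

int main()
{
    // DIST_TYPE == 2 selects the L1Dist branch, 4 the L2Dist branch, 6 the Hamming branch.
    assert(cv::NORM_L1 == 2);
    assert(cv::NORM_L2 == 4);
    assert(cv::NORM_HAMMING == 6);
    return 0;
}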
+// +//M*/ + +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable +#define MAX_FLOAT 3.40282e+038f + +#ifndef T +#define T float +#endif + +#ifndef BLOCK_SIZE +#define BLOCK_SIZE 16 +#endif +#ifndef MAX_DESC_LEN +#define MAX_DESC_LEN 64 +#endif + +#ifndef DIST_TYPE +#define DIST_TYPE 2 +#endif + +// dirty fix for non-template support +#if (DIST_TYPE == 2) // L1Dist +# ifdef T_FLOAT +# define DIST(x, y) fabs((x) - (y)) + typedef float value_type; + typedef float result_type; +# else +# define DIST(x, y) abs((x) - (y)) + typedef int value_type; + typedef int result_type; +# endif +#define DIST_RES(x) (x) +#elif (DIST_TYPE == 4) // L2Dist +#define DIST(x, y) (((x) - (y)) * ((x) - (y))) +typedef float value_type; +typedef float result_type; +#define DIST_RES(x) sqrt(x) +#elif (DIST_TYPE == 6) // Hamming +//http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel +inline int bit1Count(int v) +{ + v = v - ((v >> 1) & 0x55555555); // reuse input as temporary + v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // temp + return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; // count +} +#define DIST(x, y) bit1Count( (x) ^ (y) ) +typedef int value_type; +typedef int result_type; +#define DIST_RES(x) (x) +#endif + +inline result_type reduce_block( + __local value_type *s_query, + __local value_type *s_train, + int lidx, + int lidy + ) +{ + result_type result = 0; + #pragma unroll + for (int j = 0 ; j < BLOCK_SIZE ; j++) + { + result += DIST( + s_query[lidy * BLOCK_SIZE + j], + s_train[j * BLOCK_SIZE + lidx]); + } + return DIST_RES(result); +} + +inline result_type reduce_block_match( + __local value_type *s_query, + __local value_type *s_train, + int lidx, + int lidy + ) +{ + result_type result = 0; + #pragma unroll + for (int j = 0 ; j < BLOCK_SIZE ; j++) + { + result += DIST( + s_query[lidy * BLOCK_SIZE + j], + s_train[j * BLOCK_SIZE + lidx]); + } + return (result); +} + +inline result_type reduce_multi_block( + __local value_type *s_query, + __local value_type *s_train, + int block_index, + int lidx, + int lidy + ) +{ + result_type result = 0; + #pragma unroll + for (int j = 0 ; j < BLOCK_SIZE ; j++) + { + result += DIST( + s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], + s_train[j * BLOCK_SIZE + lidx]); + } + return result; +} + +/* 2dim launch, global size: dim0 is (query rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, dim1 is BLOCK_SIZE +local size: dim0 is BLOCK_SIZE, dim1 is BLOCK_SIZE. +*/ +__kernel void BruteForceMatch_UnrollMatch( + __global T *query, + __global T *train, + //__global float *mask, + __global int *bestTrainIdx, + __global float *bestDistance, + __local float *sharebuffer, + int query_rows, + int query_cols, + int train_rows, + int train_cols, + int step +) +{ + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int groupidx = get_group_id(0); + + __local value_type *s_query = (__local value_type *)sharebuffer; + __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN; + + int queryIdx = groupidx * BLOCK_SIZE + lidy; + // load the query into local memory. + #pragma unroll + for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++) + { + int loadx = lidx + i * BLOCK_SIZE; + s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + } + + float myBestDistance = MAX_FLOAT; + int myBestTrainIdx = -1; + + // loopUnrolledCached to find the best trainIdx and best distance. 
+ for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++) + { + result_type result = 0; + #pragma unroll + for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++) + { + //load a BLOCK_SIZE * BLOCK_SIZE block into local train. + const int loadx = lidx + i * BLOCK_SIZE; + s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + + //synchronize to make sure each elem for reduceIteration in share memory is written already. + barrier(CLK_LOCAL_MEM_FENCE); + + result += reduce_multi_block(s_query, s_train, i, lidx, lidy); + + barrier(CLK_LOCAL_MEM_FENCE); + } + + result = DIST_RES(result); + + int trainIdx = t * BLOCK_SIZE + lidx; + + if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/) + { + myBestDistance = result; + myBestTrainIdx = trainIdx; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + __local float *s_distance = (__local float*)(sharebuffer); + __local int* s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE); + + //find BestMatch + s_distance += lidy * BLOCK_SIZE; + s_trainIdx += lidy * BLOCK_SIZE; + s_distance[lidx] = myBestDistance; + s_trainIdx[lidx] = myBestTrainIdx; + + barrier(CLK_LOCAL_MEM_FENCE); + + //reduce -- now all reduce implement in each threads. + #pragma unroll + for (int k = 0 ; k < BLOCK_SIZE; k++) + { + if (myBestDistance > s_distance[k]) + { + myBestDistance = s_distance[k]; + myBestTrainIdx = s_trainIdx[k]; + } + } + + if (queryIdx < query_rows && lidx == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestDistance[queryIdx] = myBestDistance; + } +} + +__kernel void BruteForceMatch_Match( + __global T *query, + __global T *train, + //__global float *mask, + __global int *bestTrainIdx, + __global float *bestDistance, + __local float *sharebuffer, + int query_rows, + int query_cols, + int train_rows, + int train_cols, + int step +) +{ + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int groupidx = get_group_id(0); + + const int queryIdx = groupidx * BLOCK_SIZE + lidy; + + float myBestDistance = MAX_FLOAT; + int myBestTrainIdx = -1; + + __local value_type *s_query = (__local value_type *)sharebuffer; + __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE; + + // loop + for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++) + { + result_type result = 0; + for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; i++) + { + const int loadx = lidx + i * BLOCK_SIZE; + //load query and train into local memory + s_query[lidy * BLOCK_SIZE + lidx] = 0; + s_train[lidx * BLOCK_SIZE + lidy] = 0; + + if (loadx < query_cols) + { + s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; + s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + result += reduce_block_match(s_query, s_train, lidx, lidy); + + barrier(CLK_LOCAL_MEM_FENCE); + } + + result = DIST_RES(result); + + const int trainIdx = t * BLOCK_SIZE + lidx; + + if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/) + { + myBestDistance = result; + myBestTrainIdx = trainIdx; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + __local float *s_distance = (__local float *)sharebuffer; + __local int *s_trainIdx = (__local int *)(sharebuffer + 
BLOCK_SIZE * BLOCK_SIZE); + + //findBestMatch + s_distance += lidy * BLOCK_SIZE; + s_trainIdx += lidy * BLOCK_SIZE; + s_distance[lidx] = myBestDistance; + s_trainIdx[lidx] = myBestTrainIdx; + + barrier(CLK_LOCAL_MEM_FENCE); + + //reduce -- now all reduce implement in each threads. + for (int k = 0 ; k < BLOCK_SIZE; k++) + { + if (myBestDistance > s_distance[k]) + { + myBestDistance = s_distance[k]; + myBestTrainIdx = s_trainIdx[k]; + } + } + + if (queryIdx < query_rows && lidx == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestDistance[queryIdx] = myBestDistance; + } +} + +//radius_unrollmatch +__kernel void BruteForceMatch_RadiusUnrollMatch( + __global T *query, + __global T *train, + float maxDistance, + //__global float *mask, + __global int *bestTrainIdx, + __global float *bestDistance, + __global int *nMatches, + __local float *sharebuffer, + int query_rows, + int query_cols, + int train_rows, + int train_cols, + int bestTrainIdx_cols, + int step, + int ostep +) +{ + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int groupidx = get_group_id(0); + const int groupidy = get_group_id(1); + + const int queryIdx = groupidy * BLOCK_SIZE + lidy; + const int trainIdx = groupidx * BLOCK_SIZE + lidx; + + __local value_type *s_query = (__local value_type *)sharebuffer; + __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE; + + result_type result = 0; + for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; ++i) + { + //load a BLOCK_SIZE * BLOCK_SIZE block into local train. + const int loadx = lidx + i * BLOCK_SIZE; + + s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + + //synchronize to make sure each elem for reduceIteration in share memory is written already. + barrier(CLK_LOCAL_MEM_FENCE); + + result += reduce_block(s_query, s_train, lidx, lidy); + + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (queryIdx < query_rows && trainIdx < train_rows && + convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/) + { + int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); + + if(ind < bestTrainIdx_cols) + { + bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; + bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; + } + } +} + +//radius_match +__kernel void BruteForceMatch_RadiusMatch( + __global T *query, + __global T *train, + float maxDistance, + //__global float *mask, + __global int *bestTrainIdx, + __global float *bestDistance, + __global int *nMatches, + __local float *sharebuffer, + int query_rows, + int query_cols, + int train_rows, + int train_cols, + int bestTrainIdx_cols, + int step, + int ostep +) +{ + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int groupidx = get_group_id(0); + const int groupidy = get_group_id(1); + + const int queryIdx = groupidy * BLOCK_SIZE + lidy; + const int trainIdx = groupidx * BLOCK_SIZE + lidx; + + __local value_type *s_query = (__local value_type *)sharebuffer; + __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE; + + result_type result = 0; + for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i) + { + //load a BLOCK_SIZE * BLOCK_SIZE block into local train. 
+        const int loadx = lidx + i * BLOCK_SIZE;
+
+        s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+        s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+
+        //synchronize to make sure every element needed by the reduction has been written to shared memory.
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        result += reduce_block(s_query, s_train, lidx, lidy);
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (queryIdx < query_rows && trainIdx < train_rows &&
+        convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
+    {
+        int ind = atom_inc(nMatches + queryIdx);
+
+        if(ind < bestTrainIdx_cols)
+        {
+            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
+            bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
+        }
+    }
+}
+
+
+__kernel void BruteForceMatch_knnUnrollMatch(
+    __global T *query,
+    __global T *train,
+    //__global float *mask,
+    __global int2 *bestTrainIdx,
+    __global float2 *bestDistance,
+    __local float *sharebuffer,
+    int query_rows,
+    int query_cols,
+    int train_rows,
+    int train_cols,
+    int step
+)
+{
+    const int lidx = get_local_id(0);
+    const int lidy = get_local_id(1);
+    const int groupidx = get_group_id(0);
+
+    const int queryIdx = groupidx * BLOCK_SIZE + lidy;
+    __local value_type *s_query = (__local value_type *)sharebuffer;
+    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
+
+    // load the query into local memory.
+    for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i++)
+    {
+        int loadx = lidx + i * BLOCK_SIZE;
+        s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+    }
+
+    float myBestDistance1 = MAX_FLOAT;
+    float myBestDistance2 = MAX_FLOAT;
+    int myBestTrainIdx1 = -1;
+    int myBestTrainIdx2 = -1;
+
+    //loopUnrolledCached
+    for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
+    {
+        result_type result = 0;
+        for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
+        {
+            //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
+            const int loadx = lidx + i * BLOCK_SIZE;
+            s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+
+            //synchronize to make sure every element needed by the reduction has been written to shared memory.
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            result += reduce_multi_block(s_query, s_train, i, lidx, lidy);
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        result = DIST_RES(result);
+
+        const int trainIdx = t * BLOCK_SIZE + lidx;
+
+        if (queryIdx < query_rows && trainIdx < train_rows)
+        {
+            if (result < myBestDistance1)
+            {
+                myBestDistance2 = myBestDistance1;
+                myBestTrainIdx2 = myBestTrainIdx1;
+                myBestDistance1 = result;
+                myBestTrainIdx1 = trainIdx;
+            }
+            else if (result < myBestDistance2)
+            {
+                myBestDistance2 = result;
+                myBestTrainIdx2 = trainIdx;
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    __local float *s_distance = (__local float *)sharebuffer;
+    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
+
+    //find the best match
+    s_distance += lidy * BLOCK_SIZE;
+    s_trainIdx += lidy * BLOCK_SIZE;
+
+    s_distance[lidx] = myBestDistance1;
+    s_trainIdx[lidx] = myBestTrainIdx1;
+
+    float bestDistance1 = MAX_FLOAT;
+    float bestDistance2 = MAX_FLOAT;
+    int bestTrainIdx1 = -1;
+    int bestTrainIdx2 = -1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (lidx == 0)
+    {
+        for (int i = 0 ; i < BLOCK_SIZE ; i++)
+        {
+            float val = s_distance[i];
+            if (val < bestDistance1)
+            {
+                bestDistance2 = bestDistance1;
+                bestTrainIdx2 = bestTrainIdx1;
+
+                bestDistance1 = val;
+                bestTrainIdx1 = s_trainIdx[i];
+            }
+            else if (val < bestDistance2)
+            {
+                bestDistance2 = val;
+                bestTrainIdx2 = s_trainIdx[i];
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    s_distance[lidx] = myBestDistance2;
+    s_trainIdx[lidx] = myBestTrainIdx2;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (lidx == 0)
+    {
+        for (int i = 0 ; i < BLOCK_SIZE ; i++)
+        {
+            float val = s_distance[i];
+
+            if (val < bestDistance2)
+            {
+                bestDistance2 = val;
+                bestTrainIdx2 = s_trainIdx[i];
+            }
+        }
+    }
+
+    myBestDistance1 = bestDistance1;
+    myBestDistance2 = bestDistance2;
+
+    myBestTrainIdx1 = bestTrainIdx1;
+    myBestTrainIdx2 = bestTrainIdx2;
+
+    if (queryIdx < query_rows && lidx == 0)
+    {
+        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
+        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
+    }
+}
+
+__kernel void BruteForceMatch_knnMatch(
+    __global T *query,
+    __global T *train,
+    //__global float *mask,
+    __global int2 *bestTrainIdx,
+    __global float2 *bestDistance,
+    __local float *sharebuffer,
+    int query_rows,
+    int query_cols,
+    int train_rows,
+    int train_cols,
+    int step
+)
+{
+    const int lidx = get_local_id(0);
+    const int lidy = get_local_id(1);
+    const int groupidx = get_group_id(0);
+
+    const int queryIdx = groupidx * BLOCK_SIZE + lidy;
+    __local value_type *s_query = (__local value_type *)sharebuffer;
+    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
+
+    float myBestDistance1 = MAX_FLOAT;
+    float myBestDistance2 = MAX_FLOAT;
+    int myBestTrainIdx1 = -1;
+    int myBestTrainIdx2 = -1;
+
+    //loop
+    for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
+    {
+        result_type result = 0.0f;
+        for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; i++)
+        {
+            const int loadx = lidx + i * BLOCK_SIZE;
+            //load query and train into local memory
+            s_query[lidy * BLOCK_SIZE + lidx] = 0;
+            s_train[lidx * BLOCK_SIZE + lidy] = 0;
+
+            if (loadx < query_cols)
+            {
+                s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
+                s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+            }
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            result += reduce_block_match(s_query, s_train, lidx, lidy);
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        result = DIST_RES(result);
+
+        const int trainIdx = t * BLOCK_SIZE + lidx;
+
+        if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
+        {
+            if (result < myBestDistance1)
+            {
+                myBestDistance2 = myBestDistance1;
+                myBestTrainIdx2 = myBestTrainIdx1;
+                myBestDistance1 = result;
+                myBestTrainIdx1 = trainIdx;
+            }
+            else if (result < myBestDistance2)
+            {
+                myBestDistance2 = result;
+                myBestTrainIdx2 = trainIdx;
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    __local float *s_distance = (__local float *)sharebuffer;
+    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
+
+    //find the best match
+    s_distance += lidy * BLOCK_SIZE;
+    s_trainIdx += lidy * BLOCK_SIZE;
+
+    s_distance[lidx] = myBestDistance1;
+    s_trainIdx[lidx] = myBestTrainIdx1;
+
+    float bestDistance1 = MAX_FLOAT;
+    float bestDistance2 = MAX_FLOAT;
+    int bestTrainIdx1 = -1;
+    int bestTrainIdx2 = -1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (lidx == 0)
+    {
+        for (int i = 0 ; i < BLOCK_SIZE ; i++)
+        {
+            float val = s_distance[i];
+            if (val < bestDistance1)
+            {
+                bestDistance2 = bestDistance1;
+                bestTrainIdx2 = bestTrainIdx1;
+
+                bestDistance1 = val;
+                bestTrainIdx1 = s_trainIdx[i];
+            }
+            else if (val < bestDistance2)
+            {
+                bestDistance2 = val;
+                bestTrainIdx2 = s_trainIdx[i];
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    s_distance[lidx] = myBestDistance2;
+    s_trainIdx[lidx] = myBestTrainIdx2;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (lidx == 0)
+    {
+        for (int i = 0 ; i < BLOCK_SIZE ; i++)
+        {
+            float val = s_distance[i];
+
+            if (val < bestDistance2)
+            {
+                bestDistance2 = val;
+                bestTrainIdx2 = s_trainIdx[i];
+            }
+        }
+    }
+
+    myBestDistance1 = bestDistance1;
+    myBestDistance2 = bestDistance2;
+
+    myBestTrainIdx1 = bestTrainIdx1;
+    myBestTrainIdx2 = bestTrainIdx2;
+
+    if (queryIdx < query_rows && lidx == 0)
+    {
+        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
+        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
+    }
+}
+
+kernel void BruteForceMatch_calcDistanceUnrolled(
+    __global T *query,
+    __global T *train,
+    //__global float *mask,
+    __global float *allDist,
+    __local float *sharebuffer,
+    int query_rows,
+    int query_cols,
+    int train_rows,
+    int train_cols,
+    int step)
+{
+    /* Todo */
+}
+
+kernel void BruteForceMatch_calcDistance(
+    __global T *query,
+    __global T *train,
+    //__global float *mask,
+    __global float *allDist,
+    __local float *sharebuffer,
+    int query_rows,
+    int query_cols,
+    int train_rows,
+    int train_cols,
+    int step)
+{
+    /* Todo */
+}
+
+kernel void BruteForceMatch_findBestMatch(
+    __global float *allDist,
+    __global int *bestTrainIdx,
+    __global float *bestDistance,
+    int k
+)
+{
+    /* Todo */
+}
diff --git a/modules/features2d/src/precomp.hpp b/modules/features2d/src/precomp.hpp
index 3c9073e..2f77d92 100644
--- a/modules/features2d/src/precomp.hpp
+++ b/modules/features2d/src/precomp.hpp
@@ -48,6 +48,7 @@
 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/private.hpp"
+#include "opencv2/core/ocl.hpp"
 #include 
diff --git a/modules/features2d/test/ocl/test_brute_force_matcher.cpp b/modules/features2d/test/ocl/test_brute_force_matcher.cpp
new file mode 100644
index 0000000..0e1df78
--- /dev/null
+++ b/modules/features2d/test/ocl/test_brute_force_matcher.cpp
@@ -0,0 +1,213 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Niko Li, newlife20080214@gmail.com
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Zero Lin, Zero.Lin@amd.com
+//    Zhang Ying, zhangying913@gmail.com
+//    Yao Wang, bitwangyaoyao@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "cvconfig.h"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+PARAM_TEST_CASE(BruteForceMatcher, int, int)
+{
+    int distType;
+    int dim;
+
+    int queryDescCount;
+    int countFactor;
+
+    Mat query, train;
+    UMat uquery, utrain;
+
+    virtual void SetUp()
+    {
+        distType = GET_PARAM(0);
+        dim = GET_PARAM(1);
+
+        queryDescCount = 300; // must be an even number, because in some cases we split the train data in two
+        countFactor = 4; // do not change it
+
+        cv::Mat queryBuf, trainBuf;
+
+        // Generate query descriptors randomly.
+        // Descriptor vector elements are integer values.
+        queryBuf.create(queryDescCount, dim, CV_32SC1);
+        rng.fill(queryBuf, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(3));
+        queryBuf.convertTo(queryBuf, CV_32FC1);
+
+        // Generate train descriptors as follows:
+        // copy each query descriptor to the train set countFactor times
+        // and perturb one element of each copied descriptor, in
+        // ascending order. General boundaries of the perturbation
+        // are (0.f, 1.f).
+        trainBuf.create(queryDescCount * countFactor, dim, CV_32FC1);
+        float step = 1.f / countFactor;
+        for (int qIdx = 0; qIdx < queryDescCount; qIdx++)
+        {
+            cv::Mat queryDescriptor = queryBuf.row(qIdx);
+            for (int c = 0; c < countFactor; c++)
+            {
+                int tIdx = qIdx * countFactor + c;
+                cv::Mat trainDescriptor = trainBuf.row(tIdx);
+                queryDescriptor.copyTo(trainDescriptor);
+                int elem = rng(dim);
+                float diff = rng.uniform(step * c, step * (c + 1));
+                trainDescriptor.at<float>(0, elem) += diff;
+            }
+        }
+
+        queryBuf.convertTo(query, CV_32F);
+        trainBuf.convertTo(train, CV_32F);
+        query.copyTo(uquery);
+        train.copyTo(utrain);
+    }
+};
+
+#ifdef ANDROID
+OCL_TEST_P(BruteForceMatcher, DISABLED_Match_Single)
+#else
+OCL_TEST_P(BruteForceMatcher, Match_Single)
+#endif
+{
+    BFMatcher matcher(distType);
+
+    std::vector<cv::DMatch> matches;
+    matcher.match(uquery, utrain, matches);
+
+    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
+
+    int badCount = 0;
+    for (size_t i = 0; i < matches.size(); i++)
+    {
+        cv::DMatch match = matches[i];
+        if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0))
+            badCount++;
+    }
+
+    ASSERT_EQ(0, badCount);
+}
+
+#ifdef ANDROID
+OCL_TEST_P(BruteForceMatcher, DISABLED_KnnMatch_2_Single)
+#else
+OCL_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
+#endif
+{
+    const int knn = 2;
+
+    BFMatcher matcher(distType);
+
+    std::vector< std::vector<cv::DMatch> > matches;
+    matcher.knnMatch(uquery, utrain, matches, knn);
+
+    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
+
+    int badCount = 0;
+    for (size_t i = 0; i < matches.size(); i++)
+    {
+        if ((int)matches[i].size() != knn)
+            badCount++;
+        else
+        {
+            int localBadCount = 0;
+            for (int k = 0; k < knn; k++)
+            {
+                cv::DMatch match = matches[i][k];
+                if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k) || (match.imgIdx != 0))
+                    localBadCount++;
+            }
+            badCount += localBadCount > 0 ? 1 : 0;
+        }
+    }
+
+    ASSERT_EQ(0, badCount);
+}
+
+#ifdef ANDROID
+OCL_TEST_P(BruteForceMatcher, DISABLED_RadiusMatch_Single)
+#else
+OCL_TEST_P(BruteForceMatcher, RadiusMatch_Single)
+#endif
+{
+    float radius = 1.f / countFactor;
+
+    BFMatcher matcher(distType);
+
+    std::vector< std::vector<cv::DMatch> > matches;
+    matcher.radiusMatch(uquery, utrain, matches, radius);
+
+    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
+
+    int badCount = 0;
+    for (size_t i = 0; i < matches.size(); i++)
+    {
+        if ((int)matches[i].size() != 1)
+        {
+            badCount++;
+        }
+        else
+        {
+            cv::DMatch match = matches[i][0];
+            if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0))
+                badCount++;
+        }
+    }
+
+    ASSERT_EQ(0, badCount);
+}
+
+OCL_INSTANTIATE_TEST_CASE_P(Matcher, BruteForceMatcher, Combine( Values((int)NORM_L1, (int)NORM_L2),
+                                                                 Values(57, 64, 83, 128, 179, 256, 304) ) );
+
+}//ocl
+}//cvtest
+
+#endif //HAVE_OPENCL
-- 
2.7.4
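
Reviewer note (not part of the patch): the tests above drive the new OpenCL code path through the UMat overloads of BFMatcher. Below is a minimal host-side usage sketch under the assumption that OpenCV is built with OpenCL support; the descriptor sizes, the 0.25 radius and the variable names (queryCpu, trainCpu, best, knn, inRadius) are illustrative only, and the matcher falls back to the CPU implementation when no OpenCL device is available.

#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/features2d.hpp>

int main()
{
    // Random float descriptors stand in for real detector/extractor output.
    cv::Mat queryCpu(300, 64, CV_32F), trainCpu(1200, 64, CV_32F);
    cv::randu(queryCpu, cv::Scalar::all(0), cv::Scalar::all(1));
    cv::randu(trainCpu, cv::Scalar::all(0), cv::Scalar::all(1));

    // Copying the descriptors into UMat is what allows the matcher to
    // dispatch to the brute_force_match.cl kernels added by this patch.
    cv::UMat query, train;
    queryCpu.copyTo(query);
    trainCpu.copyTo(train);

    cv::BFMatcher matcher(cv::NORM_L2);

    std::vector<cv::DMatch> best;                    // one best match per query row
    matcher.match(query, train, best);

    std::vector< std::vector<cv::DMatch> > knn;      // two best matches per query row
    matcher.knnMatch(query, train, knn, 2);

    std::vector< std::vector<cv::DMatch> > inRadius; // all matches with distance below 0.25
    matcher.radiusMatch(query, train, inRadius, 0.25f);

    return 0;
}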