From 56d943388d2bd1a273b16b4958ba2f6684fc33c1 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 5 Nov 2013 20:03:49 +0400 Subject: [PATCH] speeded up ocl::distanceToCenters --- modules/ocl/doc/ml_machine_learning.rst | 12 +--- modules/ocl/include/opencv2/ocl/ocl.hpp | 2 +- modules/ocl/perf/perf_imgproc.cpp | 41 +++++++------ modules/ocl/src/kmeans.cpp | 94 ++++++++++++++--------------- modules/ocl/src/opencl/kmeans_kernel.cl | 101 +++++++++++++------------------- modules/ocl/test/test_kmeans.cpp | 67 +++++++++------------ 6 files changed, 138 insertions(+), 179 deletions(-) diff --git a/modules/ocl/doc/ml_machine_learning.rst b/modules/ocl/doc/ml_machine_learning.rst index eb72cbe..ad0e303 100644 --- a/modules/ocl/doc/ml_machine_learning.rst +++ b/modules/ocl/doc/ml_machine_learning.rst @@ -91,11 +91,7 @@ ocl::distanceToCenters ---------------------- For each samples in ``source``, find its closest neighour in ``centers``. -.. ocv:function:: void ocl::distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat ¢ers, int distType = NORM_L2SQR, const oclMat &indices = oclMat()) - - :param dists: The output distances calculated from each sample to the best matched center. - - :param labels: The output index of best matched center for each row of sample. +.. ocv:function:: void ocl::distanceToCenters(const oclMat &src, const oclMat ¢ers, Mat &dists, Mat &labels, int distType = NORM_L2SQR) :param src: Floating-point matrix of input samples. One row per sample. @@ -103,10 +99,8 @@ For each samples in ``source``, find its closest neighour in ``centers``. :param distType: Distance metric to calculate distances. Supports ``NORM_L1`` and ``NORM_L2SQR``. - :param indices: Optional source indices. If not empty: + :param dists: The output distances calculated from each sample to the best matched center. - * only the indexed source samples will be processed - * outputs, i.e., ``dists`` and ``labels``, have the same size of indices - * outputs are in the same order of indices instead of the order of src + :param labels: The output index of best matched center for each row of sample. The method is a utility function which maybe used for multiple clustering algorithms such as K-means. diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index 4070ed1..3b4219a 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -878,7 +878,7 @@ namespace cv // supports NORM_L1 and NORM_L2 distType // if indices is provided, only the indexed rows will be calculated and their results are in the same // order of indices - CV_EXPORTS void distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat ¢ers, int distType = NORM_L2SQR, const oclMat &indices = oclMat()); + CV_EXPORTS void distanceToCenters(const oclMat &src, const oclMat ¢ers, Mat &dists, Mat &labels, int distType = NORM_L2SQR); //!Does k-means procedure on GPU // supports CV_32FC1/CV_32FC2/CV_32FC4 data type diff --git a/modules/ocl/perf/perf_imgproc.cpp b/modules/ocl/perf/perf_imgproc.cpp index c57950f..8c12fb4 100644 --- a/modules/ocl/perf/perf_imgproc.cpp +++ b/modules/ocl/perf/perf_imgproc.cpp @@ -872,58 +872,57 @@ PERF_TEST_P(columnSumFixture, columnSum, OCL_TYPICAL_MAT_SIZES) //////////////////////////////distanceToCenters//////////////////////////////////////////////// -CV_ENUM(DistType, NORM_L1, NORM_L2SQR); +CV_ENUM(DistType, NORM_L1, NORM_L2SQR) + typedef tuple distanceToCentersParameters; typedef TestBaseWithParam distanceToCentersFixture; static void distanceToCentersPerfTest(Mat& src, Mat& centers, Mat& dists, Mat& labels, int distType) { Mat batch_dists; - cv::batchDistance(src,centers,batch_dists, CV_32FC1, noArray(), distType); + cv::batchDistance(src, centers, batch_dists, CV_32FC1, noArray(), distType); + std::vector dists_v; std::vector labels_v; - for(int i = 0; i(mVal)); labels_v.push_back(mLoc.x); } - Mat temp_dists(dists_v); - Mat temp_labels(labels_v); - temp_dists.reshape(1,1).copyTo(dists); - temp_labels.reshape(1,1).copyTo(labels); + + Mat(dists_v).copyTo(dists); + Mat(labels_v).copyTo(labels); } PERF_TEST_P(distanceToCentersFixture, distanceToCenters, ::testing::Combine(::testing::Values(cv::Size(256,256), cv::Size(512,512)), DistType::all()) ) { Size size = get<0>(GetParam()); int distType = get<1>(GetParam()); - Mat src(size, CV_32FC1); - Mat centers(size, CV_32FC1); - Mat dists(cv::Size(src.rows,1), CV_32FC1); - Mat labels(cv::Size(src.rows,1), CV_32SC1); + + Mat src(size, CV_32FC1), centers(size, CV_32FC1); + Mat dists(src.rows, 1, CV_32FC1), labels(src.rows, 1, CV_32SC1); + declare.in(src, centers, WARMUP_RNG).out(dists, labels); + if (RUN_OCL_IMPL) { - ocl::oclMat ocl_src(src); - ocl::oclMat ocl_centers(centers); - ocl::oclMat ocl_dists(dists); - ocl::oclMat ocl_labels(labels); + ocl::oclMat ocl_src(src), ocl_centers(centers); - OCL_TEST_CYCLE() ocl::distanceToCenters(ocl_dists,ocl_labels,ocl_src, ocl_centers, distType); - - ocl_dists.download(dists); - ocl_labels.download(labels); + OCL_TEST_CYCLE() ocl::distanceToCenters(ocl_src, ocl_centers, dists, labels, distType); SANITY_CHECK(dists, 1e-6, ERROR_RELATIVE); SANITY_CHECK(labels); } else if (RUN_PLAIN_IMPL) { - TEST_CYCLE() distanceToCentersPerfTest(src,centers,dists,labels,distType); + TEST_CYCLE() distanceToCentersPerfTest(src, centers, dists, labels, distType); + SANITY_CHECK(dists, 1e-6, ERROR_RELATIVE); SANITY_CHECK(labels); } diff --git a/modules/ocl/src/kmeans.cpp b/modules/ocl/src/kmeans.cpp index 58a68a7..31fb250 100644 --- a/modules/ocl/src/kmeans.cpp +++ b/modules/ocl/src/kmeans.cpp @@ -160,63 +160,66 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers, } } -void cv::ocl::distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat ¢ers, int distType, const oclMat &indices) +void cv::ocl::distanceToCenters(const oclMat &src, const oclMat ¢ers, Mat &dists, Mat &labels, int distType) { - CV_Assert(src.cols*src.oclchannels() == centers.cols*centers.oclchannels()); + CV_Assert(src.cols * src.channels() == centers.cols * centers.channels()); CV_Assert(src.depth() == CV_32F && centers.depth() == CV_32F); - bool is_label_row_major = false; - ensureSizeIsEnough(1, src.rows, CV_32FC1, dists); - if(labels.empty() || (!labels.empty() && labels.rows == src.rows && labels.cols == 1)) - { - ensureSizeIsEnough(src.rows, 1, CV_32SC1, labels); - is_label_row_major = true; - } CV_Assert(distType == NORM_L1 || distType == NORM_L2SQR); - std::stringstream build_opt_ss; - build_opt_ss - << (distType == NORM_L1 ? "-D L1_DIST" : "-D L2SQR_DIST") - << (indices.empty() ? "" : " -D USE_INDEX"); - - String build_opt = build_opt_ss.str(); + dists.create(src.rows, 1, CV_32FC1); + labels.create(src.rows, 1, CV_32SC1); - const int src_step = (int)(src.oclchannels() * src.step / src.elemSize()); - const int centers_step = (int)(centers.oclchannels() * centers.step / centers.elemSize()); - - const int colsNumb = centers.cols*centers.oclchannels(); - - const int label_step = is_label_row_major ? (int)(labels.step / labels.elemSize()) : 1; - String kernelname = "distanceToCenters"; - - const int number_of_input = indices.empty() ? src.rows : indices.size().area(); + std::stringstream build_opt_ss; + build_opt_ss << (distType == NORM_L1 ? "-D L1_DIST" : "-D L2SQR_DIST"); - const int src_offset = (int)src.offset/src.elemSize(); - const int centers_offset = (int)centers.offset/centers.elemSize(); + int src_step = src.step / src.elemSize1(); + int centers_step = centers.step / centers.elemSize1(); + int feature_width = centers.cols * centers.oclchannels(); + int src_offset = src.offset / src.elemSize1(); + int centers_offset = centers.offset / centers.elemSize1(); - size_t globalThreads[3] = {number_of_input, 1, 1}; + int all_dist_count = src.rows * centers.rows; + oclMat all_dist(1, all_dist_count, CV_32FC1); vector > args; args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data)); args.push_back(make_pair(sizeof(cl_mem), (void *)¢ers.data)); - if(!indices.empty()) - { - args.push_back(make_pair(sizeof(cl_mem), (void *)&indices.data)); - } - args.push_back(make_pair(sizeof(cl_mem), (void *)&labels.data)); - args.push_back(make_pair(sizeof(cl_mem), (void *)&dists.data)); - args.push_back(make_pair(sizeof(cl_int), (void *)&colsNumb)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&all_dist.data)); + + args.push_back(make_pair(sizeof(cl_int), (void *)&feature_width)); args.push_back(make_pair(sizeof(cl_int), (void *)&src_step)); args.push_back(make_pair(sizeof(cl_int), (void *)¢ers_step)); - args.push_back(make_pair(sizeof(cl_int), (void *)&label_step)); - args.push_back(make_pair(sizeof(cl_int), (void *)&number_of_input)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows)); args.push_back(make_pair(sizeof(cl_int), (void *)¢ers.rows)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset)); args.push_back(make_pair(sizeof(cl_int), (void *)¢ers_offset)); + size_t globalThreads[3] = { all_dist_count, 1, 1 }; + openCLExecuteKernel(Context::getContext(), &kmeans_kernel, - kernelname, globalThreads, NULL, args, -1, -1, build_opt.c_str()); + "distanceToCenters", globalThreads, NULL, args, -1, -1, build_opt_ss.str().c_str()); + + Mat all_dist_cpu; + all_dist.download(all_dist_cpu); + + for (int i = 0; i < src.rows; ++i) + { + Point p; + double minVal; + + Rect roi(i * centers.rows, 0, centers.rows, 1); + Mat hdr(all_dist_cpu, roi); + + cv::minMaxLoc(hdr, &minVal, NULL, &p); + + dists.at(i, 0) = static_cast(minVal); + labels.at(i, 0) = p.x; + } } + ///////////////////////////////////k - means ///////////////////////////////////////////////////////// + double cv::ocl::kmeans(const oclMat &_src, int K, oclMat &_bestLabels, TermCriteria criteria, int attempts, int flags, oclMat &_centers) { @@ -429,28 +432,19 @@ double cv::ocl::kmeans(const oclMat &_src, int K, oclMat &_bestLabels, break; // assign labels - oclMat _dists(1, N, CV_64F); - - _bestLabels.upload(_labels); + Mat dists(1, N, CV_64F); _centers.upload(centers); + distanceToCenters(_src, _centers, dists, _labels); + _bestLabels.upload(_labels); - distanceToCenters(_dists, _bestLabels, _src, _centers); - - Mat dists; - _dists.download(dists); - _bestLabels.download(_labels); float* dist = dists.ptr(0); compactness = 0; for( i = 0; i < N; i++ ) - { - compactness += (double)dist[i]; - } + compactness += (double)dist[i]; } if( compactness < best_compactness ) - { best_compactness = compactness; - } } return best_compactness; diff --git a/modules/ocl/src/opencl/kmeans_kernel.cl b/modules/ocl/src/opencl/kmeans_kernel.cl index f62a08f..244d52c 100644 --- a/modules/ocl/src/opencl/kmeans_kernel.cl +++ b/modules/ocl/src/opencl/kmeans_kernel.cl @@ -44,81 +44,64 @@ // //M*/ +static float distance_(__global const float * center, __global const float * src, int feature_length) +{ + float res = 0; + float4 v0, v1, v2; + int i = 0; + #ifdef L1_DIST -# define DISTANCE(A, B) fabs((A) - (B)) -#elif defined L2SQR_DIST -# define DISTANCE(A, B) ((A) - (B)) * ((A) - (B)) -#else -# define DISTANCE(A, B) ((A) - (B)) * ((A) - (B)) + float4 sum = (float4)(0.0f); #endif -inline float dist(__global const float * center, __global const float * src, int feature_cols) -{ - float res = 0; - float4 tmp4; - int i; - for(i = 0; i < feature_cols / 4; i += 4, center += 4, src += 4) + for ( ; i <= feature_length - 4; i += 4) { - tmp4 = vload4(0, center) - vload4(0, src); + v0 = vload4(0, center + i); + v1 = vload4(0, src + i); + v2 = v1 - v0; #ifdef L1_DIST - tmp4 = fabs(tmp4); + v0 = fabs(v2); + sum += v0; #else - tmp4 *= tmp4; + res += dot(v2, v2); #endif - res += tmp4.x + tmp4.y + tmp4.z + tmp4.w; } - for(; i < feature_cols; ++i, ++center, ++src) +#ifdef L1_DIST + res = sum.x + sum.y + sum.z + sum.w; +#endif + + for ( ; i < feature_length; ++i) { - res += DISTANCE(*src, *center); + float t0 = src[i]; + float t1 = center[i]; +#ifdef L1_DIST + res += fabs(t0 - t1); +#else + float t2 = t0 - t1; + res += t2 * t2; +#endif } + return res; } -// to be distinguished with distanceToCenters in kmeans_kernel.cl -__kernel void distanceToCenters( - __global const float *src, - __global const float *centers, -#ifdef USE_INDEX - __global const int *indices, -#endif - __global int *labels, - __global float *dists, - int feature_cols, - int src_step, - int centers_step, - int label_step, - int input_size, - int K, - int offset_src, - int offset_centers -) +__kernel void distanceToCenters(__global const float * src, __global const float * centers, + __global float * dists, int feature_length, + int src_step, int centers_step, + int features_count, int centers_count, + int src_offset, int centers_offset) { int gid = get_global_id(0); - float euDist, minval; - int minCentroid; - if(gid >= input_size) - { - return; - } - src += offset_src; - centers += offset_centers; -#ifdef USE_INDEX - src += indices[gid] * src_step; -#else - src += gid * src_step; -#endif - minval = dist(centers, src, feature_cols); - minCentroid = 0; - for(int i = 1 ; i < K; i++) + + if (gid < (features_count * centers_count)) { - euDist = dist(centers + i * centers_step, src, feature_cols); - if(euDist < minval) - { - minval = euDist; - minCentroid = i; - } + int feature_index = gid / centers_count; + int center_index = gid % centers_count; + + int center_idx = mad24(center_index, centers_step, centers_offset); + int src_idx = mad24(feature_index, src_step, src_offset); + + dists[gid] = distance_(centers + center_idx, src + src_idx, feature_length); } - labels[gid * label_step] = minCentroid; - dists[gid] = minval; } diff --git a/modules/ocl/test/test_kmeans.cpp b/modules/ocl/test/test_kmeans.cpp index 6539c51..e1d0a17 100644 --- a/modules/ocl/test/test_kmeans.cpp +++ b/modules/ocl/test/test_kmeans.cpp @@ -61,7 +61,7 @@ PARAM_TEST_CASE(Kmeans, int, int, int) int type; int K; int flags; - cv::Mat src ; + Mat src ; ocl::oclMat d_src, d_dists; Mat labels, centers; @@ -73,7 +73,7 @@ PARAM_TEST_CASE(Kmeans, int, int, int) flags = GET_PARAM(2); // MWIDTH=256, MHEIGHT=256. defined in utility.hpp - cv::Size size = cv::Size(MWIDTH, MHEIGHT); + Size size = Size(MWIDTH, MHEIGHT); src.create(size, type); int row_idx = 0; const int max_neighbour = MHEIGHT / K - 1; @@ -159,15 +159,15 @@ INSTANTIATE_TEST_CASE_P(OCL_ML, Kmeans, Combine( /////////////////////////////// DistanceToCenters ////////////////////////////////////////// -CV_ENUM(DistType, NORM_L1, NORM_L2SQR); +CV_ENUM(DistType, NORM_L1, NORM_L2SQR) PARAM_TEST_CASE(distanceToCenters, DistType, bool) { - cv::Size size; int distType; bool useRoi; - cv::Mat src, centers, src_roi, centers_roi; - cv::ocl::oclMat ocl_src, ocl_centers, ocl_src_roi, ocl_centers_roi; + + Mat src, centers, src_roi, centers_roi; + ocl::oclMat ocl_src, ocl_centers, ocl_src_roi, ocl_centers_roi; virtual void SetUp() { @@ -177,70 +177,59 @@ PARAM_TEST_CASE(distanceToCenters, DistType, bool) void random_roi() { - Size roiSize_src = randomSize(10,1000); - Size roiSize_centers = randomSize(10, 1000); - roiSize_src.width = roiSize_centers.width; + Size roiSizeSrc = randomSize(1, MAX_VALUE); + Size roiSizeCenters = randomSize(1, MAX_VALUE); + roiSizeSrc.width = roiSizeCenters.width; - Border srcBorder = randomBorder(0, useRoi ? 500 : 0); - randomSubMat(src, src_roi, roiSize_src, srcBorder, CV_32FC1, -SHRT_MAX, SHRT_MAX); + Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); + randomSubMat(src, src_roi, roiSizeSrc, srcBorder, CV_32FC1, -MAX_VALUE, MAX_VALUE); Border centersBorder = randomBorder(0, useRoi ? 500 : 0); - randomSubMat(centers, centers_roi, roiSize_centers, centersBorder, CV_32FC1, -SHRT_MAX, SHRT_MAX); - - for(int i = 0; i(i, randomInt(0,centers.cols-1)) = (float)randomDouble(SHRT_MAX, INT_MAX); + randomSubMat(centers, centers_roi, roiSizeCenters, centersBorder, CV_32FC1, -MAX_VALUE, MAX_VALUE); - generateOclMat(ocl_src, ocl_src_roi, src, roiSize_src, srcBorder); - generateOclMat(ocl_centers, ocl_centers_roi, centers, roiSize_centers, centersBorder); + for (int i = 0; i < centers.rows; i++) + centers.at(i, randomInt(0, centers.cols)) = (float)randomDouble(SHRT_MAX, INT_MAX); + generateOclMat(ocl_src, ocl_src_roi, src, roiSizeSrc, srcBorder); + generateOclMat(ocl_centers, ocl_centers_roi, centers, roiSizeCenters, centersBorder); } - }; OCL_TEST_P(distanceToCenters, Accuracy) { - for(int j = 0; j< LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::ocl::oclMat ocl_dists; - cv::ocl::oclMat ocl_labels; - - cv::ocl::distanceToCenters(ocl_dists,ocl_labels,ocl_src_roi, ocl_centers_roi, distType); - Mat labels, dists; - ocl_labels.download(labels); - ocl_dists.download(dists); + ocl::distanceToCenters(ocl_src_roi, ocl_centers_roi, dists, labels, distType); - ASSERT_EQ(ocl_dists.cols, ocl_labels.rows); + EXPECT_EQ(dists.size(), labels.size()); Mat batch_dists; - cv::batchDistance(src_roi, centers_roi, batch_dists, CV_32FC1, noArray(), distType); - std::vector gold_dists_v; + std::vector gold_dists_v; + gold_dists_v.reserve(batch_dists.rows); - for(int i = 0; i(i, 0); + EXPECT_EQ(mLoc.x, ocl_label); - gold_dists_v.push_back(mVal); + gold_dists_v.push_back(static_cast(mVal)); } - Mat gold_dists(gold_dists_v); - dists.convertTo(dists, CV_64FC1); - double relative_error = cv::norm(gold_dists.t(), dists, NORM_INF|NORM_RELATIVE); + + double relative_error = cv::norm(Mat(gold_dists_v), dists, NORM_INF | NORM_RELATIVE); ASSERT_LE(relative_error, 1e-5); } } - -INSTANTIATE_TEST_CASE_P (OCL_ML, distanceToCenters, Combine(DistType::all(), Bool()) ); - +INSTANTIATE_TEST_CASE_P (OCL_ML, distanceToCenters, Combine(DistType::all(), Bool())); #endif -- 2.7.4