From: yao Date: Fri, 8 Mar 2013 02:30:01 +0000 (+0800) Subject: fix mismatch on NV OCL and some AMD GPUs X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~1314^2~1425^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=7f0d6d42f0a22db5984e17dc38ad03e8a548d029;p=platform%2Fupstream%2Fopencv.git fix mismatch on NV OCL and some AMD GPUs --- diff --git a/modules/ocl/src/kernels/moments.cl b/modules/ocl/src/kernels/moments.cl index bd3001e..6048837 100644 --- a/modules/ocl/src/kernels/moments.cl +++ b/modules/ocl/src/kernels/moments.cl @@ -1,42 +1,56 @@ #if defined (DOUBLE_SUPPORT) + +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif +typedef double T; + #else typedef float double; typedef float4 double4; +typedef long T; #define convert_double4 convert_float4 #endif //#pragma OPENCL EXTENSION cl_amd_printf:enable //#if defined (DOUBLE_SUPPORT) -__kernel void icvContourMoments(int contour_total, - __global float* reader_oclmat_data, - __global double* dst_a00, - __global double* dst_a10, - __global double* dst_a01, - __global double* dst_a20, - __global double* dst_a11, - __global double* dst_a02, - __global double* dst_a30, - __global double* dst_a21, - __global double* dst_a12, - __global double* dst_a03) +#define DST_ROW_A00 0 +#define DST_ROW_A10 1 +#define DST_ROW_A01 2 +#define DST_ROW_A20 3 +#define DST_ROW_A11 4 +#define DST_ROW_A02 5 +#define DST_ROW_A30 6 +#define DST_ROW_A21 7 +#define DST_ROW_A12 8 +#define DST_ROW_A03 9 + +__kernel void icvContourMoments(int contour_total, + __global float* reader_oclmat_data, + __global T* dst_a, + int dst_step) { - double xi_1, yi_1, xi_12, yi_12, xi, yi, xi2, yi2, dxy, xii_1, yii_1; + T xi_1, yi_1, xi_12, yi_12, xi, yi, xi2, yi2, dxy, xii_1, yii_1; int idx = get_global_id(0); - xi_1 = *(reader_oclmat_data + (get_global_id(0) << 1)); - yi_1 = *(reader_oclmat_data + (get_global_id(0) << 1) + 1); + if (idx < 0 || idx >= contour_total) + return; + + xi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1))); + yi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1) + 1)); xi_12 = xi_1 * xi_1; yi_12 = yi_1 * yi_1; if(idx == contour_total - 1) { - xi = *(reader_oclmat_data); - yi = *(reader_oclmat_data + 1); + xi = (T)(*(reader_oclmat_data)); + yi = (T)(*(reader_oclmat_data + 1)); } else { - xi = *(reader_oclmat_data + (idx + 1) * 2); - yi = *(reader_oclmat_data + (idx + 1) * 2 + 1); + xi = (T)(*(reader_oclmat_data + (idx + 1) * 2)); + yi = (T)(*(reader_oclmat_data + (idx + 1) * 2 + 1)); } xi2 = xi * xi; @@ -44,19 +58,20 @@ __kernel void icvContourMoments(int contour_total, dxy = xi_1 * yi - xi * yi_1; xii_1 = xi_1 + xi; yii_1 = yi_1 + yi; - - dst_a00[idx] = dxy; - dst_a10[idx] = dxy * xii_1; - dst_a01[idx] = dxy * yii_1; - dst_a20[idx] = dxy * (xi_1 * xii_1 + xi2); - dst_a11[idx] = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi)); - dst_a02[idx] = dxy * (yi_1 * yii_1 + yi2); - dst_a30[idx] = dxy * xii_1 * (xi_12 + xi2); - dst_a03[idx] = dxy * yii_1 * (yi_12 + yi2); - dst_a21[idx] = + + dst_step /= sizeof(T); + *( dst_a + DST_ROW_A00 * dst_step + idx) = dxy; + *( dst_a + DST_ROW_A10 * dst_step + idx) = dxy * xii_1; + *( dst_a + DST_ROW_A01 * dst_step + idx) = dxy * yii_1; + *( dst_a + DST_ROW_A20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2); + *( dst_a + DST_ROW_A11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi)); + *( dst_a + DST_ROW_A02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2); + *( dst_a + DST_ROW_A30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2); + *( dst_a + DST_ROW_A03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2); + *( dst_a + DST_ROW_A21 * dst_step + idx) = dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 + xi2 * (yi_1 + 3 * yi)); - dst_a12[idx] = + *( dst_a + DST_ROW_A12 * dst_step + idx) = dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 + yi2 * (xi_1 + 3 * xi)); } diff --git a/modules/ocl/src/moments.cpp b/modules/ocl/src/moments.cpp index 6979433..4abca03 100644 --- a/modules/ocl/src/moments.cpp +++ b/modules/ocl/src/moments.cpp @@ -98,25 +98,19 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom ) CvSeqReader reader; int lpt = contour->total; double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03; - int dst_type = cv::ocl::Context::getContext()->impl->double_support ? CV_64FC1 : CV_32FC1; cvStartReadSeq( contour, &reader, 0 ); - cv::ocl::oclMat dst_a00(1,lpt,dst_type); - cv::ocl::oclMat dst_a10(1,lpt,dst_type); - cv::ocl::oclMat dst_a01(1,lpt,dst_type); - cv::ocl::oclMat dst_a20(1,lpt,dst_type); - cv::ocl::oclMat dst_a11(1,lpt,dst_type); - cv::ocl::oclMat dst_a02(1,lpt,dst_type); - cv::ocl::oclMat dst_a30(1,lpt,dst_type); - cv::ocl::oclMat dst_a21(1,lpt,dst_type); - cv::ocl::oclMat dst_a12(1,lpt,dst_type); - cv::ocl::oclMat dst_a03(1,lpt,dst_type); size_t reader_size = lpt << 1; cv::Mat reader_mat(1,reader_size,CV_32FC1); bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2; + if (!cv::ocl::Context::getContext()->impl->double_support && is_float) + { + CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!"); + } + if( is_float ) { for(size_t i = 0; i < reader_size; ++i) @@ -136,6 +130,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom ) } } + cv::ocl::oclMat dst_a(10, lpt, CV_64FC1); cv::ocl::oclMat reader_oclmat(reader_mat); int llength = std::min(lpt,128); size_t localThreads[3] = { llength, 1, 1}; @@ -143,48 +138,43 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom ) vector > args; args.push_back( make_pair( sizeof(cl_int) , (void *)&contour->total )); args.push_back( make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a00.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a10.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a01.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a20.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a11.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a02.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a30.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a21.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a12.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a03.data )); - openCLExecuteKernel(dst_a00.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1); - - cv::Mat dst(dst_a00); - cv::Scalar s = cv::sum(dst); - a00 = s[0]; - dst = dst_a10; - s = cv::sum(dst); - a10 = s[0];//dstsum[1]; - dst = dst_a01; - s = cv::sum(dst); - a01 = s[0];//dstsum[2]; - dst = dst_a20; - s = cv::sum(dst); - a20 = s[0];//dstsum[3]; - dst = dst_a11; - s = cv::sum(dst); - a11 = s[0];//dstsum[4]; - dst = dst_a02; - s = cv::sum(dst); - a02 = s[0];//dstsum[5]; - dst = dst_a30; - s = cv::sum(dst); - a30 = s[0];//dstsum[6]; - dst = dst_a21; - s = cv::sum(dst); - a21 = s[0];//dstsum[7]; - dst = dst_a12; - s = cv::sum(dst); - a12 = s[0];//dstsum[8]; - dst = dst_a03; - s = cv::sum(dst); - a03 = s[0];//dstsum[9]; + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a.data )); + cl_int dst_step = (cl_int)dst_a.step; + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step )); + + openCLExecuteKernel(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1); + + cv::Mat dst(dst_a); + a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0; + if (!cv::ocl::Context::getContext()->impl->double_support) + { + for (int i = 0; i < contour->total; ++i) + { + a00 += dst.at(0, i); + a10 += dst.at(1, i); + a01 += dst.at(2, i); + a20 += dst.at(3, i); + a11 += dst.at(4, i); + a02 += dst.at(5, i); + a30 += dst.at(6, i); + a21 += dst.at(7, i); + a12 += dst.at(8, i); + a03 += dst.at(9, i); + } + } + else + { + a00 = cv::sum(dst.row(0))[0]; + a10 = cv::sum(dst.row(1))[0]; + a01 = cv::sum(dst.row(2))[0]; + a20 = cv::sum(dst.row(3))[0]; + a11 = cv::sum(dst.row(4))[0]; + a02 = cv::sum(dst.row(5))[0]; + a30 = cv::sum(dst.row(6))[0]; + a21 = cv::sum(dst.row(7))[0]; + a12 = cv::sum(dst.row(8))[0]; + a03 = cv::sum(dst.row(9))[0]; + } double db1_2, db1_6, db1_12, db1_24, db1_20, db1_60; if( fabs(a00) > FLT_EPSILON )