From b4ad12821861c3527ed06755b0959ebbeab956b9 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Sep 2013 14:07:54 +0400 Subject: [PATCH] refactoed and extended ocl::transpose --- modules/ocl/src/arithm.cpp | 72 +++-- modules/ocl/src/opencl/arithm_transpose.cl | 478 ++--------------------------- 2 files changed, 66 insertions(+), 484 deletions(-) diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 97da8c0..b18fa44 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -1511,57 +1511,51 @@ oclMatExpr::operator oclMat() const ////////////////////////////////////////////////////////////////////////////// /////////////////////////////// transpose //////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// + #define TILE_DIM (32) #define BLOCK_ROWS (256/TILE_DIM) + static void transpose_run(const oclMat &src, oclMat &dst, string kernelName) { - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F) + Context *clCxt = src.clCxt; + if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); + CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; } - CV_Assert(src.cols == dst.rows && src.rows == dst.cols); - - Context *clCxt = src.clCxt; - int channels = src.oclchannels(); - int depth = src.depth(); - - int vector_lengths[4][7] = {{1, 0, 0, 0, 1, 1, 0}, - {0, 0, 1, 1, 0, 0, 0}, - {0, 0, 0, 0 , 0, 0, 0}, - {1, 1, 0, 0, 0, 0, 0} - }; - - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1); - int cols = divUp(src.cols + offset_cols, vector_length); + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + const char channelsString[] = { ' ', ' ', '2', '4', '4' }; + std::string buildOptions = format("-D T=%s%c", typeMap[src.depth()], + channelsString[src.channels()]); size_t localThreads[3] = { TILE_DIM, BLOCK_ROWS, 1 }; - size_t globalThreads[3] = { cols, src.rows, 1 }; + size_t globalThreads[3] = { src.cols, src.rows, 1 }; + + int srcstep1 = src.step / src.elemSize(), dststep1 = dst.step / dst.elemSize(); + int srcoffset1 = src.offset / src.elemSize(), dstoffset1 = dst.offset / dst.elemSize(); vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset )); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); + args.push_back( make_pair( sizeof(cl_int), (void *)&srcstep1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dststep1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&srcoffset1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dstoffset1 )); - openCLExecuteKernel(clCxt, &arithm_transpose, kernelName, globalThreads, localThreads, args, channels, depth); + openCLExecuteKernel(clCxt, &arithm_transpose, kernelName, globalThreads, localThreads, + args, -1, -1, buildOptions.c_str()); } void cv::ocl::transpose(const oclMat &src, oclMat &dst) { - CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3 || src.type() == CV_8UC4 || src.type() == CV_8SC3 || src.type() == CV_8SC4 || - src.type() == CV_16UC2 || src.type() == CV_16SC2 || src.type() == CV_32SC1 || src.type() == CV_32FC1); - - oclMat emptyMat; + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - if( src.data == dst.data && dst.cols == dst.rows ) - transpose_run( src, emptyMat, "transposeI_"); + if ( src.data == dst.data && src.cols == src.rows && dst.offset == src.offset + && dst.rows == dst.cols && src.cols == dst.cols) + transpose_run( src, dst, "transpose_inplace"); else { dst.create(src.cols, src.rows, src.type()); @@ -1569,6 +1563,10 @@ void cv::ocl::transpose(const oclMat &src, oclMat &dst) } } +////////////////////////////////////////////////////////////////////////////// +////////////////////////////// addWeighted /////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// + void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, double beta, double gama, oclMat &dst) { Context *clCxt = src1.clCxt; @@ -1633,6 +1631,10 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, args, -1, -1, buildOptions.c_str()); } +////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////// Pow ////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// + static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernelName, const char **kernelString) { CV_Assert(src1.cols == dst.cols && src1.rows == dst.rows); @@ -1671,6 +1673,7 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); } + void cv::ocl::pow(const oclMat &x, double p, oclMat &y) { if(!x.clCxt->supportsFeature(Context::CL_DOUBLE) && x.type() == CV_64F) @@ -1685,6 +1688,11 @@ void cv::ocl::pow(const oclMat &x, double p, oclMat &y) arithmetic_pow_run(x, p, y, kernelName, &arithm_pow); } + +////////////////////////////////////////////////////////////////////////////// +/////////////////////////////// setIdentity ////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// + void cv::ocl::setIdentity(oclMat& src, double scalar) { CV_Assert(src.empty() == false && src.rows == src.cols); @@ -1711,7 +1719,6 @@ void cv::ocl::setIdentity(oclMat& src, double scalar) } - vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); @@ -1735,7 +1742,8 @@ void cv::ocl::setIdentity(oclMat& src, double scalar) { scalar_i = (int)scalar; args.push_back(make_pair(sizeof(cl_int), (void*)&scalar_i)); - }else + } + else { scalar_f = (float)scalar; args.push_back(make_pair(sizeof(cl_float), (void*)&scalar_f)); diff --git a/modules/ocl/src/opencl/arithm_transpose.cl b/modules/ocl/src/opencl/arithm_transpose.cl index d0725b0..57f7f1b 100644 --- a/modules/ocl/src/opencl/arithm_transpose.cl +++ b/modules/ocl/src/opencl/arithm_transpose.cl @@ -43,468 +43,42 @@ // //M*/ -#define TILE_DIM 32 -#define BLOCK_ROWS 8 -#define LDS_STEP (TILE_DIM + 1) - - -//8UC1 is not unoptimized, as the size of write per thread is 8 -//which will use completepath -__kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset, - __global uchar* dst, int dst_step, int dst_offset, - int src_rows, int src_cols) -{ - - int gp_x = get_group_id(0), gp_y = get_group_id(1); - int gs_x = get_num_groups(0), gs_y = get_num_groups(1); - - int groupId_x, groupId_y; - - if(src_rows == src_cols) - { - groupId_y = gp_x; - groupId_x = (gp_x + gp_y) % gs_x; - } - else - { - int bid = gp_x + gs_x * gp_y; - groupId_y = bid % gs_y; - groupId_x = ((bid / gs_y) + groupId_y) % gs_x; - } - - int lx = get_local_id(0); - int ly = get_local_id(1); - - int x = groupId_x * TILE_DIM + lx; - int y = groupId_y * TILE_DIM + ly; - - int x_index = groupId_y * TILE_DIM + lx; - int y_index = groupId_x * TILE_DIM + ly; - - __local uchar title[TILE_DIM * LDS_STEP]; - - if(x < src_cols && y < src_rows) - { - int index_src = mad24(y, src_step, x); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if(y + i < src_rows) - { - title[(ly + i) * LDS_STEP + lx] =*(src + src_offset + index_src); - index_src = mad24(BLOCK_ROWS, src_step, index_src); - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(x_index < src_rows && y_index < src_cols) - { - int index_dst = mad24(y_index, dst_step, x_index); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if((y_index + i) < src_cols) - { - *(dst + dst_offset + index_dst ) = title[lx * LDS_STEP + ly + i]; - index_dst += dst_step * BLOCK_ROWS ; - } - } - } -} - -__kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset, - __global int* dst, int dst_step, int dst_offset, - int src_rows, int src_cols) -{ - - int gp_x = get_group_id(0), gp_y = get_group_id(1); - int gs_x = get_num_groups(0), gs_y = get_num_groups(1); - - int groupId_x, groupId_y; - - if(src_rows == src_cols) - { - groupId_y = gp_x; - groupId_x = (gp_x + gp_y) % gs_x; - } - else - { - int bid = gp_x + gs_x * gp_y; - groupId_y = bid % gs_y; - groupId_x = ((bid / gs_y) + groupId_y) % gs_x; - } - - int lx = get_local_id(0); - int ly = get_local_id(1); - - int x = groupId_x * TILE_DIM + lx; - int y = groupId_y * TILE_DIM + ly; - - int x_index = groupId_y * TILE_DIM + lx; - int y_index = groupId_x * TILE_DIM + ly; - - __local int title[TILE_DIM * LDS_STEP]; - - if(x < src_cols && y < src_rows) - { - int index_src = mad24(y, src_step, (x << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if(y + i < src_rows) - { - title[(ly + i) * LDS_STEP + lx] = *((__global int *)((__global char*)src + src_offset + index_src)); - index_src = mad24(BLOCK_ROWS, src_step, index_src); - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(x_index < src_rows && y_index < src_cols) - { - int index_dst = mad24(y_index, dst_step, (x_index << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if((y_index + i) < src_cols) - { - *((__global int*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i]; - index_dst += dst_step * BLOCK_ROWS ; - } - } - } -} -__kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset, - __global float* dst, int dst_step, int dst_offset, - int src_rows, int src_cols) +#if defined (DOUBLE_SUPPORT) +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif + +__kernel void transpose(__global const T* src, __global T* dst, + int src_cols, int src_rows, + int src_step, int dst_step, + int src_offset, int dst_offset) { + int x = get_global_id(0); + int y = get_global_id(1); - int gp_x = get_group_id(0), gp_y = get_group_id(1); - int gs_x = get_num_groups(0), gs_y = get_num_groups(1); - - int groupId_x, groupId_y; - - if(src_rows == src_cols) - { - groupId_y = gp_x; - groupId_x = (gp_x + gp_y) % gs_x; - } - else + if (x < src_cols && y < src_rows) { - int bid = gp_x + gs_x * gp_y; - groupId_y = bid % gs_y; - groupId_x = ((bid / gs_y) + groupId_y) % gs_x; - } - - int lx = get_local_id(0); - int ly = get_local_id(1); - - int x = groupId_x * TILE_DIM + lx; - int y = groupId_y * TILE_DIM + ly; - - int x_index = groupId_y * TILE_DIM + lx; - int y_index = groupId_x * TILE_DIM + ly; + int srcIdx = mad24(y, src_step, src_offset + x); + int dstIdx = mad24(x, dst_step, dst_offset + y); - __local float title[TILE_DIM * LDS_STEP]; - - if(x < src_cols && y < src_rows) - { - int index_src = mad24(y, src_step, (x << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if(y + i < src_rows) - { - title[(ly + i) * LDS_STEP + lx] = *((__global float *)((__global char*)src + src_offset + index_src)); - index_src = mad24(BLOCK_ROWS, src_step, index_src); - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(x_index < src_rows && y_index < src_cols) - { - int index_dst = mad24(y_index, dst_step, (x_index << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if((y_index + i) < src_cols) - { - *((__global float*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i]; - index_dst += dst_step * BLOCK_ROWS ; - } - } + dst[dstIdx] = src[srcIdx]; } } -__kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset, - __global ushort* dst, int dst_step, int dst_offset, - int src_rows, int src_cols) +__kernel void transpose_inplace(__global T* src, __global T* dst, + int src_cols, int src_rows, + int src_step, int dst_step, + int src_offset, int dst_offset) { + int x = get_global_id(0); + int y = get_global_id(1); - int gp_x = get_group_id(0), gp_y = get_group_id(1); - int gs_x = get_num_groups(0), gs_y = get_num_groups(1); - - int groupId_x, groupId_y; - - if(src_rows == src_cols) - { - groupId_y = gp_x; - groupId_x = (gp_x + gp_y) % gs_x; - } - else - { - int bid = gp_x + gs_x * gp_y; - groupId_y = bid % gs_y; - groupId_x = ((bid / gs_y) + groupId_y) % gs_x; - } - - int lx = get_local_id(0); - int ly = get_local_id(1); - - int x = groupId_x * TILE_DIM + lx; - int y = groupId_y * TILE_DIM + ly; - - int x_index = groupId_y * TILE_DIM + lx; - int y_index = groupId_x * TILE_DIM + ly; - - __local ushort2 title[TILE_DIM * LDS_STEP]; - - if(x < src_cols && y < src_rows) - { - int index_src = mad24(y, src_step, (x << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if(y + i < src_rows) - { - title[(ly + i) * LDS_STEP + lx] = *((__global ushort2 *)((__global char*)src + src_offset + index_src)); - index_src = mad24(BLOCK_ROWS, src_step, index_src); - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(x_index < src_rows && y_index < src_cols) - { - int index_dst = mad24(y_index, dst_step, (x_index << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if((y_index + i) < src_cols) - { - *((__global ushort2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i]; - index_dst += dst_step * BLOCK_ROWS ; - } - } - } -} -__kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset, - __global short* dst, int dst_step, int dst_offset, - int src_rows, int src_cols) -{ - - int gp_x = get_group_id(0), gp_y = get_group_id(1); - int gs_x = get_num_groups(0), gs_y = get_num_groups(1); - - int groupId_x, groupId_y; - - if(src_rows == src_cols) - { - groupId_y = gp_x; - groupId_x = (gp_x + gp_y) % gs_x; - } - else - { - int bid = gp_x + gs_x * gp_y; - groupId_y = bid % gs_y; - groupId_x = ((bid / gs_y) + groupId_y) % gs_x; - } - - int lx = get_local_id(0); - int ly = get_local_id(1); - - int x = groupId_x * TILE_DIM + lx; - int y = groupId_y * TILE_DIM + ly; - - int x_index = groupId_y * TILE_DIM + lx; - int y_index = groupId_x * TILE_DIM + ly; - - __local short2 title[TILE_DIM * LDS_STEP]; - - if(x < src_cols && y < src_rows) - { - int index_src = mad24(y, src_step, (x << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if(y + i < src_rows) - { - title[(ly + i) * LDS_STEP + lx] = *((__global short2 *)((__global char*)src + src_offset + index_src)); - index_src = mad24(BLOCK_ROWS, src_step, index_src); - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(x_index < src_rows && y_index < src_cols) - { - int index_dst = mad24(y_index, dst_step, (x_index << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if((y_index + i) < src_cols) - { - *((__global short2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i]; - index_dst += dst_step * BLOCK_ROWS ; - } - } - } -} -__kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset, - __global uchar* dst, int dst_step, int dst_offset, - int src_rows, int src_cols) -{ - - int gp_x = get_group_id(0), gp_y = get_group_id(1); - int gs_x = get_num_groups(0), gs_y = get_num_groups(1); - - int groupId_x, groupId_y; - - if(src_rows == src_cols) - { - groupId_y = gp_x; - groupId_x = (gp_x + gp_y) % gs_x; - } - else - { - int bid = gp_x + gs_x * gp_y; - groupId_y = bid % gs_y; - groupId_x = ((bid / gs_y) + groupId_y) % gs_x; - } - - int lx = get_local_id(0); - int ly = get_local_id(1); - - int x = groupId_x * TILE_DIM + lx; - int y = groupId_y * TILE_DIM + ly; - - int x_index = groupId_y * TILE_DIM + lx; - int y_index = groupId_x * TILE_DIM + ly; - - __local uchar4 title[TILE_DIM * LDS_STEP]; - - if(x < src_cols && y < src_rows) - { - int index_src = mad24(y, src_step, (x << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if(y + i < src_rows) - { - title[(ly + i) * LDS_STEP + lx] = *((__global uchar4 *)(src + src_offset + index_src)); - index_src = mad24(BLOCK_ROWS, src_step, index_src); - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(x_index < src_rows && y_index < src_cols) - { - int index_dst = mad24(y_index, dst_step, (x_index << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if((y_index + i) < src_cols) - { - *((__global uchar4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i]; - index_dst += dst_step * BLOCK_ROWS ; - } - } - } -} - -__kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset, - __global char* dst, int dst_step, int dst_offset, - int src_rows, int src_cols) -{ - - int gp_x = get_group_id(0), gp_y = get_group_id(1); - int gs_x = get_num_groups(0), gs_y = get_num_groups(1); - - int groupId_x, groupId_y; - - if(src_rows == src_cols) - { - groupId_y = gp_x; - groupId_x = (gp_x + gp_y) % gs_x; - } - else - { - int bid = gp_x + gs_x * gp_y; - groupId_y = bid % gs_y; - groupId_x = ((bid / gs_y) + groupId_y) % gs_x; - } - - int lx = get_local_id(0); - int ly = get_local_id(1); - - int x = groupId_x * TILE_DIM + lx; - int y = groupId_y * TILE_DIM + ly; - - int x_index = groupId_y * TILE_DIM + lx; - int y_index = groupId_x * TILE_DIM + ly; - - __local char4 title[TILE_DIM * LDS_STEP]; - - if(x < src_cols && y < src_rows) - { - int index_src = mad24(y, src_step, (x << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if(y + i < src_rows) - { - title[(ly + i) * LDS_STEP + lx] = *((__global char4 *)(src + src_offset + index_src)); - index_src = mad24(BLOCK_ROWS, src_step, index_src); - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(x_index < src_rows && y_index < src_cols) + if (x < src_cols && y < src_rows && x < y) { - int index_dst = mad24(y_index, dst_step, (x_index << 2)); + int srcIdx = mad24(y, src_step, src_offset + x); + int dstIdx = mad24(x, dst_step, dst_offset + y); - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if((y_index + i) < src_cols) - { - *((__global char4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i]; - index_dst += dst_step * BLOCK_ROWS ; - } - } + T tmp = dst[dstIdx]; + dst[dstIdx] = src[srcIdx]; + src[srcIdx] = tmp; } } -- 2.7.4