From 7fe840307485099948a312d5fe113556fd9f76e5 Mon Sep 17 00:00:00 2001 From: peng xiao Date: Tue, 13 Aug 2013 13:58:55 +0800 Subject: [PATCH] Let ocl::filter2D support kernel size >= 3. --- modules/ocl/src/filtering.cpp | 88 +++-- modules/ocl/src/opencl/filtering_laplacian.cl | 546 +++++++++----------------- modules/ocl/test/test_filters.cpp | 37 +- 3 files changed, 288 insertions(+), 383 deletions(-) diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index f35a26e..324bf83 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -592,20 +592,21 @@ public: } static void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel, - Size &ksize, const Point anchor, const int borderType) + Size &ksize, const Point anchor, const int borderType) { CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && (src.rows == dst.rows)); CV_Assert((src.oclchannels() == dst.oclchannels())); - CV_Assert((borderType != 0)); CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1)); CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1)); + CV_Assert(ksize.width == ksize.height); Context *clCxt = src.clCxt; - int cn = src.oclchannels(); - int depth = src.depth(); - string kernelName = "filter2D"; + int filterWidth = ksize.width; + bool ksize_3x3 = filterWidth == 3 && src.type() != CV_32FC4; // CV_32FC4 is not tuned up with filter2d_3x3 kernel + + string kernelName = ksize_3x3 ? "filter2D_3x3" : "filter2D"; size_t src_offset_x = (src.offset % src.step) / src.elemSize(); size_t src_offset_y = src.offset / src.step; @@ -613,44 +614,68 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel, size_t dst_offset_x = (dst.offset % dst.step) / dst.elemSize(); size_t dst_offset_y = dst.offset / dst.step; - int vector_lengths[4][7] = {{4, 4, 4, 4, 4, 4, 4}, - {4, 4, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1}, - {4, 4, 4, 4, 1, 1, 4} - }; + int paddingPixels = (int)(filterWidth/2)*2; + + size_t localThreads[3] = {ksize_3x3 ? 256 : 16, ksize_3x3 ? 1 : 16, 1}; + size_t globalThreads[3] = {src.wholecols, src.wholerows, 1}; - int vector_length = vector_lengths[cn - 1][depth]; - int offset_cols = (dst_offset_x) & (vector_length - 1); - int cols = dst.cols + offset_cols; - int rows = divUp(dst.rows, vector_length); + int cn = src.oclchannels(); + int src_step = (int)(src.step/src.elemSize()); + int dst_step = (int)(dst.step/src.elemSize()); + + int localWidth = localThreads[0] + paddingPixels; + int localHeight = localThreads[1] + paddingPixels; - size_t localThreads[3] = {256, 1, 1}; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(rows, localThreads[1]) *localThreads[1], 1 - }; + size_t localMemSize = ksize_3x3 ? 260 * 6 * src.elemSize() : (localWidth * localHeight) * src.elemSize(); + + int vector_lengths[4][7] = {{4, 4, 4, 4, 4, 4, 4}, + {4, 4, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 1, 1}, + {4, 4, 4, 4, 1, 1, 4} + }; + int cols = dst.cols + ((dst_offset_x) & (vector_lengths[cn - 1][src.depth()] - 1)); vector< pair > args; args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data)); - args.push_back(make_pair(sizeof(cl_int), (void *)&src.step)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src_step)); + args.push_back(make_pair(sizeof(cl_int), (void *)&dst_step)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); + args.push_back(make_pair(localMemSize, (void *)NULL)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols)); args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset_x)); args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset_y)); - args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data)); - args.push_back(make_pair(sizeof(cl_int), (void *)&dst.step)); args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_x)); args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_y)); - args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows)); args.push_back(make_pair(sizeof(cl_int), (void *)&cols)); - args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols)); - args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows)); - - const int buffer_size = 100; - char opt_buffer [buffer_size] = ""; - sprintf(opt_buffer, "-DANCHOR=%d -DANX=%d -DANY=%d", ksize.width, anchor.x, anchor.y); - - openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, cn, depth, opt_buffer); + char btype[30]; + switch (borderType) + { + case 0: + sprintf(btype, "BORDER_CONSTANT"); + break; + case 1: + sprintf(btype, "BORDER_REPLICATE"); + break; + case 2: + sprintf(btype, "BORDER_REFLECT"); + break; + case 3: + CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!"); + return; + case 4: + sprintf(btype, "BORDER_REFLECT_101"); + break; + } + int type = src.depth(); + char build_options[150]; + sprintf(build_options, "-D %s -D IMG_C_%d_%d -D CN=%d -D FILTER_SIZE=%d", btype, cn, type, cn, ksize.width); + openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, -1, -1, build_options); } + Ptr cv::ocl::getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize, Point anchor, int borderType) { @@ -659,8 +684,7 @@ Ptr cv::ocl::getLinearFilter_GPU(int srcType, int dstType, const CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC3 || srcType == CV_8UC4 || srcType == CV_32FC1 || srcType == CV_32FC3 || srcType == CV_32FC4) && dstType == srcType); oclMat gpu_krnl; - int nDivisor; - normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, false); + normalizeKernel(kernel, gpu_krnl, CV_32FC1); normalizeAnchor(anchor, ksize); return Ptr(new LinearFilter_GPU(ksize, anchor, gpu_krnl, GPUFilter2D_callers[CV_MAT_CN(srcType)], diff --git a/modules/ocl/src/opencl/filtering_laplacian.cl b/modules/ocl/src/opencl/filtering_laplacian.cl index 8535eb1..5016b0b 100644 --- a/modules/ocl/src/opencl/filtering_laplacian.cl +++ b/modules/ocl/src/opencl/filtering_laplacian.cl @@ -15,7 +15,9 @@ // Third party copyrights are property of their respective owners. // // @Authors +// Pang Erping, erping@multicorewareinc.com // Jia Haipeng, jiahaipeng95@gmail.com +// Peng Xiao, pengxiao@outlook.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -42,292 +44,229 @@ // the use of this software, even if advised of the possibility of such damage. // //M*/ -#define BORDER_REFLECT_101 +//#define BORDER_REFLECT_101 /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////Macro for border type//////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////// #ifdef BORDER_REPLICATE + //BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh #define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) #define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) : (i)) #define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) #endif #ifdef BORDER_REFLECT -//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i)-1 : (i)) #define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i)-1 : (i)) #define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) #endif #ifdef BORDER_REFLECT_101 //BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i) : (i)) #define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i) : (i)) #define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) #endif -#ifdef BORDER_WRAP -//BORDER_WRAP: cdefgh|abcdefgh|abcdefg -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) +#ifdef IMG_C_1_0 +#define T_IMG uchar +#define T_IMGx4 uchar4 +#define T_IMG_C1 uchar +#define CONVERT_TYPE convert_uchar_sat +#define CONVERT_TYPEx4 convert_uchar4_sat +#endif +#ifdef IMG_C_4_0 +#define T_IMG uchar4 +#define T_IMGx4 uchar16 +#define T_IMG_C1 uchar +#define CONVERT_TYPE convert_uchar4_sat +#define CONVERT_TYPEx4 convert_uchar16_sat +#endif +#ifdef IMG_C_1_5 +#define T_IMG float +#define T_IMGx4 float4 +#define T_IMG_C1 float +#define CONVERT_TYPE convert_float +#define CONVERT_TYPEx4 convert_float4 +#endif +#ifdef IMG_C_4_5 +#define T_IMG float4 +#define T_IMGx4 float16 +#define T_IMG_C1 float +#define CONVERT_TYPE convert_float4 +#define CONVERT_TYPEx4 convert_float16 #endif -////////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////Macro for define elements number per thread///////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// -//#define ANCHOR 3 -//#define ANX 1 -//#define ANY 1 - -#define ROWS_PER_GROUP 4 -#define ROWS_PER_GROUP_BITS 2 -#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2) - -#define THREADS_PER_ROW 64 -#define THREADS_PER_ROW_BIT 6 +#ifndef CN +#define CN 1 +#endif -#define ELEMENTS_PER_THREAD 4 -#define ELEMENTS_PER_THREAD_BIT 2 +#if CN == 1 +#define T_SUM float +#define T_SUMx4 float4 +#define CONVERT_TYPE_SUM convert_float +#define CONVERT_TYPE_SUMx4 convert_float4 +#define SUM_ZERO (0.0f) +#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f) +#define VLOAD4 vload4 +#define SX x +#define SY y +#define SZ z +#define SW w +#elif CN == 4 +#define T_SUM float4 +#define T_SUMx4 float16 +#define CONVERT_TYPE_SUM convert_float4 +#define CONVERT_TYPE_SUMx4 convert_float16 +#define SUM_ZERO (0.0f, 0.0f, 0.0f, 0.0f) +#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f) +#define VLOAD4 vload16 +#define SX s0123 +#define SY s4567 +#define SZ s89ab +#define SW scdef +#endif -#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4 +#ifndef FILTER_SIZE +#define FILTER_SIZE 3 +#endif -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////8uC1//////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x, int src_offset_y, - __global uchar *dst, int dst_step, int dst_offset_x, int dst_offset_y, - __constant int *mat_kernel __attribute__((max_constant_size (16384))), - int cols,int rows, int operate_cols, int wholecols, int wholerows) +#define LOCAL_GROUP_SIZE 16 + +#define LOCAL_WIDTH ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE) +#define LOCAL_HEIGHT ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE) + +#define FILTER_RADIUS (FILTER_SIZE >> 1) + +__kernel void filter2D( + __global T_IMG *src, + __global T_IMG *dst, + int src_step, + int dst_step, + __constant float *mat_kernel, + __local T_IMG *local_data, + int wholerows, + int wholecols, + int src_offset_x, + int src_offset_y, + int dst_offset_x, + int dst_offset_y, + int cols, + int rows, + int operate_cols +) { - int gX = get_global_id(0); - int gY = get_global_id(1); - - int lX = get_local_id(0); - - int groupX_size = get_local_size(0); - int groupX_id = get_group_id(0); - -#define dst_align (dst_offset_x & 3) - int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; - int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY; - - __local uchar local_data[LOCAL_MEM_STEP * ROWS_FETCH]; - if((gY << 2) < rows) + int groupStartCol = get_group_id(0) * get_local_size(0); + int groupStartRow = get_group_id(1) * get_local_size(1); + + int localCol = get_local_id(0); + int localRow = get_local_id(1); + int globalCol = groupStartCol + localCol; + int globalRow = groupStartRow + localRow; + const int src_offset = mad24(src_offset_y, src_step, src_offset_x); + const int dst_offset = mad24(dst_offset_y, dst_step, dst_offset_x); +#ifdef BORDER_CONSTANT + for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1)) { - for(int i = 0; i < ROWS_FETCH; ++i) + int curRow = groupStartRow + i; + for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0)) { - if((rows_start_index - src_offset_y) + i < rows + ANY) + int curCol = groupStartCol + j; + if(curRow < FILTER_RADIUS - src_offset_y || (curRow - FILTER_RADIUS) >= wholerows - src_offset_y|| + curCol < FILTER_RADIUS - src_offset_x || (curCol - FILTER_RADIUS) >= wholecols - src_offset_x) { -#ifdef BORDER_CONSTANT - int selected_row = rows_start_index + i; - int selected_cols = cols_start_index_group + lX; - - uchar data = *(src + selected_row * src_step + selected_cols); - int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols; - data = con ? data : 0; - local_data[i * LOCAL_MEM_STEP + lX ] =data; - - if(lX < (ANX << 1)) - { - selected_cols = cols_start_index_group + lX + groupX_size; - - data = *(src + selected_row * src_step + selected_cols); - con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols; - data = con ? data : 0; - local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; - } -#else - int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); - selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); - - int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols); - selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols); - - uchar data = *(src + selected_row * src_step + selected_cols); - - local_data[i * LOCAL_MEM_STEP + lX ] =data; - - if(lX < (ANX << 1)) - { - selected_cols = cols_start_index_group + lX + groupX_size; - selected_cols = ADDR_R(selected_cols, wholecols, selected_cols); - - data = *(src + selected_row * src_step + selected_cols); - local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; - } -#endif + local_data[(i) * LOCAL_WIDTH + j] = 0; + } + else + { + local_data[(i) * LOCAL_WIDTH + j] = src[(curRow - FILTER_RADIUS) * src_step + curCol - FILTER_RADIUS + src_offset]; } } } - barrier(CLK_LOCAL_MEM_FENCE); - - int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2); - if(((gY << 2) < rows) && (process_col < operate_cols)) +#else + for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1)) { - int dst_cols_start = dst_offset_x; - int dst_cols_end = dst_offset_x + cols; - int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc; - - int dst_rows_end = dst_offset_y + rows; - int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT); + int curRow = groupStartRow + i; - uchar4 dst_data = *((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)); + curRow = ADDR_H(curRow, FILTER_RADIUS - src_offset_y, wholerows - src_offset_y); - int4 sum = (int4)(0); - uchar4 data; + curRow = ADDR_B(curRow - FILTER_RADIUS, wholerows - src_offset_y, curRow - FILTER_RADIUS); - for(int i = 0; i < ANCHOR; i++) + for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0)) { -#pragma unroll - for(int j = 0; j < ANCHOR; j++) + int curCol = groupStartCol + j; + curCol = ADDR_L(curCol, FILTER_RADIUS - src_offset_x, wholecols - src_offset_x); + curCol = ADDR_R(curCol - FILTER_RADIUS, wholecols - src_offset_x, curCol - FILTER_RADIUS); + if(curRow < wholerows && curCol < wholecols) { - if(dst_rows_index < dst_rows_end) - { - int local_row = (lX >> THREADS_PER_ROW_BIT) + i; - int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; - - data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); - sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data)); - } + local_data[(i) * LOCAL_WIDTH + j] = src[(curRow) * src_step + curCol + src_offset]; } } - - if(dst_rows_index < dst_rows_end) - { - sum.x = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? sum.x : dst_data.x; - sum.y = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? sum.y : dst_data.y; - sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z; - sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w; - *((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum); - } } -} -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////32FC1//////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x, int src_offset_y, - __global float *dst, int dst_step, int dst_offset_x, int dst_offset_y, - __constant int *mat_kernel __attribute__((max_constant_size (16384))), - int cols,int rows, int operate_cols, int wholecols, int wholerows) -{ - int gX = get_global_id(0); - int gY = get_global_id(1); - - int lX = get_local_id(0); - - int groupX_size = get_local_size(0); - int groupX_id = get_group_id(0); - -#define dst_align (dst_offset_x & 3) - int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; - int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY; - - __local float local_data[LOCAL_MEM_STEP * ROWS_FETCH]; - if(((gY << 2) < rows)) +#endif + barrier(CLK_LOCAL_MEM_FENCE); + if(globalRow < rows && globalCol < cols) { - for(int i = 0; i < ROWS_FETCH; ++i) + T_SUM sum = (T_SUM)SUM_ZERO; + int filterIdx = 0; + for(int i = 0; i < FILTER_SIZE; i++) { - if((rows_start_index - src_offset_y) + i < rows + ANY) - { -#ifdef BORDER_CONSTANT - int selected_row = rows_start_index + i; - int selected_cols = cols_start_index_group + lX; - - float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2))); - int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols; - data = con ? data : 0; - local_data[i * LOCAL_MEM_STEP + lX ] =data; + int offset = (i + localRow) * LOCAL_WIDTH; - if(lX < (ANX << 1)) - { - selected_cols = cols_start_index_group + lX + groupX_size; - - data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2))); - con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols; - data = con ? data : 0; - local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; - } -#else - int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); - selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); - - int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols); - selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols); - - float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2))); - local_data[i * LOCAL_MEM_STEP + lX] =data; - - if(lX < (ANX << 1)) - { - selected_cols = cols_start_index_group + lX + groupX_size; - selected_cols = ADDR_R(selected_cols, wholecols, selected_cols); - - data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2))); - local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; - } -#endif + for(int j = 0; j < FILTER_SIZE; j++) + { + sum += CONVERT_TYPE_SUM(local_data[offset + j + localCol]) * mat_kernel[filterIdx++]; } } + dst[(globalRow)*dst_step + (globalCol) + dst_offset] = CONVERT_TYPE(sum); } - barrier(CLK_LOCAL_MEM_FENCE); - - int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2); - if(((gY << 2) < rows) && (process_col < operate_cols)) - { - int dst_cols_start = dst_offset_x; - int dst_cols_end = dst_offset_x + cols; - int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc; - - int dst_rows_end = dst_offset_y + rows; - int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT); +} - float4 dst_data = *((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))); +/// following is specific for 3x3 kernels - float4 sum = (float4)(0); - float4 data; +////////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////Macro for define elements number per thread///////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define ANX 1 +#define ANY 1 - for(int i = 0; i < ANCHOR; i++) - { -#pragma unroll - for(int j = 0; j < ANCHOR; j++) - { - if(dst_rows_index < dst_rows_end) - { - int local_row = (lX >> THREADS_PER_ROW_BIT) + i; - int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; +#define ROWS_PER_GROUP 4 +#define ROWS_PER_GROUP_BITS 2 +#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2) - data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); - sum = sum + ((float)(mat_kernel[i * ANCHOR + j]) * data); - } - } - } +#define THREADS_PER_ROW 64 +#define THREADS_PER_ROW_BIT 6 - if(dst_rows_index < dst_rows_end) - { - sum.x = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? sum.x : dst_data.x; - sum.y = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? sum.y : dst_data.y; - sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z; - sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w; +#define ELEMENTS_PER_THREAD 4 +#define ELEMENTS_PER_THREAD_BIT 2 - *((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum; - } - } -} +#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4 /////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////8uC4//////////////////////////////////////////////////////// +/////////////////////////////////////////8uC1//////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_x, int src_offset_y, - __global uchar4 *dst, int dst_step, int dst_offset_x, int dst_offset_y, - __constant int *mat_kernel __attribute__((max_constant_size (16384))), - int cols,int rows, int operate_cols, int wholecols, int wholerows) +__kernel void filter2D_3x3( + __global T_IMG *src, + __global T_IMG *dst, + int src_step, + int dst_step, + __constant float *mat_kernel, + __local T_IMG *local_data, + int wholerows, + int wholecols, + int src_offset_x, + int src_offset_y, + int dst_offset_x, + int dst_offset_y, + int cols, + int rows, + int operate_cols +) { int gX = get_global_id(0); int gY = get_global_id(1); @@ -341,9 +280,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY; - __local uchar4 local_data[LOCAL_MEM_STEP * ROWS_FETCH]; - - if(((gY << 2) < rows)) + if((gY << 2) < rows) { for(int i = 0; i < ROWS_FETCH; ++i) { @@ -353,19 +290,19 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ int selected_row = rows_start_index + i; int selected_cols = cols_start_index_group + lX; - uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2))); - int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols; + T_IMG data = src[mad24(selected_row, src_step, selected_cols)]; + int con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols; data = con ? data : 0; - local_data[i * LOCAL_MEM_STEP + lX ] =data; + local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data; if(lX < (ANX << 1)) { selected_cols = cols_start_index_group + lX + groupX_size; - data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2))); - con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols; + data = src[mad24(selected_row, src_step, selected_cols)]; + con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols; data = con ? data : 0; - local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; + local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data; } #else int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); @@ -374,17 +311,17 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols); selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols); - uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2))); + T_IMG data = src[mad24(selected_row, src_step, selected_cols)]; - local_data[i * LOCAL_MEM_STEP + lX] =data; + local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data; if(lX < (ANX << 1)) { selected_cols = cols_start_index_group + lX + groupX_size; selected_cols = ADDR_R(selected_cols, wholecols, selected_cols); - data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2))); - local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; + data = src[mad24(selected_row, src_step, selected_cols)]; + local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data; } #endif } @@ -401,131 +338,40 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ int dst_rows_end = dst_offset_y + rows; int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT); + dst = dst + mad24(dst_rows_index, dst_step, dst_cols_index); - uchar16 dst_data; - dst_data = *((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))); + T_IMGx4 dst_data = *(__global T_IMGx4 *)dst; - int16 sum = (int16)(0); - uchar16 data; + T_SUMx4 sum = (T_SUMx4)SUM_ZEROx4; + T_IMGx4 data; - for(int i = 0; i < ANCHOR; i++) + for(int i = 0; i < FILTER_SIZE; i++) { #pragma unroll - for(int j = 0; j < ANCHOR; j++) + for(int j = 0; j < FILTER_SIZE; j++) { if(dst_rows_index < dst_rows_end) { int local_row = (lX >> THREADS_PER_ROW_BIT) + i; int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; - data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols)); - sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data)); + data = VLOAD4(0, (__local T_IMG_C1 *)(local_data + local_row * LOCAL_MEM_STEP + local_cols)); + sum = sum + (mat_kernel[i * FILTER_SIZE + j] * CONVERT_TYPE_SUMx4(data)); } } } - if(dst_rows_index < dst_rows_end) { - uchar16 sum1 = convert_uchar16_sat(sum); - sum1.s0123 = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end))? - sum1.s0123 : dst_data.s0123; - sum1.s4567 = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end))? - sum1.s4567 : dst_data.s4567; - sum1.s89ab = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end))? - sum1.s89ab : dst_data.s89ab; - sum1.scdef = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end))? - sum1.scdef : dst_data.scdef; - - *((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum1; + T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum); + tmp_dst.SX = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? + tmp_dst.SX : dst_data.SX; + tmp_dst.SY = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? + tmp_dst.SY : dst_data.SY; + tmp_dst.SZ = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? + tmp_dst.SZ : dst_data.SZ; + tmp_dst.SW = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? + tmp_dst.SW : dst_data.SW; + *(__global T_IMGx4 *)dst = tmp_dst; } } } -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////32FC4//////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// -#define ROWS_FETCH_C4 (1 + ANY + ANY) //(ROWS_PER_GROUP + anY * 2) -#define LOCAL_MEM_STEP_C4 260 //divup((get_local_size(0) + anX * 2), 4) * 4) -__kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_x, int src_offset_y, - __global float4 *dst, int dst_step, int dst_offset_x, int dst_offset_y, - __constant int *mat_kernel __attribute__((max_constant_size (16384))), - int cols,int rows, int operate_cols, int wholecols, int wholerows) -{ - int gX = get_global_id(0); - int gY = get_global_id(1); - - int lX = get_local_id(0); - - int groupX_size = get_local_size(0); - int groupX_id = get_group_id(0); - - int cols_start_index_group = src_offset_x + groupX_size * groupX_id - ANX; - int rows_start_index = src_offset_y + gY - ANY; - - __local float4 local_data[LOCAL_MEM_STEP_C4 * ROWS_FETCH_C4]; - if((gY < rows) && (gX < (operate_cols + ANX + ANX))) - { - for(int i = 0; i < ROWS_FETCH_C4; ++i) - { - if((rows_start_index - src_offset_y) + i < rows + ANY) - { -#ifdef BORDER_CONSTANT - int selected_row = rows_start_index + i; - int selected_cols = cols_start_index_group + lX; - - float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4))); - int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols; - data = con ? data : 0; - local_data[i * LOCAL_MEM_STEP + lX ] =data; - - if(lX < (ANX << 1)) - { - selected_cols = cols_start_index_group + lX + groupX_size; - - data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4))); - con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols; - data = con ? data : 0; - local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; - } -#else - int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); - selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); - - int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols); - selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols); - - float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4))); - local_data[i * LOCAL_MEM_STEP_C4 + lX] =data; - - if(lX < (ANX << 1)) - { - selected_cols = cols_start_index_group + lX + groupX_size; - selected_cols = ADDR_R(selected_cols, wholecols, selected_cols); - - data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4))); - local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data; - } -#endif - } - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - if((gY < rows) && (gX < operate_cols)) - { - int dst_cols_index = dst_offset_x + gX; - int dst_rows_index = dst_offset_y + gY; - - float4 sum = (float4)(0); - - for(int i = 0; i < ANCHOR; i++) - { - for(int j = 0; j < ANCHOR; j++) - { - int local_cols = lX + j; - sum = sum + ((float)mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]); - } - } - - *((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 4))) = sum; - } -} diff --git a/modules/ocl/test/test_filters.cpp b/modules/ocl/test/test_filters.cpp index ec46a5c..9a1264f 100644 --- a/modules/ocl/test/test_filters.cpp +++ b/modules/ocl/test/test_filters.cpp @@ -324,6 +324,35 @@ TEST_P(GaussianBlur, Mat) +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Filter2D +struct Filter2D : FilterTestBase +{ + int type; + cv::Size ksize; + int bordertype; + Point anchor; + virtual void SetUp() + { + type = GET_PARAM(0); + ksize = GET_PARAM(1); + bordertype = GET_PARAM(3); + Init(type); + anchor = Point(-1,-1); + } +}; + +TEST_P(Filter2D, Mat) +{ + cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0); + for(int j = 0; j < LOOP_TIMES; j++) + { + random_roi(); + cv::filter2D(mat1_roi, dst_roi, -1, kernel, anchor, 0.0, bordertype); + cv::ocl::filter2D(gmat1, gdst, -1, kernel, anchor, bordertype); + Near(1); + } +} INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine( Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4), Values(cv::Size(3, 3), cv::Size(5, 5), cv::Size(7, 7)), @@ -331,7 +360,7 @@ INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine( Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE, (MatType)cv::BORDER_REFLECT, (MatType)cv::BORDER_REFLECT_101))); -INSTANTIATE_TEST_CASE_P(Filters, Laplacian, Combine( +INSTANTIATE_TEST_CASE_P(Filter, Laplacian, Combine( Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(Size(3, 3)), Values(Size(0, 0)), //not use @@ -365,4 +394,10 @@ INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur, Combine( +INSTANTIATE_TEST_CASE_P(Filter, Filter2D, testing::Combine( + Values(CV_8UC1, CV_32FC1, CV_32FC4), + Values(Size(3, 3), Size(15, 15), Size(25, 25)), + Values(Size(0, 0)), //not use + Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REFLECT101, (MatType)cv::BORDER_REPLICATE, (MatType)cv::BORDER_REFLECT))); + #endif // HAVE_OPENCL -- 2.7.4