}
static void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
- Size &ksize, const Point anchor, const int borderType)
+ Size &ksize, const Point anchor, const int borderType)
{
CV_Assert(src.clCxt == dst.clCxt);
CV_Assert((src.cols == dst.cols) &&
(src.rows == dst.rows));
CV_Assert((src.oclchannels() == dst.oclchannels()));
- CV_Assert((borderType != 0));
CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1));
CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1));
+ CV_Assert(ksize.width == ksize.height);
Context *clCxt = src.clCxt;
- int cn = src.oclchannels();
- int depth = src.depth();
- string kernelName = "filter2D";
+ int filterWidth = ksize.width;
+ bool ksize_3x3 = filterWidth == 3 && src.type() != CV_32FC4; // CV_32FC4 is not tuned up with filter2d_3x3 kernel
+
+ string kernelName = ksize_3x3 ? "filter2D_3x3" : "filter2D";
size_t src_offset_x = (src.offset % src.step) / src.elemSize();
size_t src_offset_y = src.offset / src.step;
size_t dst_offset_x = (dst.offset % dst.step) / dst.elemSize();
size_t dst_offset_y = dst.offset / dst.step;
- int vector_lengths[4][7] = {{4, 4, 4, 4, 4, 4, 4},
- {4, 4, 1, 1, 1, 1, 1},
- {1, 1, 1, 1, 1, 1, 1},
- {4, 4, 4, 4, 1, 1, 4}
- };
+ int paddingPixels = (int)(filterWidth/2)*2;
+
+ size_t localThreads[3] = {ksize_3x3 ? 256 : 16, ksize_3x3 ? 1 : 16, 1};
+ size_t globalThreads[3] = {src.wholecols, src.wholerows, 1};
- int vector_length = vector_lengths[cn - 1][depth];
- int offset_cols = (dst_offset_x) & (vector_length - 1);
- int cols = dst.cols + offset_cols;
- int rows = divUp(dst.rows, vector_length);
+ int cn = src.oclchannels();
+ int src_step = (int)(src.step/src.elemSize());
+ int dst_step = (int)(dst.step/src.elemSize());
+
+ int localWidth = localThreads[0] + paddingPixels;
+ int localHeight = localThreads[1] + paddingPixels;
- size_t localThreads[3] = {256, 1, 1};
- size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
- divUp(rows, localThreads[1]) *localThreads[1], 1
- };
+ size_t localMemSize = ksize_3x3 ? 260 * 6 * src.elemSize() : (localWidth * localHeight) * src.elemSize();
+
+ int vector_lengths[4][7] = {{4, 4, 4, 4, 4, 4, 4},
+ {4, 4, 1, 1, 1, 1, 1},
+ {1, 1, 1, 1, 1, 1, 1},
+ {4, 4, 4, 4, 1, 1, 4}
+ };
+ int cols = dst.cols + ((dst_offset_x) & (vector_lengths[cn - 1][src.depth()] - 1));
vector< pair<size_t, const void *> > args;
args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
- args.push_back(make_pair(sizeof(cl_int), (void *)&src.step));
+ args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
+ args.push_back(make_pair(sizeof(cl_int), (void *)&src_step));
+ args.push_back(make_pair(sizeof(cl_int), (void *)&dst_step));
+ args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
+ args.push_back(make_pair(localMemSize, (void *)NULL));
+ args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows));
+ args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols));
args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset_x));
args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset_y));
- args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
- args.push_back(make_pair(sizeof(cl_int), (void *)&dst.step));
args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_x));
args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_y));
- args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols));
args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
- args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols));
- args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows));
-
- const int buffer_size = 100;
- char opt_buffer [buffer_size] = "";
- sprintf(opt_buffer, "-DANCHOR=%d -DANX=%d -DANY=%d", ksize.width, anchor.x, anchor.y);
-
- openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, cn, depth, opt_buffer);
+ // Map the numeric borderType to the kernel-side macro name; btype must be
+ // written on every path that falls through to the sprintf below.
+ char btype[30];
+ switch (borderType)
+ {
+ case 0:
+ sprintf(btype, "BORDER_CONSTANT");
+ break;
+ case 1:
+ sprintf(btype, "BORDER_REPLICATE");
+ break;
+ case 2:
+ sprintf(btype, "BORDER_REFLECT");
+ break;
+ case 3:
+ CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
+ return;
+ case 4:
+ sprintf(btype, "BORDER_REFLECT_101");
+ break;
+ default:
+ // Without this branch an unknown borderType left btype uninitialized
+ // and it was then read by the sprintf below (undefined behavior).
+ CV_Error(CV_StsBadArg, "unsupported border type");
+ return;
+ }
+ int type = src.depth();
+ // Build options select border handling, pixel type (IMG_C_<cn>_<depth>),
+ // channel count and compile-time filter size for the .cl kernels.
+ char build_options[150];
+ sprintf(build_options, "-D %s -D IMG_C_%d_%d -D CN=%d -D FILTER_SIZE=%d", btype, cn, type, cn, ksize.width);
+ openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
}
+
Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize,
Point anchor, int borderType)
{
CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC3 || srcType == CV_8UC4 || srcType == CV_32FC1 || srcType == CV_32FC3 || srcType == CV_32FC4) && dstType == srcType);
oclMat gpu_krnl;
- int nDivisor;
- normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, false);
+ normalizeKernel(kernel, gpu_krnl, CV_32FC1);
normalizeAnchor(anchor, ksize);
return Ptr<BaseFilter_GPU>(new LinearFilter_GPU(ksize, anchor, gpu_krnl, GPUFilter2D_callers[CV_MAT_CN(srcType)],
// Third party copyrights are property of their respective owners.
//
// @Authors
+// Pang Erping, erping@multicorewareinc.com
// Jia Haipeng, jiahaipeng95@gmail.com
+// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
-#define BORDER_REFLECT_101
+//#define BORDER_REFLECT_101
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef BORDER_REPLICATE
+
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
+#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
-//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
+//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
+#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i)-1 : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
+#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i)-1 : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
#endif
#ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
+#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
+#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
#endif
-#ifdef BORDER_WRAP
-//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
+// Pixel-type macro sets selected by the host-side build option
+// "-D IMG_C_<channels>_<depth>" (depth code 0 maps to uchar here, 5 to float).
+// T_IMG: one pixel; T_IMGx4: four pixels; T_IMG_C1: one channel;
+// CONVERT_TYPE / CONVERT_TYPEx4: conversions back to the image type
+// (saturating for the 8-bit variants).
+#ifdef IMG_C_1_0
+#define T_IMG uchar
+#define T_IMGx4 uchar4
+#define T_IMG_C1 uchar
+#define CONVERT_TYPE convert_uchar_sat
+#define CONVERT_TYPEx4 convert_uchar4_sat
+#endif
+#ifdef IMG_C_4_0
+#define T_IMG uchar4
+#define T_IMGx4 uchar16
+#define T_IMG_C1 uchar
+#define CONVERT_TYPE convert_uchar4_sat
+#define CONVERT_TYPEx4 convert_uchar16_sat
+#endif
+#ifdef IMG_C_1_5
+#define T_IMG float
+#define T_IMGx4 float4
+#define T_IMG_C1 float
+#define CONVERT_TYPE convert_float
+#define CONVERT_TYPEx4 convert_float4
+#endif
+#ifdef IMG_C_4_5
+#define T_IMG float4
+#define T_IMGx4 float16
+#define T_IMG_C1 float
+#define CONVERT_TYPE convert_float4
+#define CONVERT_TYPEx4 convert_float16
+#endif
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////Macro for define elements number per thread/////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////
-//#define ANCHOR 3
-//#define ANX 1
-//#define ANY 1
-
-#define ROWS_PER_GROUP 4
-#define ROWS_PER_GROUP_BITS 2
-#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)
-
-#define THREADS_PER_ROW 64
-#define THREADS_PER_ROW_BIT 6
+#ifndef CN
+#define CN 1
+#endif
-#define ELEMENTS_PER_THREAD 4
-#define ELEMENTS_PER_THREAD_BIT 2
+// Per-channel-count helpers: float accumulator types (T_SUM / T_SUMx4) used
+// while convolving, plus the vload and .SX/.SY/.SZ/.SW swizzle selectors the
+// 3x3 kernel uses to address the four pixels of a T_IMGx4 vector.
+#if CN == 1
+#define T_SUM float
+#define T_SUMx4 float4
+#define CONVERT_TYPE_SUM convert_float
+#define CONVERT_TYPE_SUMx4 convert_float4
+#define SUM_ZERO (0.0f)
+#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f)
+#define VLOAD4 vload4
+#define SX x
+#define SY y
+#define SZ z
+#define SW w
+#elif CN == 4
+#define T_SUM float4
+#define T_SUMx4 float16
+#define CONVERT_TYPE_SUM convert_float4
+#define CONVERT_TYPE_SUMx4 convert_float16
+#define SUM_ZERO (0.0f, 0.0f, 0.0f, 0.0f)
+#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)
+#define VLOAD4 vload16
+#define SX s0123
+#define SY s4567
+#define SZ s89ab
+#define SW scdef
+#endif
-#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4
+#ifndef FILTER_SIZE
+#define FILTER_SIZE 3
+#endif
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////
-__kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x, int src_offset_y,
- __global uchar *dst, int dst_step, int dst_offset_x, int dst_offset_y,
- __constant int *mat_kernel __attribute__((max_constant_size (16384))),
- int cols,int rows, int operate_cols, int wholecols, int wholerows)
+#define LOCAL_GROUP_SIZE 16
+
+#define LOCAL_WIDTH ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)
+#define LOCAL_HEIGHT ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)
+
+#define FILTER_RADIUS (FILTER_SIZE >> 1)
+
+__kernel void filter2D(
+ __global T_IMG *src,
+ __global T_IMG *dst,
+ int src_step,
+ int dst_step,
+ __constant float *mat_kernel,
+ __local T_IMG *local_data,
+ int wholerows,
+ int wholecols,
+ int src_offset_x,
+ int src_offset_y,
+ int dst_offset_x,
+ int dst_offset_y,
+ int cols,
+ int rows,
+ int operate_cols
+)
{
- int gX = get_global_id(0);
- int gY = get_global_id(1);
-
- int lX = get_local_id(0);
-
- int groupX_size = get_local_size(0);
- int groupX_id = get_group_id(0);
-
-#define dst_align (dst_offset_x & 3)
- int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
- int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
-
- __local uchar local_data[LOCAL_MEM_STEP * ROWS_FETCH];
- if((gY << 2) < rows)
+ int groupStartCol = get_group_id(0) * get_local_size(0);
+ int groupStartRow = get_group_id(1) * get_local_size(1);
+
+ int localCol = get_local_id(0);
+ int localRow = get_local_id(1);
+ int globalCol = groupStartCol + localCol;
+ int globalRow = groupStartRow + localRow;
+ const int src_offset = mad24(src_offset_y, src_step, src_offset_x);
+ const int dst_offset = mad24(dst_offset_y, dst_step, dst_offset_x);
+#ifdef BORDER_CONSTANT
+ for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1))
{
- for(int i = 0; i < ROWS_FETCH; ++i)
+ int curRow = groupStartRow + i;
+ for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0))
{
- if((rows_start_index - src_offset_y) + i < rows + ANY)
+ int curCol = groupStartCol + j;
+ if(curRow < FILTER_RADIUS - src_offset_y || (curRow - FILTER_RADIUS) >= wholerows - src_offset_y||
+ curCol < FILTER_RADIUS - src_offset_x || (curCol - FILTER_RADIUS) >= wholecols - src_offset_x)
{
-#ifdef BORDER_CONSTANT
- int selected_row = rows_start_index + i;
- int selected_cols = cols_start_index_group + lX;
-
- uchar data = *(src + selected_row * src_step + selected_cols);
- int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
- data = con ? data : 0;
- local_data[i * LOCAL_MEM_STEP + lX ] =data;
-
- if(lX < (ANX << 1))
- {
- selected_cols = cols_start_index_group + lX + groupX_size;
-
- data = *(src + selected_row * src_step + selected_cols);
- con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
- data = con ? data : 0;
- local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
- }
-#else
- int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
- selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
-
- int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
- selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
-
- uchar data = *(src + selected_row * src_step + selected_cols);
-
- local_data[i * LOCAL_MEM_STEP + lX ] =data;
-
- if(lX < (ANX << 1))
- {
- selected_cols = cols_start_index_group + lX + groupX_size;
- selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
-
- data = *(src + selected_row * src_step + selected_cols);
- local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
- }
-#endif
+ local_data[(i) * LOCAL_WIDTH + j] = 0;
+ }
+ else
+ {
+ local_data[(i) * LOCAL_WIDTH + j] = src[(curRow - FILTER_RADIUS) * src_step + curCol - FILTER_RADIUS + src_offset];
}
}
}
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
- if(((gY << 2) < rows) && (process_col < operate_cols))
+#else
+ for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1))
{
- int dst_cols_start = dst_offset_x;
- int dst_cols_end = dst_offset_x + cols;
- int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
-
- int dst_rows_end = dst_offset_y + rows;
- int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
+ int curRow = groupStartRow + i;
- uchar4 dst_data = *((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index));
+ curRow = ADDR_H(curRow, FILTER_RADIUS - src_offset_y, wholerows - src_offset_y);
- int4 sum = (int4)(0);
- uchar4 data;
+ curRow = ADDR_B(curRow - FILTER_RADIUS, wholerows - src_offset_y, curRow - FILTER_RADIUS);
- for(int i = 0; i < ANCHOR; i++)
+ for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0))
{
-#pragma unroll
- for(int j = 0; j < ANCHOR; j++)
+ int curCol = groupStartCol + j;
+ curCol = ADDR_L(curCol, FILTER_RADIUS - src_offset_x, wholecols - src_offset_x);
+ curCol = ADDR_R(curCol - FILTER_RADIUS, wholecols - src_offset_x, curCol - FILTER_RADIUS);
+ if(curRow < wholerows && curCol < wholecols)
{
- if(dst_rows_index < dst_rows_end)
- {
- int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
- int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
-
- data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
- sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data));
- }
+ local_data[(i) * LOCAL_WIDTH + j] = src[(curRow) * src_step + curCol + src_offset];
}
}
-
- if(dst_rows_index < dst_rows_end)
- {
- sum.x = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? sum.x : dst_data.x;
- sum.y = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? sum.y : dst_data.y;
- sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z;
- sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
- *((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum);
- }
}
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////32FC1////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////
-__kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x, int src_offset_y,
- __global float *dst, int dst_step, int dst_offset_x, int dst_offset_y,
- __constant int *mat_kernel __attribute__((max_constant_size (16384))),
- int cols,int rows, int operate_cols, int wholecols, int wholerows)
-{
- int gX = get_global_id(0);
- int gY = get_global_id(1);
-
- int lX = get_local_id(0);
-
- int groupX_size = get_local_size(0);
- int groupX_id = get_group_id(0);
-
-#define dst_align (dst_offset_x & 3)
- int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
- int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
-
- __local float local_data[LOCAL_MEM_STEP * ROWS_FETCH];
- if(((gY << 2) < rows))
+#endif
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if(globalRow < rows && globalCol < cols)
{
- for(int i = 0; i < ROWS_FETCH; ++i)
+ T_SUM sum = (T_SUM)SUM_ZERO;
+ int filterIdx = 0;
+ for(int i = 0; i < FILTER_SIZE; i++)
{
- if((rows_start_index - src_offset_y) + i < rows + ANY)
- {
-#ifdef BORDER_CONSTANT
- int selected_row = rows_start_index + i;
- int selected_cols = cols_start_index_group + lX;
-
- float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
- int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
- data = con ? data : 0;
- local_data[i * LOCAL_MEM_STEP + lX ] =data;
+ int offset = (i + localRow) * LOCAL_WIDTH;
- if(lX < (ANX << 1))
- {
- selected_cols = cols_start_index_group + lX + groupX_size;
-
- data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
- con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
- data = con ? data : 0;
- local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
- }
-#else
- int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
- selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
-
- int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
- selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
-
- float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
- local_data[i * LOCAL_MEM_STEP + lX] =data;
-
- if(lX < (ANX << 1))
- {
- selected_cols = cols_start_index_group + lX + groupX_size;
- selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
-
- data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
- local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
- }
-#endif
+ for(int j = 0; j < FILTER_SIZE; j++)
+ {
+ sum += CONVERT_TYPE_SUM(local_data[offset + j + localCol]) * mat_kernel[filterIdx++];
}
}
+ dst[(globalRow)*dst_step + (globalCol) + dst_offset] = CONVERT_TYPE(sum);
}
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
- if(((gY << 2) < rows) && (process_col < operate_cols))
- {
- int dst_cols_start = dst_offset_x;
- int dst_cols_end = dst_offset_x + cols;
- int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
-
- int dst_rows_end = dst_offset_y + rows;
- int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
+}
- float4 dst_data = *((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2)));
+/// following is specific for 3x3 kernels
- float4 sum = (float4)(0);
- float4 data;
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////Macro for define elements number per thread/////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#define ANX 1
+#define ANY 1
- for(int i = 0; i < ANCHOR; i++)
- {
-#pragma unroll
- for(int j = 0; j < ANCHOR; j++)
- {
- if(dst_rows_index < dst_rows_end)
- {
- int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
- int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
+#define ROWS_PER_GROUP 4
+#define ROWS_PER_GROUP_BITS 2
+#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)
- data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
- sum = sum + ((float)(mat_kernel[i * ANCHOR + j]) * data);
- }
- }
- }
+#define THREADS_PER_ROW 64
+#define THREADS_PER_ROW_BIT 6
- if(dst_rows_index < dst_rows_end)
- {
- sum.x = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? sum.x : dst_data.x;
- sum.y = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? sum.y : dst_data.y;
- sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z;
- sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
+#define ELEMENTS_PER_THREAD 4
+#define ELEMENTS_PER_THREAD_BIT 2
- *((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum;
- }
- }
-}
+#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4
///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
+/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
-__kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_x, int src_offset_y,
- __global uchar4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
- __constant int *mat_kernel __attribute__((max_constant_size (16384))),
- int cols,int rows, int operate_cols, int wholecols, int wholerows)
+__kernel void filter2D_3x3(
+ __global T_IMG *src,
+ __global T_IMG *dst,
+ int src_step,
+ int dst_step,
+ __constant float *mat_kernel,
+ __local T_IMG *local_data,
+ int wholerows,
+ int wholecols,
+ int src_offset_x,
+ int src_offset_y,
+ int dst_offset_x,
+ int dst_offset_y,
+ int cols,
+ int rows,
+ int operate_cols
+)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
- __local uchar4 local_data[LOCAL_MEM_STEP * ROWS_FETCH];
-
- if(((gY << 2) < rows))
+ if((gY << 2) < rows)
{
for(int i = 0; i < ROWS_FETCH; ++i)
{
int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX;
- uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
- int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
+ T_IMG data = src[mad24(selected_row, src_step, selected_cols)];
+ int con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
data = con ? data : 0;
- local_data[i * LOCAL_MEM_STEP + lX ] =data;
+ local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
- data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
- con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
+ data = src[mad24(selected_row, src_step, selected_cols)];
+ con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
data = con ? data : 0;
- local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
+ local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
- uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
+ T_IMG data = src[mad24(selected_row, src_step, selected_cols)];
- local_data[i * LOCAL_MEM_STEP + lX] =data;
+ local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
- data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
- local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
+ data = src[mad24(selected_row, src_step, selected_cols)];
+ local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
}
#endif
}
int dst_rows_end = dst_offset_y + rows;
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
+ dst = dst + mad24(dst_rows_index, dst_step, dst_cols_index);
- uchar16 dst_data;
- dst_data = *((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2)));
+ T_IMGx4 dst_data = *(__global T_IMGx4 *)dst;
- int16 sum = (int16)(0);
- uchar16 data;
+ T_SUMx4 sum = (T_SUMx4)SUM_ZEROx4;
+ T_IMGx4 data;
- for(int i = 0; i < ANCHOR; i++)
+ for(int i = 0; i < FILTER_SIZE; i++)
{
#pragma unroll
- for(int j = 0; j < ANCHOR; j++)
+ for(int j = 0; j < FILTER_SIZE; j++)
{
if(dst_rows_index < dst_rows_end)
{
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
- data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols));
- sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data));
+ data = VLOAD4(0, (__local T_IMG_C1 *)(local_data + local_row * LOCAL_MEM_STEP + local_cols));
+ sum = sum + (mat_kernel[i * FILTER_SIZE + j] * CONVERT_TYPE_SUMx4(data));
}
}
}
-
if(dst_rows_index < dst_rows_end)
{
- uchar16 sum1 = convert_uchar16_sat(sum);
- sum1.s0123 = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end))?
- sum1.s0123 : dst_data.s0123;
- sum1.s4567 = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end))?
- sum1.s4567 : dst_data.s4567;
- sum1.s89ab = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end))?
- sum1.s89ab : dst_data.s89ab;
- sum1.scdef = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end))?
- sum1.scdef : dst_data.scdef;
-
- *((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum1;
+ T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum);
+ tmp_dst.SX = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ?
+ tmp_dst.SX : dst_data.SX;
+ tmp_dst.SY = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ?
+ tmp_dst.SY : dst_data.SY;
+ tmp_dst.SZ = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ?
+ tmp_dst.SZ : dst_data.SZ;
+ tmp_dst.SW = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ?
+ tmp_dst.SW : dst_data.SW;
+ *(__global T_IMGx4 *)dst = tmp_dst;
}
}
}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////32FC4////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////
-#define ROWS_FETCH_C4 (1 + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)
-#define LOCAL_MEM_STEP_C4 260 //divup((get_local_size(0) + anX * 2), 4) * 4)
-__kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_x, int src_offset_y,
- __global float4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
- __constant int *mat_kernel __attribute__((max_constant_size (16384))),
- int cols,int rows, int operate_cols, int wholecols, int wholerows)
-{
- int gX = get_global_id(0);
- int gY = get_global_id(1);
-
- int lX = get_local_id(0);
-
- int groupX_size = get_local_size(0);
- int groupX_id = get_group_id(0);
-
- int cols_start_index_group = src_offset_x + groupX_size * groupX_id - ANX;
- int rows_start_index = src_offset_y + gY - ANY;
-
- __local float4 local_data[LOCAL_MEM_STEP_C4 * ROWS_FETCH_C4];
- if((gY < rows) && (gX < (operate_cols + ANX + ANX)))
- {
- for(int i = 0; i < ROWS_FETCH_C4; ++i)
- {
- if((rows_start_index - src_offset_y) + i < rows + ANY)
- {
-#ifdef BORDER_CONSTANT
- int selected_row = rows_start_index + i;
- int selected_cols = cols_start_index_group + lX;
-
- float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
- int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
- data = con ? data : 0;
- local_data[i * LOCAL_MEM_STEP + lX ] =data;
-
- if(lX < (ANX << 1))
- {
- selected_cols = cols_start_index_group + lX + groupX_size;
-
- data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
- con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
- data = con ? data : 0;
- local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
- }
-#else
- int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
- selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
-
- int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
- selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
-
- float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
- local_data[i * LOCAL_MEM_STEP_C4 + lX] =data;
-
- if(lX < (ANX << 1))
- {
- selected_cols = cols_start_index_group + lX + groupX_size;
- selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
-
- data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
- local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data;
- }
-#endif
- }
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if((gY < rows) && (gX < operate_cols))
- {
- int dst_cols_index = dst_offset_x + gX;
- int dst_rows_index = dst_offset_y + gY;
-
- float4 sum = (float4)(0);
-
- for(int i = 0; i < ANCHOR; i++)
- {
- for(int j = 0; j < ANCHOR; j++)
- {
- int local_cols = lX + j;
- sum = sum + ((float)mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]);
- }
- }
-
- *((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 4))) = sum;
- }
-}