From 719e8674ad78820ef8980eb0781438e36aed9a94 Mon Sep 17 00:00:00 2001 From: yao Date: Fri, 12 Apr 2013 17:38:59 +0800 Subject: [PATCH] fix the compile errors on Mac --- modules/nonfree/src/surf.ocl.cpp | 2 +- modules/ocl/src/filtering.cpp | 3 +- modules/ocl/src/opencl/arithm_flip.cl | 42 +++-- modules/ocl/src/opencl/filter_sep_row.cl | 181 +++++++++++----------- modules/ocl/src/opencl/filtering_laplacian.cl | 90 +++++------ modules/ocl/src/opencl/imgproc_integral.cl | 36 +++-- modules/ocl/src/opencl/imgproc_warpAffine.cl | 111 ++++++------- modules/ocl/src/opencl/imgproc_warpPerspective.cl | 115 +++++++------- modules/ocl/src/opencl/match_template.cl | 11 +- 9 files changed, 310 insertions(+), 281 deletions(-) diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp index 78864c6..acc188e 100644 --- a/modules/nonfree/src/surf.ocl.cpp +++ b/modules/nonfree/src/surf.ocl.cpp @@ -77,7 +77,7 @@ namespace cv size_t wave_size = 0; queryDeviceInfo(WAVEFRONT_SIZE, &wave_size); - std::sprintf(pSURF_OPTIONS, " -D WAVE_SIZE=%d", static_cast(wave_size)); + std::sprintf(pSURF_OPTIONS, "-D WAVE_SIZE=%d", static_cast(wave_size)); OPTION_INIT = true; } openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, SURF_OPTIONS); diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index 2f4a494..cc07209 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -277,8 +277,7 @@ static void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, char compile_option[128]; sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s %s", anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], - rectKernel?"-D RECTKERNEL":"", - s); + s, rectKernel?"-D RECTKERNEL":""); vector< pair > args; args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data)); args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data)); diff --git a/modules/ocl/src/opencl/arithm_flip.cl b/modules/ocl/src/opencl/arithm_flip.cl index f492524..d0e6782 100644 --- a/modules/ocl/src/opencl/arithm_flip.cl +++ b/modules/ocl/src/opencl/arithm_flip.cl @@ -44,7 +44,11 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif ////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -60,8 +64,11 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of if (x < cols && y < thread_rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src_index_0 = mad24(y, src_step, x + src_offset - dst_align); int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align); @@ -115,8 +122,11 @@ __kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_off if (x < cols && y < thread_rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src_index_0 = mad24(y, src_step, x + src_offset - dst_align); int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align); @@ -157,8 +167,11 @@ __kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_o if (x < cols && y < thread_rows) { x = x << 2; - - #define dst_align (((dst_offset >> 1) & 3) << 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset >> 1) & 3) << 1) int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align); int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align); @@ -199,8 +212,11 @@ __kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_of if (x < cols && y < thread_rows) { x = x << 2; - - #define dst_align (((dst_offset >> 1) & 3) << 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset >> 1) & 3) << 1) int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align); int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align); @@ -314,16 +330,14 @@ __kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src if (x < thread_cols && y < rows) { int src_index_0 = mad24(y, src_step, (x) + src_offset); - int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x) + dst_offset); int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset); - uchar data0 = *(src + src_index_0); - uchar data1 = *(src + src_index_1); + *(dst + dst_index_1) = data0; + int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset); + int dst_index_0 = mad24(y, dst_step, (x) + dst_offset); + uchar data1 = *(src + src_index_1); *(dst + dst_index_0) = data1; - *(dst + dst_index_1) = data0; } } __kernel void arithm_flip_cols_C1_D1 (__global char *src, int src_step, int src_offset, diff --git a/modules/ocl/src/opencl/filter_sep_row.cl b/modules/ocl/src/opencl/filter_sep_row.cl index bfe6cd4..5524041 100644 --- a/modules/ocl/src/opencl/filter_sep_row.cl +++ b/modules/ocl/src/opencl/filter_sep_row.cl @@ -96,18 +96,18 @@ The info above maybe obsolete. ***********************************************************************************/ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0 - (__global const uchar * restrict src, - __global float * dst, - const int dst_cols, - const int dst_rows, - const int src_whole_cols, - const int src_whole_rows, - const int src_step_in_pixel, - const int src_offset_x, - const int src_offset_y, - const int dst_step_in_pixel, - const int radiusy, - __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1))))) +(__global const uchar * restrict src, + __global float * dst, + const int dst_cols, + const int dst_rows, + const int src_whole_cols, + const int src_whole_rows, + const int src_step_in_pixel, + const int src_offset_x, + const int src_offset_y, + const int dst_step_in_pixel, + const int radiusy, + __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1))))) { int x = get_global_id(0)<<2; int y = get_global_id(1); @@ -122,17 +122,17 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ uchar4 temp[READ_TIMES_ROW]; __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; - #ifdef BORDER_CONSTANT +#ifdef BORDER_CONSTANT int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); //read pixels from src - for(i = 0;i 0)) ? current_addr : 0; temp[i] = *(__global uchar4*)&src[current_addr]; } //judge if read out of boundary - for(i = 0;isrc_whole_cols)| (start_y<0) | (start_y >= src_whole_rows); int4 index[READ_TIMES_ROW]; int4 addr; @@ -148,7 +148,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ if(not_all_in_range) { //judge if read out of boundary - for(i = 0;i 0)) ? current_addr : 0; temp[i] = src[current_addr]; } //judge if read out of boundary - for(i = 0;i 0)) ? current_addr : 0; temp[i] = src[current_addr]; } //judge if read out of boundary - for(i = 0;i 0)) ? current_addr : 0; temp[i] = src[current_addr]; } //judge if read out of boundary - for(i = 0;i> THREADS_PER_ROW_BIT) + i; - int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; + int local_row = (lX >> THREADS_PER_ROW_BIT) + i; + int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; - data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); - sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data)); - } + data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); + sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data)); + } } } @@ -207,7 +207,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w; *((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum); } - } + } } /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////32FC1//////////////////////////////////////////////////////// @@ -225,7 +225,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x int groupX_size = get_local_size(0); int groupX_id = get_group_id(0); - #define dst_align (dst_offset_x & 3) +#define dst_align (dst_offset_x & 3) int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY; @@ -236,7 +236,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x { if((rows_start_index - src_offset_y) + i < rows + ANY) { - #ifdef BORDER_CONSTANT +#ifdef BORDER_CONSTANT int selected_row = rows_start_index + i; int selected_cols = cols_start_index_group + lX; @@ -254,7 +254,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x data = con ? data : 0; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; } - #else +#else int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); @@ -272,7 +272,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2))); local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; } - #endif +#endif } } } @@ -295,17 +295,17 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x for(int i = 0; i < ANCHOR; i++) { - #pragma unroll 3 - for(int j = 0; j < ANCHOR; j++) - { +#pragma unroll 3 + for(int j = 0; j < ANCHOR; j++) + { if(dst_rows_index < dst_rows_end) { - int local_row = (lX >> THREADS_PER_ROW_BIT) + i; - int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; + int local_row = (lX >> THREADS_PER_ROW_BIT) + i; + int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; - data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); - sum = sum + (mat_kernel[i * ANCHOR + j] * data); - } + data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); + sum = sum + ((float)(mat_kernel[i * ANCHOR + j]) * data); + } } } @@ -318,7 +318,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x *((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum; } - } + } } /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -337,7 +337,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ int groupX_size = get_local_size(0); int groupX_id = get_group_id(0); - #define dst_align (dst_offset_x & 3) +#define dst_align (dst_offset_x & 3) int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY; @@ -349,7 +349,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ { if((rows_start_index - src_offset_y) + i < rows + ANY) { - #ifdef BORDER_CONSTANT +#ifdef BORDER_CONSTANT int selected_row = rows_start_index + i; int selected_cols = cols_start_index_group + lX; @@ -367,7 +367,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ data = con ? data : 0; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; } - #else +#else int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); @@ -386,7 +386,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2))); local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; } - #endif +#endif } } } @@ -410,17 +410,17 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ for(int i = 0; i < ANCHOR; i++) { - #pragma unroll 3 - for(int j = 0; j < ANCHOR; j++) - { +#pragma unroll 3 + for(int j = 0; j < ANCHOR; j++) + { if(dst_rows_index < dst_rows_end) { - int local_row = (lX >> THREADS_PER_ROW_BIT) + i; - int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; + int local_row = (lX >> THREADS_PER_ROW_BIT) + i; + int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; - data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols)); - sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data)); - } + data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols)); + sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data)); + } } } @@ -468,7 +468,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_ { if((rows_start_index - src_offset_y) + i < rows + ANY) { - #ifdef BORDER_CONSTANT +#ifdef BORDER_CONSTANT int selected_row = rows_start_index + i; int selected_cols = cols_start_index_group + lX; @@ -486,7 +486,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_ data = con ? data : 0; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; } - #else +#else int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); @@ -504,7 +504,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_ data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4))); local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data; } - #endif +#endif } } } @@ -519,10 +519,10 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_ for(int i = 0; i < ANCHOR; i++) { - for(int j = 0; j < ANCHOR; j++) - { - int local_cols = lX + j; - sum = sum + mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]; + for(int j = 0; j < ANCHOR; j++) + { + int local_cols = lX + j; + sum = sum + ((float)mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]); } } diff --git a/modules/ocl/src/opencl/imgproc_integral.cl b/modules/ocl/src/opencl/imgproc_integral.cl index 80f460b..c546957 100644 --- a/modules/ocl/src/opencl/imgproc_integral.cl +++ b/modules/ocl/src/opencl/imgproc_integral.cl @@ -44,7 +44,11 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif #define LSIZE 256 #define LSIZE_1 255 @@ -71,13 +75,13 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float gid = gid << 1; for(int i = 0; i < rows; i =i + LSIZE_1) { - src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0); - src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0); + src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0); + src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0); sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); - sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); + sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); - sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); + sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); barrier(CLK_LOCAL_MEM_FENCE); int bf_loc = lid + GET_CONFLICT_OFFSET(lid); @@ -127,7 +131,8 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float } barrier(CLK_LOCAL_MEM_FENCE); int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; - if(lid > 0 && (i+lid) <= rows){ + if(lid > 0 && (i+lid) <= rows) + { lm_sum[0][bf_loc] += sum_t[0]; lm_sum[1][bf_loc] += sum_t[1]; lm_sqsum[0][bf_loc] += sqsum_t[0]; @@ -169,15 +174,15 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo src_step = src_step >> 4; for(int i = 0; i < rows; i =i + LSIZE_1) { - src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0; - sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : 0; - src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0; - sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : 0; + src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (int4)0; + sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0; + src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (int4)0; + sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); - sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); + sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); - sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); + sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); barrier(CLK_LOCAL_MEM_FENCE); int bf_loc = lid + GET_CONFLICT_OFFSET(lid); @@ -228,14 +233,14 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo barrier(CLK_LOCAL_MEM_FENCE); if(gid == 0 && (i + lid) <= rows) { - sum[sum_offset + i + lid] = 0; - sqsum[sqsum_offset + i + lid] = 0; + sum[sum_offset + i + lid] = 0; + sqsum[sqsum_offset + i + lid] = 0; } if(i + lid == 0) { int loc0 = gid * 2 * sum_step; int loc1 = gid * 2 * sqsum_step; - for(int k = 1;k <= 8;k++) + for(int k = 1; k <= 8; k++) { if(gid * 8 + k > cols) break; sum[sum_offset + loc0 + k * sum_step / 4] = 0; @@ -244,7 +249,8 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo } int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ; - if(lid > 0 && (i+lid) <= rows){ + if(lid > 0 && (i+lid) <= rows) + { lm_sum[0][bf_loc] += sum_t[0]; lm_sum[1][bf_loc] += sum_t[1]; lm_sqsum[0][bf_loc] += sqsum_t[0]; diff --git a/modules/ocl/src/opencl/imgproc_warpAffine.cl b/modules/ocl/src/opencl/imgproc_warpAffine.cl index 8aee183..6eee8d3 100644 --- a/modules/ocl/src/opencl/imgproc_warpAffine.cl +++ b/modules/ocl/src/opencl/imgproc_warpAffine.cl @@ -47,8 +47,12 @@ //warpAffine kernel //support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic. -#if defined DOUBLE_SUPPORT +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif typedef double F; typedef double4 F4; #define convert_F4 convert_double4 @@ -58,7 +62,6 @@ typedef float4 F4; #define convert_F4 convert_float4 #endif - #define INTER_BITS 5 #define INTER_TAB_SIZE (1 << INTER_BITS) #define INTER_SCALE 1.f/INTER_TAB_SIZE @@ -81,8 +84,8 @@ inline void interpolateCubic( float x, float* coeffs ) /**********************************************8UC1********************************************* ***********************************************************************************************/ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -123,14 +126,14 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u sval.s1 = scon.s1 ? src[spos.s1] : 0; sval.s2 = scon.s2 ? src[spos.s2] : 0; sval.s3 = scon.s3 ? src[spos.s3] : 0; - dval = convert_uchar4(dcon != 0) ? sval : dval; + dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval; *d = dval; } } __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -180,7 +183,7 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob spos1 = src_offset + sy * srcStep + sx + 1; spos2 = src_offset + (sy+1) * srcStep + sx; spos3 = src_offset + (sy+1) * srcStep + sx + 1; - + v0.s0 = scon0.s0 ? src[spos0.s0] : 0; v1.s0 = scon1.s0 ? src[spos1.s0] : 0; v2.s0 = scon2.s0 ? src[spos2.s0] : 0; @@ -200,22 +203,22 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob v1.s3 = scon1.s3 ? src[spos1.s3] : 0; v2.s3 = scon2.s3 ? src[spos2.s3] : 0; v3.s3 = scon3.s3 ? src[spos3.s3] : 0; - + short4 itab0, itab1, itab2, itab3; float4 taby, tabx; taby = INTER_SCALE * convert_float4(ay); tabx = INTER_SCALE * convert_float4(ax); - itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); - itab1 = convert_short4_sat(( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE )); - itab2 = convert_short4_sat(( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); - itab3 = convert_short4_sat(( taby*tabx * INTER_REMAP_COEF_SCALE )); + itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE )); + itab1 = convert_short4_sat(( (1.0f-taby)*tabx * (float4)INTER_REMAP_COEF_SCALE )); + itab2 = convert_short4_sat(( taby*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE )); + itab3 = convert_short4_sat(( taby*tabx * (float4)INTER_REMAP_COEF_SCALE )); int4 val; uchar4 tval; val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1) - + convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3); + + convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3); tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; __global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx); @@ -228,8 +231,8 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob } __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -255,10 +258,10 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, #pragma unroll 4 for(i=0; i<4; i++) - for(j=0; j<4; j++) - { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; - } + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; + } short itab[16]; float tab1y[4], tab1x[4]; @@ -288,7 +291,7 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) mk1 = k1, mk2 = k2; else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; + Mk1 = k1, Mk2 = k2; } diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); } @@ -309,8 +312,8 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, ***********************************************************************************************/ __kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -333,8 +336,8 @@ __kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global } __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -386,8 +389,8 @@ __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __glo } __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -418,10 +421,10 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob int i,j; #pragma unroll 4 for(i=0; i<4; i++) - for(j=0; j<4; j++) - { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; - } + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; + } int itab[16]; float tab1y[4], tab1x[4]; float axx, ayy; @@ -447,14 +450,14 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob int diff = isum - INTER_REMAP_COEF_SCALE; int Mk1=2, Mk2=2, mk1=2, mk2=2; - for( k1 = 2; k1 < 4; k1++ ) + for( k1 = 2; k1 < 4; k1++ ) for( k2 = 2; k2 < 4; k2++ ) { if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) mk1 = k1, mk2 = k2; else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; + Mk1 = k1, Mk2 = k2; } diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); @@ -477,8 +480,8 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob ***********************************************************************************************/ __kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -501,8 +504,8 @@ __kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int } __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -548,12 +551,12 @@ __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) dst[(dst_offset>>2)+dy*dstStep+dx] = sum; - } + } } __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -617,8 +620,8 @@ __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, ***********************************************************************************************/ __kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -636,13 +639,13 @@ __kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, i short sy0 = (short)(Y0 >> AB_BITS); if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0=0 && sy0>4)+sy0*(srcStep>>2)+sx0] : 0; + dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0=0 && sy0>4)+sy0*(srcStep>>2)+sx0] : (float4)0; } } __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -670,10 +673,10 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds float4 v0, v1, v2, v3; - v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0; - v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0; - v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0; - v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0; + v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0; + v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0; + v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0; + v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0; float tab[4]; float taby[2], tabx[2]; @@ -691,12 +694,12 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) dst[dst_offset+dy*dstStep+dx] = sum; - } + } } __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -726,7 +729,7 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst int i; for(i=0; i<16; i++) - v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; + v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0; float tab[16]; float tab1y[4], tab1x[4]; @@ -754,5 +757,5 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst dst[dst_offset+dy*dstStep+dx] = sum; } - } + } } diff --git a/modules/ocl/src/opencl/imgproc_warpPerspective.cl b/modules/ocl/src/opencl/imgproc_warpPerspective.cl index a37ffa1..edbe42c 100644 --- a/modules/ocl/src/opencl/imgproc_warpPerspective.cl +++ b/modules/ocl/src/opencl/imgproc_warpPerspective.cl @@ -47,8 +47,12 @@ //wrapPerspective kernel //support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic. -#if defined DOUBLE_SUPPORT +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif typedef double F; typedef double4 F4; #define convert_F4 convert_double4 @@ -81,8 +85,8 @@ inline void interpolateCubic( float x, float* coeffs ) /**********************************************8UC1********************************************* ***********************************************************************************************/ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -112,14 +116,14 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo sval.s1 = scon.s1 ? src[spos.s1] : 0; sval.s2 = scon.s2 ? src[spos.s2] : 0; sval.s3 = scon.s3 ? src[spos.s3] : 0; - dval = convert_uchar4(dcon != 0) ? sval : dval; + dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval; *d = dval; } } __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -142,7 +146,7 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _ int i; #pragma unroll 4 for(i=0; i<4; i++) - v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : 0; + v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : (uchar)0; short itab[4]; float tab1y[2], tab1x[2]; @@ -170,8 +174,8 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _ } __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -190,15 +194,15 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * short ay = (short)(Y & (INTER_TAB_SIZE-1)); short ax = (short)(X & (INTER_TAB_SIZE-1)); - uchar v[16]; + uchar v[16]; int i, j; #pragma unroll 4 for(i=0; i<4; i++) - for(j=0; j<4; j++) - { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; - } + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : (uchar)0; + } short itab[16]; float tab1y[4], tab1x[4]; @@ -227,7 +231,7 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) mk1 = k1, mk2 = k2; else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; + Mk1 = k1, Mk2 = k2; } diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); } @@ -249,8 +253,8 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * ***********************************************************************************************/ __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -273,8 +277,8 @@ __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __gl } __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -299,10 +303,10 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, int4 v0, v1, v2, v3; - v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : 0; - v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : 0; - v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : 0; - v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : 0; + v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : (int4)0; + v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : (int4)0; + v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : (int4)0; + v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : (int4)0; int itab0, itab1, itab2, itab3; float taby, tabx; @@ -323,8 +327,8 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, } __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -352,10 +356,10 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _ int i,j; #pragma unroll 4 for(i=0; i<4; i++) - for(j=0; j<4; j++) - { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; - } + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; + } int itab[16]; float tab1y[4], tab1x[4]; float axx, ayy; @@ -381,14 +385,14 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _ int diff = isum - INTER_REMAP_COEF_SCALE; int Mk1=2, Mk2=2, mk1=2, mk2=2; - for( k1 = 2; k1 < 4; k1++ ) + for( k1 = 2; k1 < 4; k1++ ) for( k2 = 2; k2 < 4; k2++ ) { if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) mk1 = k1, mk2 = k2; else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; + Mk1 = k1, Mk2 = k2; } diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); @@ -411,8 +415,8 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _ ***********************************************************************************************/ __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -434,8 +438,8 @@ __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst } __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -458,10 +462,10 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * float v0, v1, v2, v3; - v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : 0; - v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : 0; - v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : 0; - v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : 0; + v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : (float)0; + v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : (float)0; + v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : (float)0; + v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : (float)0; float tab[4]; float taby[2], tabx[2]; @@ -483,8 +487,8 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * } __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -510,7 +514,7 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * int i; for(i=0; i<16; i++) - v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; + v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float)0; float tab[16]; float tab1y[4], tab1x[4]; @@ -546,8 +550,8 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * ***********************************************************************************************/ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -564,13 +568,13 @@ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * d short sy = (short)Y; if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx=0 && sy>4)+sy*(srcStep>>2)+sx] : 0; + dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx=0 && sy>4)+sy*(srcStep>>2)+sx] : (float)0; } } __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -597,10 +601,10 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 float4 v0, v1, v2, v3; - v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0; - v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0; - v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0; - v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0; + v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0; + v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0; + v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0; + v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0; float tab[4]; float taby[2], tabx[2]; @@ -622,8 +626,8 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 } __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -652,7 +656,7 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 int i; for(i=0; i<16; i++) - v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; + v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0; float tab[16]; float tab1y[4], tab1x[4]; @@ -680,5 +684,6 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 dst[dst_offset+dy*dstStep+dx] = sum; } - } + } } + diff --git a/modules/ocl/src/opencl/match_template.cl b/modules/ocl/src/opencl/match_template.cl index 857f891..0dd3e69 100644 --- a/modules/ocl/src/opencl/match_template.cl +++ b/modules/ocl/src/opencl/match_template.cl @@ -447,10 +447,10 @@ void matchTemplate_Naive_CCORR_C1_D0 __global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); for(j = 0; j < tpl_cols; j ++) { - sum = mad24(img_ptr[j], tpl_ptr[j], sum); + sum = mad24(convert_int(img_ptr[j]), convert_int(tpl_ptr[j]), sum); } } - res[res_idx] = sum; + res[res_idx] = (float)sum; } } @@ -548,7 +548,7 @@ void matchTemplate_Naive_CCORR_C4_D0 sum = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum); } } - res[res_idx] = sum.x + sum.y + sum.z + sum.w; + res[res_idx] = (float)(sum.x + sum.y + sum.z + sum.w); } } @@ -633,9 +633,8 @@ void matchTemplate_Prepared_CCOFF_C1_D0 if(gidx < res_cols && gidy < res_rows) { - float sum = (float)( - (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) - - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); + float sum = (float)((img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) + -(img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); res[res_idx] -= sum * tpl_sum; } } -- 2.7.4