From: yao Date: Thu, 23 May 2013 02:55:08 +0000 (+0800) Subject: fix pyrLK's mismatch on Intel GPUs X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~1314^2~1272^2~4 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=a223b5624f1e535fbcf9754c7a4e2ad8ce4f1432;p=platform%2Fupstream%2Fopencv.git fix pyrLK's mismatch on Intel GPUs --- diff --git a/modules/ocl/src/opencl/pyrlk.cl b/modules/ocl/src/opencl/pyrlk.cl index 1043b84..40a1993 100644 --- a/modules/ocl/src/opencl/pyrlk.cl +++ b/modules/ocl/src/opencl/pyrlk.cl @@ -46,145 +46,10 @@ //#pragma OPENCL EXTENSION cl_amd_printf : enable -__kernel void calcSharrDeriv_vertical_C1_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - - if (y < rows && x < cols * cn) - { - const uchar src_val0 = (src + (y > 0 ? y-1 : rows > 1 ? 1 : 0) * srcStep)[x]; - const uchar src_val1 = (src + y * srcStep)[x]; - const uchar src_val2 = (src + (y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0) * srcStep)[x]; - - ((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10; - ((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0; - } -} - -__kernel void calcSharrDeriv_vertical_C4_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - - if (y < rows && x < cols * cn) - { - const uchar src_val0 = (src + (y > 0 ? y - 1 : 1) * srcStep)[x]; - const uchar src_val1 = (src + y * srcStep)[x]; - const uchar src_val2 = (src + (y < rows - 1 ? y + 1 : rows - 2) * srcStep)[x]; - - ((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10; - ((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0; - } -} - -__kernel void calcSharrDeriv_horizontal_C1_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - - const int colsn = cols * cn; - - if (y < rows && x < colsn) - { - __global const short* dx_buf_row = dx_buf + y * dx_bufStep; - __global const short* dy_buf_row = dy_buf + y * dy_bufStep; - - const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn; - const int xl = x - cn >= 0 ? x - cn : cn + x; - - ((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl]; - ((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10; - } -} - -__kernel void calcSharrDeriv_horizontal_C4_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - - const int colsn = cols * cn; - - if (y < rows && x < colsn) - { - __global const short* dx_buf_row = dx_buf + y * dx_bufStep; - __global const short* dy_buf_row = dy_buf + y * dy_bufStep; - - const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn; - const int xl = x - cn >= 0 ? x - cn : cn + x; - - ((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl]; - ((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10; - } -} - -#define W_BITS 14 -#define W_BITS1 14 - -#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n)) - -int linearFilter_uchar(__global const uchar* src, int srcStep, int cn, float2 pt, int x, int y) -{ - int2 ipt; - ipt.x = convert_int_sat_rtn(pt.x); - ipt.y = convert_int_sat_rtn(pt.y); - - float a = pt.x - ipt.x; - float b = pt.y - ipt.y; - - int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS)); - int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS)); - int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - __global const uchar* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn; - __global const uchar* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn; - - return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1 - 5); -} - -int linearFilter_short(__global const short* src, int srcStep, int cn, float2 pt, int x, int y) -{ - int2 ipt; - ipt.x = convert_int_sat_rtn(pt.x); - ipt.y = convert_int_sat_rtn(pt.y); - - float a = pt.x - ipt.x; - float b = pt.y - ipt.y; - - int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS)); - int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS)); - int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - __global const short* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn; - __global const short* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn; - - return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1); -} - -float linearFilter_float(__global const float* src, int srcStep, int cn, float2 pt, float x, float y) -{ - int2 ipt; - ipt.x = convert_int_sat_rtn(pt.x); - ipt.y = convert_int_sat_rtn(pt.y); - - float a = pt.x - ipt.x; - float b = pt.y - ipt.y; - - float iw00 = ((1.0f - a) * (1.0f - b) * (1 << W_BITS)); - float iw01 = (a * (1.0f - b) * (1 << W_BITS)); - float iw10 = ((1.0f - a) * b * (1 << W_BITS)); - float iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - - __global const float* src_row = src + (int)(ipt.y + y) * srcStep / 4 + ipt.x * cn; - __global const float* src_row1 = src + (int)(ipt.y + y + 1) * srcStep / 4 + ipt.x * cn; - - return src_row[(int)x] * iw00 + src_row[(int)x + cn] * iw01 + src_row1[(int)x] * iw10 + src_row1[(int)x + cn] * iw11, W_BITS1 - 5; -} - #define BUFFER 64 - +#ifndef WAVE_SIZE +#define WAVE_SIZE 1 +#endif #ifdef CPU void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) { @@ -193,71 +58,51 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local smem3[tid] = val3; barrier(CLK_LOCAL_MEM_FENCE); -#if BUFFER > 128 - if (tid < 128) - { - smem1[tid] = val1 += smem1[tid + 128]; - smem2[tid] = val2 += smem2[tid + 128]; - smem3[tid] = val3 += smem3[tid + 128]; - } - barrier(CLK_LOCAL_MEM_FENCE); -#endif - -#if BUFFER > 64 - if (tid < 64) - { - smem1[tid] = val1 += smem1[tid + 64]; - smem2[tid] = val2 += smem2[tid + 64]; - smem3[tid] = val3 += smem3[tid + 64]; - } - barrier(CLK_LOCAL_MEM_FENCE); -#endif - if (tid < 32) { - smem1[tid] = val1 += smem1[tid + 32]; - smem2[tid] = val2 += smem2[tid + 32]; - smem3[tid] = val3 += smem3[tid + 32]; + smem1[tid] += smem1[tid + 32]; + smem2[tid] += smem2[tid + 32]; + smem3[tid] += smem3[tid + 32]; } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 16) { - smem1[tid] = val1 += smem1[tid + 16]; - smem2[tid] = val2 += smem2[tid + 16]; - smem3[tid] = val3 += smem3[tid + 16]; + smem1[tid] += smem1[tid + 16]; + smem2[tid] += smem2[tid + 16]; + smem3[tid] += smem3[tid + 16]; } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 8) { - smem1[tid] = val1 += smem1[tid + 8]; - smem2[tid] = val2 += smem2[tid + 8]; - smem3[tid] = val3 += smem3[tid + 8]; + smem1[tid] += smem1[tid + 8]; + smem2[tid] += smem2[tid + 8]; + smem3[tid] += smem3[tid + 8]; } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 4) { - smem1[tid] = val1 += smem1[tid + 4]; - smem2[tid] = val2 += smem2[tid + 4]; - smem3[tid] = val3 += smem3[tid + 4]; + smem1[tid] += smem1[tid + 4]; + smem2[tid] += smem2[tid + 4]; + smem3[tid] += smem3[tid + 4]; } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 2) { - smem1[tid] = val1 += smem1[tid + 2]; - smem2[tid] = val2 += smem2[tid + 2]; - smem3[tid] = val3 += smem3[tid + 2]; + smem1[tid] += smem1[tid + 2]; + smem2[tid] += smem2[tid + 2]; + smem3[tid] += smem3[tid + 2]; } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 1) { - smem1[BUFFER] = val1 += smem1[tid + 1]; - smem2[BUFFER] = val2 += smem2[tid + 1]; - smem3[BUFFER] = val3 += smem3[tid + 1]; + smem1[BUFFER] = smem1[tid] + smem1[tid + 1]; + smem2[BUFFER] = smem2[tid] + smem2[tid + 1]; + smem3[BUFFER] = smem3[tid] + smem3[tid + 1]; } barrier(CLK_LOCAL_MEM_FENCE); } @@ -268,63 +113,45 @@ void reduce2(float val1, float val2, volatile __local float* smem1, volatile __l smem2[tid] = val2; barrier(CLK_LOCAL_MEM_FENCE); -#if BUFFER > 128 - if (tid < 128) - { - smem1[tid] = (val1 += smem1[tid + 128]); - smem2[tid] = (val2 += smem2[tid + 128]); - } - barrier(CLK_LOCAL_MEM_FENCE); -#endif - -#if BUFFER > 64 - if (tid < 64) - { - smem1[tid] = (val1 += smem1[tid + 64]); - smem2[tid] = (val2 += smem2[tid + 64]); - } - barrier(CLK_LOCAL_MEM_FENCE); -#endif - if (tid < 32) { - smem1[tid] = (val1 += smem1[tid + 32]); - smem2[tid] = (val2 += smem2[tid + 32]); + smem1[tid] += smem1[tid + 32]; + smem2[tid] += smem2[tid + 32]; } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 16) { - smem1[tid] = (val1 += smem1[tid + 16]); - smem2[tid] = (val2 += smem2[tid + 16]); + smem1[tid] += smem1[tid + 16]; + smem2[tid] += smem2[tid + 16]; } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 8) { - smem1[tid] = (val1 += smem1[tid + 8]); - smem2[tid] = (val2 += smem2[tid + 8]); + smem1[tid] += smem1[tid + 8]; + smem2[tid] += smem2[tid + 8]; } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 4) { - smem1[tid] = (val1 += smem1[tid + 4]); - smem2[tid] = (val2 += smem2[tid + 4]); + smem1[tid] += smem1[tid + 4]; + smem2[tid] += smem2[tid + 4]; } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 2) { - smem1[tid] = (val1 += smem1[tid + 2]); - smem2[tid] = (val2 += smem2[tid + 2]); + smem1[tid] += smem1[tid + 2]; + smem2[tid] += smem2[tid + 2]; } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 1) { - smem1[BUFFER] = (val1 += smem1[tid + 1]); - smem2[BUFFER] = (val2 += smem2[tid + 1]); + smem1[BUFFER] = smem1[tid] + smem1[tid + 1]; + smem2[BUFFER] = smem2[tid] + smem2[tid + 1]; } barrier(CLK_LOCAL_MEM_FENCE); } @@ -334,205 +161,146 @@ void reduce1(float val1, volatile __local float* smem1, int tid) smem1[tid] = val1; barrier(CLK_LOCAL_MEM_FENCE); -#if BUFFER > 128 - if (tid < 128) - { - smem1[tid] = (val1 += smem1[tid + 128]); - } - barrier(CLK_LOCAL_MEM_FENCE); -#endif - -#if BUFFER > 64 - if (tid < 64) - { - smem1[tid] = (val1 += smem1[tid + 64]); - } - barrier(CLK_LOCAL_MEM_FENCE); -#endif - if (tid < 32) { - smem1[tid] = (val1 += smem1[tid + 32]); + smem1[tid] += smem1[tid + 32]; } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 16) { - smem1[tid] = (val1 += smem1[tid + 16]); + smem1[tid] += smem1[tid + 16]; } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 8) { - smem1[tid] = (val1 += smem1[tid + 8]); + smem1[tid] += smem1[tid + 8]; } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 4) { - smem1[tid] = (val1 += smem1[tid + 4]); + smem1[tid] += smem1[tid + 4]; } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 2) { - smem1[tid] = (val1 += smem1[tid + 2]); + smem1[tid] += smem1[tid + 2]; } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 1) { - smem1[BUFFER] = (val1 += smem1[tid + 1]); + smem1[BUFFER] = smem1[tid] + smem1[tid + 1]; } barrier(CLK_LOCAL_MEM_FENCE); } #else -void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) +void reduce3(float val1, float val2, float val3, +__local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid) { smem1[tid] = val1; smem2[tid] = val2; smem3[tid] = val3; barrier(CLK_LOCAL_MEM_FENCE); -#if BUFFER > 128 - if (tid < 128) + if (tid < 32) { - smem1[tid] = val1 += smem1[tid + 128]; - smem2[tid] = val2 += smem2[tid + 128]; - smem3[tid] = val3 += smem3[tid + 128]; - } - barrier(CLK_LOCAL_MEM_FENCE); + smem1[tid] += smem1[tid + 32]; + smem2[tid] += smem2[tid + 32]; + smem3[tid] += smem3[tid + 32]; +#if WAVE_SIZE < 32 + } barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) { #endif - -#if BUFFER > 64 - if (tid < 64) - { - smem1[tid] = val1 += smem1[tid + 64]; - smem2[tid] = val2 += smem2[tid + 64]; - smem3[tid] = val3 += smem3[tid + 64]; - } - barrier(CLK_LOCAL_MEM_FENCE); + smem1[tid] += smem1[tid + 16]; + smem2[tid] += smem2[tid + 16]; + smem3[tid] += smem3[tid + 16]; +#if WAVE_SIZE <16 + } barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) { #endif + smem1[tid] += smem1[tid + 8]; + smem2[tid] += smem2[tid + 8]; + smem3[tid] += smem3[tid + 8]; - if (tid < 32) - { - volatile __local float* vmem1 = smem1; - volatile __local float* vmem2 = smem2; - volatile __local float* vmem3 = smem3; - - vmem1[tid] = val1 += vmem1[tid + 32]; - vmem2[tid] = val2 += vmem2[tid + 32]; - vmem3[tid] = val3 += vmem3[tid + 32]; - - vmem1[tid] = val1 += vmem1[tid + 16]; - vmem2[tid] = val2 += vmem2[tid + 16]; - vmem3[tid] = val3 += vmem3[tid + 16]; - - vmem1[tid] = val1 += vmem1[tid + 8]; - vmem2[tid] = val2 += vmem2[tid + 8]; - vmem3[tid] = val3 += vmem3[tid + 8]; + smem1[tid] += smem1[tid + 4]; + smem2[tid] += smem2[tid + 4]; + smem3[tid] += smem3[tid + 4]; - vmem1[tid] = val1 += vmem1[tid + 4]; - vmem2[tid] = val2 += vmem2[tid + 4]; - vmem3[tid] = val3 += vmem3[tid + 4]; + smem1[tid] += smem1[tid + 2]; + smem2[tid] += smem2[tid + 2]; + smem3[tid] += smem3[tid + 2]; - vmem1[tid] = val1 += vmem1[tid + 2]; - vmem2[tid] = val2 += vmem2[tid + 2]; - vmem3[tid] = val3 += vmem3[tid + 2]; - - vmem1[tid] = val1 += vmem1[tid + 1]; - vmem2[tid] = val2 += vmem2[tid + 1]; - vmem3[tid] = val3 += vmem3[tid + 1]; + smem1[tid] += smem1[tid + 1]; + smem2[tid] += smem2[tid + 1]; + smem3[tid] += smem3[tid + 1]; } } -void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, int tid) +void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid) { smem1[tid] = val1; smem2[tid] = val2; barrier(CLK_LOCAL_MEM_FENCE); -#if BUFFER > 128 - if (tid < 128) + if (tid < 32) { - smem1[tid] = val1 += smem1[tid + 128]; - smem2[tid] = val2 += smem2[tid + 128]; - } - barrier(CLK_LOCAL_MEM_FENCE); + smem1[tid] += smem1[tid + 32]; + smem2[tid] += smem2[tid + 32]; +#if WAVE_SIZE < 32 + } barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) { #endif - -#if BUFFER > 64 - if (tid < 64) - { - smem1[tid] = val1 += smem1[tid + 64]; - smem2[tid] = val2 += smem2[tid + 64]; - } - barrier(CLK_LOCAL_MEM_FENCE); + smem1[tid] += smem1[tid + 16]; + smem2[tid] += smem2[tid + 16]; +#if WAVE_SIZE <16 + } barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) { #endif + smem1[tid] += smem1[tid + 8]; + smem2[tid] += smem2[tid + 8]; - if (tid < 32) - { - volatile __local float* vmem1 = smem1; - volatile __local float* vmem2 = smem2; - - vmem1[tid] = val1 += vmem1[tid + 32]; - vmem2[tid] = val2 += vmem2[tid + 32]; - - vmem1[tid] = val1 += vmem1[tid + 16]; - vmem2[tid] = val2 += vmem2[tid + 16]; + smem1[tid] += smem1[tid + 4]; + smem2[tid] += smem2[tid + 4]; - vmem1[tid] = val1 += vmem1[tid + 8]; - vmem2[tid] = val2 += vmem2[tid + 8]; + smem1[tid] += smem1[tid + 2]; + smem2[tid] += smem2[tid + 2]; - vmem1[tid] = val1 += vmem1[tid + 4]; - vmem2[tid] = val2 += vmem2[tid + 4]; - - vmem1[tid] = val1 += vmem1[tid + 2]; - vmem2[tid] = val2 += vmem2[tid + 2]; - - vmem1[tid] = val1 += vmem1[tid + 1]; - vmem2[tid] = val2 += vmem2[tid + 1]; + smem1[tid] += smem1[tid + 1]; + smem2[tid] += smem2[tid + 1]; } } -void reduce1(float val1, __local float* smem1, int tid) +void reduce1(float val1, __local volatile float* smem1, int tid) { smem1[tid] = val1; barrier(CLK_LOCAL_MEM_FENCE); -#if BUFFER > 128 - if (tid < 128) + if (tid < 32) { - smem1[tid] = val1 += smem1[tid + 128]; - } - barrier(CLK_LOCAL_MEM_FENCE); + smem1[tid] += smem1[tid + 32]; +#if WAVE_SIZE < 32 + } barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) { #endif - -#if BUFFER > 64 - if (tid < 64) - { - smem1[tid] = val1 += smem1[tid + 64]; - } - barrier(CLK_LOCAL_MEM_FENCE); + smem1[tid] += smem1[tid + 16]; +#if WAVE_SIZE <16 + } barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) { #endif - - if (tid < 32) - { - volatile __local float* vmem1 = smem1; - - vmem1[tid] = val1 += vmem1[tid + 32]; - vmem1[tid] = val1 += vmem1[tid + 16]; - vmem1[tid] = val1 += vmem1[tid + 8]; - vmem1[tid] = val1 += vmem1[tid + 4]; - vmem1[tid] = val1 += vmem1[tid + 2]; - vmem1[tid] = val1 += vmem1[tid + 1]; + smem1[tid] += smem1[tid + 8]; + smem1[tid] += smem1[tid + 4]; + smem1[tid] += smem1[tid + 2]; + smem1[tid] += smem1[tid + 1]; } } #endif #define SCALE (1.0f / (1 << 20)) #define THRESHOLD 0.01f -#define DIMENSION 21 // Image read mode __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; diff --git a/modules/ocl/src/pyrlk.cpp b/modules/ocl/src/pyrlk.cpp index 6de4f97..b2e7d6a 100644 --- a/modules/ocl/src/pyrlk.cpp +++ b/modules/ocl/src/pyrlk.cpp @@ -56,30 +56,15 @@ namespace cv { namespace ocl { -///////////////////////////OpenCL kernel strings/////////////////////////// extern const char *pyrlk; extern const char *pyrlk_no_image; -extern const char *arithm_mul; } } - struct dim3 { unsigned int x, y, z; }; -struct float2 -{ - float x, y; -}; - -struct int2 -{ - int x, y; -}; - -namespace -{ void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDeviceArch11) { winSize.width *= cn; @@ -100,45 +85,6 @@ void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDe block.z = patch.z = 1; } -} - -static void multiply_cus(const oclMat &src1, oclMat &dst, float scalar) -{ - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) - { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); - return; - } - - CV_Assert(src1.cols == dst.cols && - src1.rows == dst.rows); - - CV_Assert(src1.type() == dst.type()); - CV_Assert(src1.depth() != CV_8S); - - Context *clCxt = src1.clCxt; - - size_t localThreads[3] = { 16, 16, 1 }; - size_t globalThreads[3] = { src1.cols, - src1.rows, - 1 - }; - - int dst_step1 = dst.cols * dst.elemSize(); - vector > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - args.push_back( make_pair( sizeof(float), (float *)&scalar )); - - openCLExecuteKernel(clCxt, &arithm_mul, "arithm_muls", globalThreads, localThreads, args, -1, src1.depth()); -} static void lkSparse_run(oclMat &I, oclMat &J, const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat& err, bool /*GET_MIN_EIGENVALS*/, int ptcount, @@ -151,15 +97,7 @@ static void lkSparse_run(oclMat &I, oclMat &J, size_t localThreads[3] = { 8, isImageSupported ? 8 : 32, 1 }; size_t globalThreads[3] = { 8 * ptcount, isImageSupported ? 8 : 32, 1}; int cn = I.oclchannels(); - char calcErr; - if (level == 0) - { - calcErr = 1; - } - else - { - calcErr = 0; - } + char calcErr = level==0?1:0; vector > args; @@ -198,7 +136,16 @@ static void lkSparse_run(oclMat &I, oclMat &J, { if(isImageSupported) { - openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth()); + stringstream idxStr; + idxStr << kernelName << "_C" << I.oclchannels() << "_D" << I.depth(); + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &pyrlk, idxStr.str()); + + size_t wave_size = queryDeviceInfo(kernel); + static char opt[16] = {0}; + sprintf(opt, " -D WAVE_SIZE=%d", wave_size); + + openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, + args, I.oclchannels(), I.depth(), opt); releaseTexture(ITex); releaseTexture(JTex); } @@ -241,8 +188,7 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next oclMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1); oclMat temp2 = nextPts.reshape(1); - multiply_cus(temp1, temp2, 1.0f / (1 << maxLevel) / 2.0f); - //::multiply(temp1, 1.0f / (1 << maxLevel) / 2.0f, temp2); + multiply(1.0f/(1<= 0; level--) { lkSparse_run(prevPyr_[level], nextPyr_[level],