//#pragma OPENCL EXTENSION cl_amd_printf : enable
-__kernel void calcSharrDeriv_vertical_C1_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep)
-{
- const int x = get_global_id(0);
- const int y = get_global_id(1);
-
- if (y < rows && x < cols * cn)
- {
- const uchar src_val0 = (src + (y > 0 ? y-1 : rows > 1 ? 1 : 0) * srcStep)[x];
- const uchar src_val1 = (src + y * srcStep)[x];
- const uchar src_val2 = (src + (y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0) * srcStep)[x];
-
- ((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
- ((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
- }
-}
-
-__kernel void calcSharrDeriv_vertical_C4_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep)
-{
- const int x = get_global_id(0);
- const int y = get_global_id(1);
-
- if (y < rows && x < cols * cn)
- {
- const uchar src_val0 = (src + (y > 0 ? y - 1 : 1) * srcStep)[x];
- const uchar src_val1 = (src + y * srcStep)[x];
- const uchar src_val2 = (src + (y < rows - 1 ? y + 1 : rows - 2) * srcStep)[x];
-
- ((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
- ((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
- }
-}
-
-__kernel void calcSharrDeriv_horizontal_C1_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep)
-{
- const int x = get_global_id(0);
- const int y = get_global_id(1);
-
- const int colsn = cols * cn;
-
- if (y < rows && x < colsn)
- {
- __global const short* dx_buf_row = dx_buf + y * dx_bufStep;
- __global const short* dy_buf_row = dy_buf + y * dy_bufStep;
-
- const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn;
- const int xl = x - cn >= 0 ? x - cn : cn + x;
-
- ((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl];
- ((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10;
- }
-}
-
-__kernel void calcSharrDeriv_horizontal_C4_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep)
-{
- const int x = get_global_id(0);
- const int y = get_global_id(1);
-
- const int colsn = cols * cn;
-
- if (y < rows && x < colsn)
- {
- __global const short* dx_buf_row = dx_buf + y * dx_bufStep;
- __global const short* dy_buf_row = dy_buf + y * dy_bufStep;
-
- const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn;
- const int xl = x - cn >= 0 ? x - cn : cn + x;
-
- ((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl];
- ((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10;
- }
-}
-
-#define W_BITS 14
-#define W_BITS1 14
-
-#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
-
-int linearFilter_uchar(__global const uchar* src, int srcStep, int cn, float2 pt, int x, int y)
-{
- int2 ipt;
- ipt.x = convert_int_sat_rtn(pt.x);
- ipt.y = convert_int_sat_rtn(pt.y);
-
- float a = pt.x - ipt.x;
- float b = pt.y - ipt.y;
-
- int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS));
- int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS));
- int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS));
- int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
-
- __global const uchar* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn;
- __global const uchar* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn;
-
- return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1 - 5);
-}
-
-int linearFilter_short(__global const short* src, int srcStep, int cn, float2 pt, int x, int y)
-{
- int2 ipt;
- ipt.x = convert_int_sat_rtn(pt.x);
- ipt.y = convert_int_sat_rtn(pt.y);
-
- float a = pt.x - ipt.x;
- float b = pt.y - ipt.y;
-
- int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS));
- int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS));
- int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS));
- int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
-
- __global const short* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn;
- __global const short* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn;
-
- return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1);
-}
-
-float linearFilter_float(__global const float* src, int srcStep, int cn, float2 pt, float x, float y)
-{
- int2 ipt;
- ipt.x = convert_int_sat_rtn(pt.x);
- ipt.y = convert_int_sat_rtn(pt.y);
-
- float a = pt.x - ipt.x;
- float b = pt.y - ipt.y;
-
- float iw00 = ((1.0f - a) * (1.0f - b) * (1 << W_BITS));
- float iw01 = (a * (1.0f - b) * (1 << W_BITS));
- float iw10 = ((1.0f - a) * b * (1 << W_BITS));
- float iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
-
- __global const float* src_row = src + (int)(ipt.y + y) * srcStep / 4 + ipt.x * cn;
- __global const float* src_row1 = src + (int)(ipt.y + y + 1) * srcStep / 4 + ipt.x * cn;
-
- return src_row[(int)x] * iw00 + src_row[(int)x + cn] * iw01 + src_row1[(int)x] * iw10 + src_row1[(int)x + cn] * iw11, W_BITS1 - 5;
-}
-
#define BUFFER 64
-
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
#ifdef CPU
void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
{
smem3[tid] = val3;
barrier(CLK_LOCAL_MEM_FENCE);
-#if BUFFER > 128
- if (tid < 128)
- {
- smem1[tid] = val1 += smem1[tid + 128];
- smem2[tid] = val2 += smem2[tid + 128];
- smem3[tid] = val3 += smem3[tid + 128];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if BUFFER > 64
- if (tid < 64)
- {
- smem1[tid] = val1 += smem1[tid + 64];
- smem2[tid] = val2 += smem2[tid + 64];
- smem3[tid] = val3 += smem3[tid + 64];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
if (tid < 32)
{
- smem1[tid] = val1 += smem1[tid + 32];
- smem2[tid] = val2 += smem2[tid + 32];
- smem3[tid] = val3 += smem3[tid + 32];
+ smem1[tid] += smem1[tid + 32];
+ smem2[tid] += smem2[tid + 32];
+ smem3[tid] += smem3[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
- smem1[tid] = val1 += smem1[tid + 16];
- smem2[tid] = val2 += smem2[tid + 16];
- smem3[tid] = val3 += smem3[tid + 16];
+ smem1[tid] += smem1[tid + 16];
+ smem2[tid] += smem2[tid + 16];
+ smem3[tid] += smem3[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
- smem1[tid] = val1 += smem1[tid + 8];
- smem2[tid] = val2 += smem2[tid + 8];
- smem3[tid] = val3 += smem3[tid + 8];
+ smem1[tid] += smem1[tid + 8];
+ smem2[tid] += smem2[tid + 8];
+ smem3[tid] += smem3[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
- smem1[tid] = val1 += smem1[tid + 4];
- smem2[tid] = val2 += smem2[tid + 4];
- smem3[tid] = val3 += smem3[tid + 4];
+ smem1[tid] += smem1[tid + 4];
+ smem2[tid] += smem2[tid + 4];
+ smem3[tid] += smem3[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
- smem1[tid] = val1 += smem1[tid + 2];
- smem2[tid] = val2 += smem2[tid + 2];
- smem3[tid] = val3 += smem3[tid + 2];
+ smem1[tid] += smem1[tid + 2];
+ smem2[tid] += smem2[tid + 2];
+ smem3[tid] += smem3[tid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
- smem1[BUFFER] = val1 += smem1[tid + 1];
- smem2[BUFFER] = val2 += smem2[tid + 1];
- smem3[BUFFER] = val3 += smem3[tid + 1];
+ smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
+ smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
+ smem3[BUFFER] = smem3[tid] + smem3[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
smem2[tid] = val2;
barrier(CLK_LOCAL_MEM_FENCE);
-#if BUFFER > 128
- if (tid < 128)
- {
- smem1[tid] = (val1 += smem1[tid + 128]);
- smem2[tid] = (val2 += smem2[tid + 128]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if BUFFER > 64
- if (tid < 64)
- {
- smem1[tid] = (val1 += smem1[tid + 64]);
- smem2[tid] = (val2 += smem2[tid + 64]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
if (tid < 32)
{
- smem1[tid] = (val1 += smem1[tid + 32]);
- smem2[tid] = (val2 += smem2[tid + 32]);
+ smem1[tid] += smem1[tid + 32];
+ smem2[tid] += smem2[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
- smem1[tid] = (val1 += smem1[tid + 16]);
- smem2[tid] = (val2 += smem2[tid + 16]);
+ smem1[tid] += smem1[tid + 16];
+ smem2[tid] += smem2[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
- smem1[tid] = (val1 += smem1[tid + 8]);
- smem2[tid] = (val2 += smem2[tid + 8]);
+ smem1[tid] += smem1[tid + 8];
+ smem2[tid] += smem2[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
- smem1[tid] = (val1 += smem1[tid + 4]);
- smem2[tid] = (val2 += smem2[tid + 4]);
+ smem1[tid] += smem1[tid + 4];
+ smem2[tid] += smem2[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
- smem1[tid] = (val1 += smem1[tid + 2]);
- smem2[tid] = (val2 += smem2[tid + 2]);
+ smem1[tid] += smem1[tid + 2];
+ smem2[tid] += smem2[tid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
- smem1[BUFFER] = (val1 += smem1[tid + 1]);
- smem2[BUFFER] = (val2 += smem2[tid + 1]);
+ smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
+ smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
smem1[tid] = val1;
barrier(CLK_LOCAL_MEM_FENCE);
-#if BUFFER > 128
- if (tid < 128)
- {
- smem1[tid] = (val1 += smem1[tid + 128]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if BUFFER > 64
- if (tid < 64)
- {
- smem1[tid] = (val1 += smem1[tid + 64]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
if (tid < 32)
{
- smem1[tid] = (val1 += smem1[tid + 32]);
+ smem1[tid] += smem1[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
- smem1[tid] = (val1 += smem1[tid + 16]);
+ smem1[tid] += smem1[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
- smem1[tid] = (val1 += smem1[tid + 8]);
+ smem1[tid] += smem1[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
- smem1[tid] = (val1 += smem1[tid + 4]);
+ smem1[tid] += smem1[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
- smem1[tid] = (val1 += smem1[tid + 2]);
+ smem1[tid] += smem1[tid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
- smem1[BUFFER] = (val1 += smem1[tid + 1]);
+ smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
#else
-void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
+void reduce3(float val1, float val2, float val3,
+__local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
{
smem1[tid] = val1;
smem2[tid] = val2;
smem3[tid] = val3;
barrier(CLK_LOCAL_MEM_FENCE);
-#if BUFFER > 128
- if (tid < 128)
+ if (tid < 32)
{
- smem1[tid] = val1 += smem1[tid + 128];
- smem2[tid] = val2 += smem2[tid + 128];
- smem3[tid] = val3 += smem3[tid + 128];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
+ smem1[tid] += smem1[tid + 32];
+ smem2[tid] += smem2[tid + 32];
+ smem3[tid] += smem3[tid + 32];
+#if WAVE_SIZE < 32
+ } barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 16) {
#endif
-
-#if BUFFER > 64
- if (tid < 64)
- {
- smem1[tid] = val1 += smem1[tid + 64];
- smem2[tid] = val2 += smem2[tid + 64];
- smem3[tid] = val3 += smem3[tid + 64];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
+ smem1[tid] += smem1[tid + 16];
+ smem2[tid] += smem2[tid + 16];
+ smem3[tid] += smem3[tid + 16];
+#if WAVE_SIZE <16
+ } barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 8) {
#endif
+ smem1[tid] += smem1[tid + 8];
+ smem2[tid] += smem2[tid + 8];
+ smem3[tid] += smem3[tid + 8];
- if (tid < 32)
- {
- volatile __local float* vmem1 = smem1;
- volatile __local float* vmem2 = smem2;
- volatile __local float* vmem3 = smem3;
-
- vmem1[tid] = val1 += vmem1[tid + 32];
- vmem2[tid] = val2 += vmem2[tid + 32];
- vmem3[tid] = val3 += vmem3[tid + 32];
-
- vmem1[tid] = val1 += vmem1[tid + 16];
- vmem2[tid] = val2 += vmem2[tid + 16];
- vmem3[tid] = val3 += vmem3[tid + 16];
-
- vmem1[tid] = val1 += vmem1[tid + 8];
- vmem2[tid] = val2 += vmem2[tid + 8];
- vmem3[tid] = val3 += vmem3[tid + 8];
+ smem1[tid] += smem1[tid + 4];
+ smem2[tid] += smem2[tid + 4];
+ smem3[tid] += smem3[tid + 4];
- vmem1[tid] = val1 += vmem1[tid + 4];
- vmem2[tid] = val2 += vmem2[tid + 4];
- vmem3[tid] = val3 += vmem3[tid + 4];
+ smem1[tid] += smem1[tid + 2];
+ smem2[tid] += smem2[tid + 2];
+ smem3[tid] += smem3[tid + 2];
- vmem1[tid] = val1 += vmem1[tid + 2];
- vmem2[tid] = val2 += vmem2[tid + 2];
- vmem3[tid] = val3 += vmem3[tid + 2];
-
- vmem1[tid] = val1 += vmem1[tid + 1];
- vmem2[tid] = val2 += vmem2[tid + 1];
- vmem3[tid] = val3 += vmem3[tid + 1];
+ smem1[tid] += smem1[tid + 1];
+ smem2[tid] += smem2[tid + 1];
+ smem3[tid] += smem3[tid + 1];
}
}
-void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, int tid)
+void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
{
smem1[tid] = val1;
smem2[tid] = val2;
barrier(CLK_LOCAL_MEM_FENCE);
-#if BUFFER > 128
- if (tid < 128)
+ if (tid < 32)
{
- smem1[tid] = val1 += smem1[tid + 128];
- smem2[tid] = val2 += smem2[tid + 128];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
+ smem1[tid] += smem1[tid + 32];
+ smem2[tid] += smem2[tid + 32];
+#if WAVE_SIZE < 32
+ } barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 16) {
#endif
-
-#if BUFFER > 64
- if (tid < 64)
- {
- smem1[tid] = val1 += smem1[tid + 64];
- smem2[tid] = val2 += smem2[tid + 64];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
+ smem1[tid] += smem1[tid + 16];
+ smem2[tid] += smem2[tid + 16];
+#if WAVE_SIZE <16
+ } barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 8) {
#endif
+ smem1[tid] += smem1[tid + 8];
+ smem2[tid] += smem2[tid + 8];
- if (tid < 32)
- {
- volatile __local float* vmem1 = smem1;
- volatile __local float* vmem2 = smem2;
-
- vmem1[tid] = val1 += vmem1[tid + 32];
- vmem2[tid] = val2 += vmem2[tid + 32];
-
- vmem1[tid] = val1 += vmem1[tid + 16];
- vmem2[tid] = val2 += vmem2[tid + 16];
+ smem1[tid] += smem1[tid + 4];
+ smem2[tid] += smem2[tid + 4];
- vmem1[tid] = val1 += vmem1[tid + 8];
- vmem2[tid] = val2 += vmem2[tid + 8];
+ smem1[tid] += smem1[tid + 2];
+ smem2[tid] += smem2[tid + 2];
- vmem1[tid] = val1 += vmem1[tid + 4];
- vmem2[tid] = val2 += vmem2[tid + 4];
-
- vmem1[tid] = val1 += vmem1[tid + 2];
- vmem2[tid] = val2 += vmem2[tid + 2];
-
- vmem1[tid] = val1 += vmem1[tid + 1];
- vmem2[tid] = val2 += vmem2[tid + 1];
+ smem1[tid] += smem1[tid + 1];
+ smem2[tid] += smem2[tid + 1];
}
}
-void reduce1(float val1, __local float* smem1, int tid)
+void reduce1(float val1, __local volatile float* smem1, int tid)
{
smem1[tid] = val1;
barrier(CLK_LOCAL_MEM_FENCE);
-#if BUFFER > 128
- if (tid < 128)
+ if (tid < 32)
{
- smem1[tid] = val1 += smem1[tid + 128];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
+ smem1[tid] += smem1[tid + 32];
+#if WAVE_SIZE < 32
+ } barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 16) {
#endif
-
-#if BUFFER > 64
- if (tid < 64)
- {
- smem1[tid] = val1 += smem1[tid + 64];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
+ smem1[tid] += smem1[tid + 16];
+#if WAVE_SIZE <16
+ } barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 8) {
#endif
-
- if (tid < 32)
- {
- volatile __local float* vmem1 = smem1;
-
- vmem1[tid] = val1 += vmem1[tid + 32];
- vmem1[tid] = val1 += vmem1[tid + 16];
- vmem1[tid] = val1 += vmem1[tid + 8];
- vmem1[tid] = val1 += vmem1[tid + 4];
- vmem1[tid] = val1 += vmem1[tid + 2];
- vmem1[tid] = val1 += vmem1[tid + 1];
+ smem1[tid] += smem1[tid + 8];
+ smem1[tid] += smem1[tid + 4];
+ smem1[tid] += smem1[tid + 2];
+ smem1[tid] += smem1[tid + 1];
}
}
#endif
#define SCALE (1.0f / (1 << 20))
#define THRESHOLD 0.01f
-#define DIMENSION 21
// Image read mode
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
{
namespace ocl
{
-///////////////////////////OpenCL kernel strings///////////////////////////
extern const char *pyrlk;
extern const char *pyrlk_no_image;
-extern const char *arithm_mul;
}
}
-
struct dim3
{
unsigned int x, y, z;
};
-struct float2
-{
- float x, y;
-};
-
-struct int2
-{
- int x, y;
-};
-
-namespace
-{
void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDeviceArch11)
{
winSize.width *= cn;
block.z = patch.z = 1;
}
-}
-
-static void multiply_cus(const oclMat &src1, oclMat &dst, float scalar)
-{
- if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
- {
- CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
- return;
- }
-
- CV_Assert(src1.cols == dst.cols &&
- src1.rows == dst.rows);
-
- CV_Assert(src1.type() == dst.type());
- CV_Assert(src1.depth() != CV_8S);
-
- Context *clCxt = src1.clCxt;
-
- size_t localThreads[3] = { 16, 16, 1 };
- size_t globalThreads[3] = { src1.cols,
- src1.rows,
- 1
- };
-
- int dst_step1 = dst.cols * dst.elemSize();
- vector<pair<size_t , const void *> > args;
- args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
- args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step ));
- args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset ));
- args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
- args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
- args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset ));
- args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
- args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols ));
- args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
- args.push_back( make_pair( sizeof(float), (float *)&scalar ));
-
- openCLExecuteKernel(clCxt, &arithm_mul, "arithm_muls", globalThreads, localThreads, args, -1, src1.depth());
-}
static void lkSparse_run(oclMat &I, oclMat &J,
const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat& err, bool /*GET_MIN_EIGENVALS*/, int ptcount,
size_t localThreads[3] = { 8, isImageSupported ? 8 : 32, 1 };
size_t globalThreads[3] = { 8 * ptcount, isImageSupported ? 8 : 32, 1};
int cn = I.oclchannels();
- char calcErr;
- if (level == 0)
- {
- calcErr = 1;
- }
- else
- {
- calcErr = 0;
- }
+ char calcErr = level==0?1:0;
vector<pair<size_t , const void *> > args;
{
if(isImageSupported)
{
- openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth());
+ stringstream idxStr;
+ idxStr << kernelName << "_C" << I.oclchannels() << "_D" << I.depth();
+ cl_kernel kernel = openCLGetKernelFromSource(clCxt, &pyrlk, idxStr.str());
+
+ size_t wave_size = queryDeviceInfo<WAVEFRONT_SIZE, size_t>(kernel);
+ static char opt[16] = {0};
+ sprintf(opt, " -D WAVE_SIZE=%d", wave_size);
+
+ openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads,
+ args, I.oclchannels(), I.depth(), opt);
releaseTexture(ITex);
releaseTexture(JTex);
}
oclMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
oclMat temp2 = nextPts.reshape(1);
- multiply_cus(temp1, temp2, 1.0f / (1 << maxLevel) / 2.0f);
- //::multiply(temp1, 1.0f / (1 << maxLevel) / 2.0f, temp2);
+ multiply(1.0f/(1<<maxLevel)/2.0f, temp1, temp2);
ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
status.setTo(Scalar::all(1));
ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
// build the image pyramids.
-
prevPyr_.resize(maxLevel + 1);
nextPyr_.resize(maxLevel + 1);
}
// dI/dx ~ Ix, dI/dy ~ Iy
-
for (int level = maxLevel; level >= 0; level--)
{
lkSparse_run(prevPyr_[level], nextPyr_[level],