// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
+//#define CV_USE_SUBGROUPS
+
#define EPS 0.001f
#define INF 1E+10F
float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
int I0_stride, int I1_stride,
- float w00, float w01, float w10, float w11, int patch_sz, int i)
+ float w00, float w01, float w10, float w11, int patch_sz, int i
+#ifndef CV_USE_SUBGROUPS
+ , __local float2 *smem /*[8]*/
+#endif
+)
{
float sum_diff = 0.0f, sum_diff_sq = 0.0f;
int n = patch_sz * patch_sz;
sum_diff = (dot(vec.lo, 1.0) + dot(vec.hi, 1.0));
sum_diff_sq = (dot(vec.lo, vec.lo) + dot(vec.hi, vec.hi));
+#ifdef CV_USE_SUBGROUPS
sum_diff = sub_group_reduce_add(sum_diff);
sum_diff_sq = sub_group_reduce_add(sum_diff_sq);
+#else
+ barrier(CLK_LOCAL_MEM_FENCE);
+ smem[i] = (float2)(sum_diff, sum_diff_sq);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (i < 4)
+ smem[i] += smem[i + 4];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (i < 2)
+ smem[i] += smem[i + 2];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (i == 0)
+ smem[0] += smem[1];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ float2 reduce_add_result = smem[0];
+ sum_diff = reduce_add_result.x;
+ sum_diff_sq = reduce_add_result.y;
+#endif
return sum_diff_sq - sum_diff * sum_diff / n;
}
+__attribute__((reqd_work_group_size(8, 1, 1)))
__kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __global const float *Uy_ptr,
__global const uchar *I0_ptr, __global const uchar *I1_ptr,
int border_size, int patch_size, int patch_stride,
__global float *Sx_ptr, __global float *Sy_ptr)
{
int id = get_global_id(0);
- int is = id / 8;
- if (id >= (hs * 8)) return;
+ int is = get_group_id(0);
int i = is * patch_stride;
int j = 0;
Sy_ptr[is * ws] = prev_Uy;
j += patch_stride;
+#ifdef CV_USE_SUBGROUPS
int sid = get_sub_group_local_id();
+#define EXTRA_ARGS_computeSSDMeanNorm sid
+#else
+ __local float2 smem[8];
+ int sid = get_local_id(0);
+#define EXTRA_ARGS_computeSSDMeanNorm sid, smem
+#endif
for (int js = 1; js < ws; js++, j += patch_stride)
{
float min_SSD, cur_SSD;
INIT_BILINEAR_WEIGHTS(Ux, Uy);
min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
- w, w_ext, w00, w01, w10, w11, psz, sid);
+ w, w_ext, w00, w01, w10, w11, psz, EXTRA_ARGS_computeSSDMeanNorm);
INIT_BILINEAR_WEIGHTS(prev_Ux, prev_Uy);
cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
- w, w_ext, w00, w01, w10, w11, psz, sid);
+ w, w_ext, w00, w01, w10, w11, psz, EXTRA_ARGS_computeSSDMeanNorm);
if (cur_SSD < min_SSD)
{
Ux = prev_Ux;
Sx_ptr[is * ws + js] = Ux;
Sy_ptr[is * ws + js] = Uy;
}
+#undef EXTRA_ARGS_computeSSDMeanNorm
}
float3 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
}
}
+__attribute__((reqd_work_group_size(8, 1, 1)))
__kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
int border_size, int patch_size, int patch_stride,
int w, int h, int ws, int hs, int pyr_level,
__global float *Sx_ptr, __global float *Sy_ptr)
{
int id = get_global_id(0);
- int is = id / 8;
- if (id >= (hs * 8)) return;
+ int is = get_group_id(0);
is = (hs - 1 - is);
int i = is * patch_stride;
float j_upper_limit = bsz + w - 1.0f;
float i_I1, j_I1, w00, w01, w10, w11;
+#ifdef CV_USE_SUBGROUPS
int sid = get_sub_group_local_id();
+#define EXTRA_ARGS_computeSSDMeanNorm sid
+#else
+ __local float2 smem[8];
+ int sid = get_local_id(0);
+#define EXTRA_ARGS_computeSSDMeanNorm sid, smem
+#endif
for (int js = (ws - 2); js > -1; js--, j -= patch_stride)
{
float min_SSD, cur_SSD;
INIT_BILINEAR_WEIGHTS(Ux.x, Uy.x);
min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
- w, w_ext, w00, w01, w10, w11, psz, sid);
+ w, w_ext, w00, w01, w10, w11, psz, EXTRA_ARGS_computeSSDMeanNorm);
INIT_BILINEAR_WEIGHTS(Ux.y, Uy.y);
cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
- w, w_ext, w00, w01, w10, w11, psz, sid);
+ w, w_ext, w00, w01, w10, w11, psz, EXTRA_ARGS_computeSSDMeanNorm);
if (cur_SSD < min_SSD)
{
Sx_ptr[is * ws + js] = Ux.y;
Sy_ptr[is * ws + js] = Uy.y;
}
}
+#undef EXTRA_ARGS_computeSSDMeanNorm
}
__kernel void dis_patch_inverse_search_bwd_2(__global const uchar *I0_ptr, __global const uchar *I1_ptr,