From 7b08d5ec69c281fff3fb6f77dce314efeb4539bf Mon Sep 17 00:00:00 2001 From: Peng Xiao Date: Sat, 20 Apr 2013 00:34:37 +0800 Subject: [PATCH] Add OpenCL stereo CSBP implementation --- modules/ocl/include/opencv2/ocl/ocl.hpp | 38 ++ modules/ocl/src/opencl/stereocsbp.cl | 1135 +++++++++++++++++++++++++++++++ modules/ocl/src/stereo_csbp.cpp | 792 +++++++++++++++++++++ modules/ocl/test/test_calib3d.cpp | 76 ++- 4 files changed, 2033 insertions(+), 8 deletions(-) create mode 100644 modules/ocl/src/opencl/stereocsbp.cl create mode 100644 modules/ocl/src/stereo_csbp.cpp diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index 613179f..059ae02 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -1731,6 +1731,44 @@ namespace cv std::vector datas; oclMat out; }; + /* Stereo matcher added by this patch; per the patch subject this is the OpenCL "CSBP" (constant-space belief propagation) implementation. */ class CV_EXPORTS StereoConstantSpaceBP + { + public: + /* The four DEFAULT_* enums are the default arguments of the first constructor below. */ enum { DEFAULT_NDISP = 128 }; + enum { DEFAULT_ITERS = 8 }; + enum { DEFAULT_LEVELS = 4 }; + enum { DEFAULT_NR_PLANE = 4 }; + /* Takes an image size and returns values through the four reference arguments; presumably a parameter suggestion for that size (definition is not in this header, confirm in stereo_csbp.cpp). */ static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane); + /* Short constructor: every argument is optional; explicit, so an int does not convert implicitly. */ explicit StereoConstantSpaceBP( + int ndisp = DEFAULT_NDISP, + int iters = DEFAULT_ITERS, + int levels = DEFAULT_LEVELS, + int nr_plane = DEFAULT_NR_PLANE, + int msg_type = CV_32F); + /* Long constructor: additionally takes the four cost terms; only min_disp_th and msg_type have defaults. */ StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane, + float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, + int min_disp_th = 0, + int msg_type = CV_32F); + /* Runs the matcher on a left/right pair and writes into disparity (presumably a disparity map; NOTE(review): input format requirements are not visible here). */ void operator()(const oclMat &left, const oclMat &right, oclMat &disparity); + /* Public parameters; the first ten share their names with the constructor arguments. */ int ndisp; + int iters; + int levels; + int nr_plane; + float max_data_term; + float data_weight; + float max_disc_term; + float disc_single_jump; + int min_disp_th; + int msg_type; + /* Not settable through either constructor signature shown above. */ bool use_local_init_data_cost; + private: + /* Internal oclMat buffers; their roles are defined by the implementation file, not by this header. */ oclMat u[2], d[2], l[2], r[2]; + oclMat disp_selected_pyr[2]; + oclMat data_cost; + oclMat data_cost_selected; + oclMat temp; + oclMat out; + }; 
} } #if defined _MSC_VER && _MSC_VER >= 1200 diff --git a/modules/ocl/src/opencl/stereocsbp.cl b/modules/ocl/src/opencl/stereocsbp.cl new file mode 100644 index 0000000..f855ee0 --- /dev/null +++ b/modules/ocl/src/opencl/stereocsbp.cl @@ -0,0 +1,1135 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Jia Haipeng, jiahaipeng95@gmail.com +// Jin Ma, jin@multicorewareinc.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + + +#ifndef FLT_MAX +#define FLT_MAX CL_FLT_MAX +#endif + +#ifndef SHRT_MAX +#define SHRT_MAX CL_SHORT_MAX +#endif + + +/////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////get_first_k_initial_global////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////// +__kernel void get_first_k_initial_global_0(__global short *data_cost_selected_, __global short *selected_disp_pyr, + __global short *ctemp, int h, int w, int nr_plane, + int cmsg_step1, int cdisp_step1, int cndisp) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + __global short *selected_disparity = selected_disp_pyr + y * cmsg_step1 + x; + __global short *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x; + __global short *data_cost = ctemp + y * cmsg_step1 + x; + + for(int i = 0; i < nr_plane; i++) + { + short minimum = SHRT_MAX; + int id = 0; + + for(int d = 0; d < cndisp; d++) + { + short cur = data_cost[d * cdisp_step1]; + if(cur < minimum) + { + minimum = cur; + id = d; + } + } + + data_cost_selected[i * 
cdisp_step1] = minimum; + selected_disparity[i * cdisp_step1] = id; + data_cost [id * cdisp_step1] = SHRT_MAX; + } + } +} +/* float variant: for each pixel (x, y) with x < w and y < h, picks the nr_plane smallest of the cndisp costs stored in ctemp and writes cost and disparity index per plane; planes are cdisp_step1 elements apart, rows cmsg_step1 elements apart. */ __kernel void get_first_k_initial_global_1(__global float *data_cost_selected_, __global float *selected_disp_pyr, + __global float *ctemp, int h, int w, int nr_plane, + int cmsg_step1, int cdisp_step1, int cndisp) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + __global float *selected_disparity = selected_disp_pyr + y * cmsg_step1 + x; + __global float *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x; + __global float *data_cost = ctemp + y * cmsg_step1 + x; + + for(int i = 0; i < nr_plane; i++) + { + float minimum = FLT_MAX; + int id = 0; + + for(int d = 0; d < cndisp; d++) + { + float cur = data_cost[d * cdisp_step1]; + if(cur < minimum) + { + minimum = cur; + id = d; + } + } + + data_cost_selected[i * cdisp_step1] = minimum; + selected_disparity[i * cdisp_step1] = id; + data_cost [id * cdisp_step1] = FLT_MAX; /* mask the chosen entry in ctemp so the next pass finds the next-smallest cost */ + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////get_first_k_initial_local//////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////// +/* short variant: per pixel, first collects up to nr_plane strict local minima of the cost curve (cost[d] below both neighbours, d in 1..cndisp-2), then fills any remaining planes with the smallest costs still unmasked in ctemp. Selected entries are overwritten with SHRT_MAX in ctemp. */ __kernel void get_first_k_initial_local_0(__global short *data_cost_selected_, __global short *selected_disp_pyr, + __global short *ctemp,int h, int w, int nr_plane, + int cmsg_step1, int cdisp_step1, int cndisp) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + __global short *selected_disparity = selected_disp_pyr + y * cmsg_step1 + x; + __global short *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x; + __global short *data_cost = ctemp + y * cmsg_step1 + x; + + int nr_local_minimum = 0; + + short prev = data_cost[0 * cdisp_step1]; + short cur = data_cost[1 * cdisp_step1]; + short next = data_cost[2 * 
cdisp_step1]; + + for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++) + { + + if (cur < prev && cur < next) + { + data_cost_selected[nr_local_minimum * cdisp_step1] = cur; + selected_disparity[nr_local_minimum * cdisp_step1] = d; + data_cost[d * cdisp_step1] = SHRT_MAX; + + nr_local_minimum++; + } + + prev = cur; + cur = next; + if (d + 2 < cndisp) next = data_cost[(d + 2) * cdisp_step1]; /* fix: cur now holds cost[d+1], so the window needs cost[d+2]; reloading cost[d+1] left the test one index behind the recorded d. Guarded so the last iteration stays within plane cndisp-1. */ + } + + for (int i = nr_local_minimum; i < nr_plane; i++) + { + short minimum = SHRT_MAX; + int id = 0; + + for (int d = 0; d < cndisp; d++) + { + cur = data_cost[d * cdisp_step1]; + if (cur < minimum) + { + minimum = cur; + id = d; + } + } + + data_cost_selected[i * cdisp_step1] = minimum; + selected_disparity[i * cdisp_step1] = id; + data_cost[id * cdisp_step1] = SHRT_MAX; + } + } +} + +/* float variant of get_first_k_initial_local_0: same two phases (strict local minima first, then smallest remaining costs), masking with FLT_MAX. */ __kernel void get_first_k_initial_local_1(__global float *data_cost_selected_, __global float *selected_disp_pyr, + __global float *ctemp,int h, int w, int nr_plane, + int cmsg_step1, int cdisp_step1, int cndisp) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + __global float *selected_disparity = selected_disp_pyr + y * cmsg_step1 + x; + __global float *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x; + __global float *data_cost = ctemp + y * cmsg_step1 + x; + + int nr_local_minimum = 0; + + float prev = data_cost[0 * cdisp_step1]; + float cur = data_cost[1 * cdisp_step1]; + float next = data_cost[2 * cdisp_step1]; + + for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++) + { + if (cur < prev && cur < next) + { + data_cost_selected[nr_local_minimum * cdisp_step1] = cur; + selected_disparity[nr_local_minimum * cdisp_step1] = d; + data_cost[d * cdisp_step1] = FLT_MAX ; + + nr_local_minimum++; + } + + prev = cur; + cur = next; + if (d + 2 < cndisp) next = data_cost[(d + 2) * cdisp_step1]; /* fix: same window advance as the short variant; cost[d+2], guarded against reading past the last plane */ + } + for (int i = nr_local_minimum; i < nr_plane; i++) + { + float minimum = FLT_MAX; + int id = 0; + + for (int d = 0; d < cndisp; d++) + { + cur = data_cost[d * 
cdisp_step1]; + if (cur < minimum) + { + minimum = cur; + id = d; + } + } + + data_cost_selected[i * cdisp_step1] = minimum; + selected_disparity[i * cdisp_step1] = id; + data_cost[id * cdisp_step1] = FLT_MAX; + } + } +} + +/////////////////////////////////////////////////////////////// +/////////////////////// init data cost //////////////////////// +/////////////////////////////////////////////////////////////// +/* Three-channel pixel cost: per-channel absolute differences weighted 0.114, 0.587, 0.299 for channels 0, 1, 2, scaled by cdata_weight and capped at cdata_weight * cmax_data_term. */ float compute_3(__global uchar* left, __global uchar* right, + float cdata_weight, float cmax_data_term) +{ + float tb = 0.114f * abs((int)left[0] - right[0]); + float tg = 0.587f * abs((int)left[1] - right[1]); + float tr = 0.299f * abs((int)left[2] - right[2]); + + return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term); +} +/* Single-channel pixel cost: absolute difference scaled by cdata_weight, capped at cdata_weight * cmax_data_term. */ float compute_1(__global uchar* left, __global uchar* right, + float cdata_weight, float cmax_data_term) +{ + return fmin(cdata_weight * abs((int)*left - (int)*right), cdata_weight * cmax_data_term); +} +/* float to short via convert_short_sat_rte (saturating, round to nearest even). */ short round_short(float v){ + return convert_short_sat_rte(v); +} +/////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////init_data_cost/////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////// +__kernel void init_data_cost_0(__global short *ctemp, __global uchar *cleft, __global uchar *cright, + int h, int w, int level, int channels, + int cmsg_step1, float cdata_weight, float cmax_data_term, int cdisp_step1, + int cth, int cimg_step, int cndisp) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + int y0 = y << level; + int yt = (y + 1) << level; + + int x0 = x << level; + int xt = (x + 1) << level; + + __global short *data_cost = ctemp + y * cmsg_step1 + x; + + for(int d = 0; d < cndisp; ++d) + { + float val = 0.0f; + for(int yi = y0; yi < yt; yi++) + { + for(int xi = x0; xi < xt; xi++) + { + int xr = xi - d; + 
if(d < cth || xr < 0) + val += cdata_weight * cmax_data_term; + else + { + __global uchar *lle = cleft + yi * cimg_step + xi * channels; + __global uchar *lri = cright + yi * cimg_step + xr * channels; + + if(channels == 1) + val += compute_1(lle, lri, cdata_weight, cmax_data_term); + else + val += compute_3(lle, lri, cdata_weight, cmax_data_term); + } + } + } + data_cost[cdisp_step1 * d] = round_short(val); + } + } +} +__kernel void init_data_cost_1(__global float *ctemp, __global uchar *cleft, __global uchar *cright, + int h, int w, int level, int channels, + int cmsg_step1, float cdata_weight, float cmax_data_term, int cdisp_step1, + int cth, int cimg_step, int cndisp) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + int y0 = y << level; + int yt = (y + 1) << level; + + int x0 = x << level; + int xt = (x + 1) << level; + + __global float *data_cost = ctemp + y * cmsg_step1 + x; + + for(int d = 0; d < cndisp; ++d) + { + float val = 0.0f; + for(int yi = y0; yi < yt; yi++) + { + for(int xi = x0; xi < xt; xi++) + { + int xr = xi - d; + if(d < cth || xr < 0) + val += cdata_weight * cmax_data_term; + else + { + __global uchar* lle = cleft + yi * cimg_step + xi * channels; + __global uchar* lri = cright + yi * cimg_step + xr * channels; + + if(channels == 1) + val += compute_1(lle, lri, cdata_weight, cmax_data_term); + else + val += compute_3(lle, lri, cdata_weight, cmax_data_term); + } + } + } + data_cost[cdisp_step1 * d] = val; + } + } +} +//////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////init_data_cost_reduce////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////////////////// +__kernel void init_data_cost_reduce_0(__global short *ctemp, __global uchar *cleft, __global uchar *cright, + __local float *smem, int level, int rows, int cols, int h, int 
winsz, int channels, + int cndisp,int cimg_step, float cdata_weight, float cmax_data_term, int cth, + int cdisp_step1, int cmsg_step1) +{ + int x_out = get_group_id(0); + int y_out = get_group_id(1) % h; + //int d = (blockIdx.y / h) * blockDim.z + threadIdx.z; + int d = (get_group_id(1) / h ) * get_local_size(2) + get_local_id(2); + + int tid = get_local_id(0); + + if (d < cndisp) + { + int x0 = x_out << level; + int y0 = y_out << level; + + int len = min(y0 + winsz, rows) - y0; + + float val = 0.0f; + if (x0 + tid < cols) + { + if (x0 + tid - d < 0 || d < cth) + val = cdata_weight * cmax_data_term * len; + else + { + __global uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid ); + __global uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d); + + for(int y = 0; y < len; ++y) + { + if(channels == 1) + val += compute_1(lle, lri, cdata_weight, cmax_data_term); + else + val += compute_3(lle, lri, cdata_weight, cmax_data_term); + + lle += cimg_step; + lri += cimg_step; + } + } + } + + __local float* dline = smem + winsz * get_local_id(2); + + dline[tid] = val; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); } + if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); } + + __local volatile float* vdline = smem + winsz * get_local_id(2); + + if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32]; + if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16]; + if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8]; + if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4]; + if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2]; + if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1]; + + __global short* data_cost = ctemp + y_out * cmsg_step1 + x_out; + + if (tid == 0) + data_cost[cdisp_step1 * d] = convert_short_sat_rte(dline[0]); + } +} + +__kernel void init_data_cost_reduce_1(__global 
float *ctemp, __global uchar *cleft, __global uchar *cright, + __local float *smem, int level, int rows, int cols, int h, int winsz, int channels, + int cndisp,int cimg_step, float cdata_weight, float cmax_data_term, int cth, + int cdisp_step1, int cmsg_step1) +{ + int x_out = get_group_id(0); + int y_out = get_group_id(1) % h; + int d = (get_group_id(1) / h ) * get_local_size(2) + get_local_id(2); + + int tid = get_local_id(0); + + if (d < cndisp) + { + int x0 = x_out << level; + int y0 = y_out << level; + + int len = min(y0 + winsz, rows) - y0; + + float val = 0.0f; + //float val = 528.0f; + + if (x0 + tid < cols) + { + if (x0 + tid - d < 0 || d < cth) + val = cdata_weight * cmax_data_term * len; + else + { + __global uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid ); + __global uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d); + + for(int y = 0; y < len; ++y) + { + if(channels == 1) + val += compute_1(lle, lri, cdata_weight, cmax_data_term); + else + val += compute_3(lle, lri, cdata_weight, cmax_data_term); + + lle += cimg_step; + lri += cimg_step; + } + } + } + + __local float* dline = smem + winsz * get_local_id(2); + + dline[tid] = val; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); } + if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); } + + __local volatile float* vdline = smem + winsz * get_local_id(2); + + if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32]; + if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16]; + if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8]; + if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4]; + if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2]; + if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1]; + + __global float *data_cost = ctemp + y_out * cmsg_step1 + x_out; + + if (tid == 0) + data_cost[cdisp_step1 * 
d] = dline[0]; + } +} + +/////////////////////////////////////////////////////////////// +////////////////////// compute data cost ////////////////////// +/////////////////////////////////////////////////////////////// +__kernel void compute_data_cost_0(__global const short *selected_disp_pyr, __global short *data_cost_, + __global uchar *cleft, __global uchar *cright, + int h, int w, int level, int nr_plane, int channels, + int cmsg_step1, int cmsg_step2, int cdisp_step1, int cdisp_step2, float cdata_weight, + float cmax_data_term, int cimg_step, int cth) +{ + + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + int y0 = y << level; + int yt = (y + 1) << level; + + int x0 = x << level; + int xt = (x + 1) << level; + + __global const short *selected_disparity = selected_disp_pyr + y/2 * cmsg_step2 + x/2; + __global short *data_cost = data_cost_ + y * cmsg_step1 + x; + + for(int d = 0; d < nr_plane; d++) + { + float val = 0.0f; + for(int yi = y0; yi < yt; yi++) + { + for(int xi = x0; xi < xt; xi++) + { + int sel_disp = selected_disparity[d * cdisp_step2]; + int xr = xi - sel_disp; + + if (xr < 0 || sel_disp < cth) + val += cdata_weight * cmax_data_term; + + else + { + __global uchar* left_x = cleft + yi * cimg_step + xi * channels; + __global uchar* right_x = cright + yi * cimg_step + xr * channels; + + if(channels == 1) + val += compute_1(left_x, right_x, cdata_weight, cmax_data_term); + else + val += compute_3(left_x, right_x, cdata_weight, cmax_data_term); + } + } + } + data_cost[cdisp_step1 * d] = convert_short_sat_rte(val); + } + } +} +__kernel void compute_data_cost_1(__global const float *selected_disp_pyr, __global float *data_cost_, + __global uchar *cleft, __global uchar *cright, + int h, int w, int level, int nr_plane, int channels, + int cmsg_step1, int cmsg_step2, int cdisp_step1, int cdisp_step2, float cdata_weight, + float cmax_data_term, int cimg_step, int cth) +{ + + int x = get_global_id(0); + int y = 
get_global_id(1); + + if (y < h && x < w) + { + int y0 = y << level; + int yt = (y + 1) << level; + + int x0 = x << level; + int xt = (x + 1) << level; + + __global const float *selected_disparity = selected_disp_pyr + y/2 * cmsg_step2 + x/2; + __global float *data_cost = data_cost_ + y * cmsg_step1 + x; + + for(int d = 0; d < nr_plane; d++) + { + float val = 0.0f; + for(int yi = y0; yi < yt; yi++) + { + for(int xi = x0; xi < xt; xi++) + { + int sel_disp = selected_disparity[d * cdisp_step2]; + int xr = xi - sel_disp; + + if (xr < 0 || sel_disp < cth) + val += cdata_weight * cmax_data_term; + else + { + __global uchar* left_x = cleft + yi * cimg_step + xi * channels; + __global uchar* right_x = cright + yi * cimg_step + xr * channels; + + if(channels == 1) + val += compute_1(left_x, right_x, cdata_weight, cmax_data_term); + else + val += compute_3(left_x, right_x, cdata_weight, cmax_data_term); + } + } + } + data_cost[cdisp_step1 * d] = val; + } + } +} +//////////////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////compute_data_cost_reduce////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////////////// +__kernel void compute_data_cost_reduce_0(__global const short* selected_disp_pyr, __global short* data_cost_, + __global uchar *cleft, __global uchar *cright,__local float *smem, + int level, int rows, int cols, int h, int nr_plane, + int channels, int winsz, + int cmsg_step1, int cmsg_step2, int cdisp_step1, int cdisp_step2, + float cdata_weight, float cmax_data_term, int cimg_step,int cth) + +{ + int x_out = get_group_id(0); + int y_out = get_group_id(1) % h; + int d = (get_group_id(1)/ h) * get_local_size(2) + get_local_id(2); + + int tid = get_local_id(0); + + __global const short* selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step2 + x_out/2; + __global short* data_cost = data_cost_ + 
y_out * cmsg_step1 + x_out; + + if (d < nr_plane) + { + int sel_disp = selected_disparity[d * cdisp_step2]; + + int x0 = x_out << level; + int y0 = y_out << level; + + int len = min(y0 + winsz, rows) - y0; + + float val = 0.0f; + if (x0 + tid < cols) + { + if (x0 + tid - sel_disp < 0 || sel_disp < cth) + val = cdata_weight * cmax_data_term * len; + else + { + __global uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid ); + __global uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp); + + for(int y = 0; y < len; ++y) + { + if(channels == 1) + val += compute_1(lle, lri, cdata_weight, cmax_data_term); + else + val += compute_3(lle, lri, cdata_weight, cmax_data_term); + + lle += cimg_step; + lri += cimg_step; + } + } + } + + __local float* dline = smem + winsz * get_local_id(2); + + dline[tid] = val; + } + + barrier(CLK_LOCAL_MEM_FENCE); + if(d < nr_plane) + { + + // if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); } + //if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); } + + __local volatile float* vdline = smem + winsz * get_local_id(2); + + if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32]; + if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16]; + if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8]; + if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4]; + if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2]; + if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1]; + + if (tid == 0) + data_cost[cdisp_step1 * d] = convert_short_sat_rte(vdline[0]); + } +} + +__kernel void compute_data_cost_reduce_1(__global const float *selected_disp_pyr, __global float *data_cost_, + __global uchar *cleft, __global uchar *cright, __local float *smem, + int level, int rows, int cols, int h, int nr_plane, + int channels, int winsz, + int cmsg_step1, int cmsg_step2, int cdisp_step1,int cdisp_step2, float 
cdata_weight, + float cmax_data_term, int cimg_step, int cth) + +{ + int x_out = get_group_id(0); + int y_out = get_group_id(1) % h; + int d = (get_group_id(1)/ h) * get_local_size(2) + get_local_id(2); + + int tid = get_local_id(0); + + __global const float *selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step2 + x_out/2; + __global float *data_cost = data_cost_ + y_out * cmsg_step1 + x_out; + + if (d < nr_plane) + { + int sel_disp = selected_disparity[d * cdisp_step2]; + + int x0 = x_out << level; + int y0 = y_out << level; + + int len = min(y0 + winsz, rows) - y0; + + float val = 0.0f; + if (x0 + tid < cols) + { + if (x0 + tid - sel_disp < 0 || sel_disp < cth) + val = cdata_weight * cmax_data_term * len; + else + { + __global uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid ); + __global uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp); + + for(int y = 0; y < len; ++y) + { + if(channels == 1) + val += compute_1(lle, lri, cdata_weight, cmax_data_term); + else + val += compute_3(lle, lri, cdata_weight, cmax_data_term); + + lle += cimg_step; + lri += cimg_step; + } + } + } + + __local float* dline = smem + winsz * get_local_id(2); + + dline[tid] = val; + } + + barrier(CLK_LOCAL_MEM_FENCE); + if(d < nr_plane) + { + + //if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); } + //if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); } + + __local volatile float* vdline = smem + winsz * get_local_id(2); + + if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32]; + if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16]; + if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8]; + if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4]; + if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2]; + if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1]; + + if (tid == 0) + data_cost[cdisp_step1 * d] = 
vdline[0]; + } +} + +/////////////////////////////////////////////////////////////// +//////////////////////// init message ///////////////////////// +/////////////////////////////////////////////////////////////// +void get_first_k_element_increase_0(__global short* u_new, __global short *d_new, __global short *l_new, + __global short *r_new, __global const short *u_cur, __global const short *d_cur, + __global const short *l_cur, __global const short *r_cur, + __global short *data_cost_selected, __global short *disparity_selected_new, + __global short *data_cost_new, __global const short* data_cost_cur, + __global const short *disparity_selected_cur, + int nr_plane, int nr_plane2, + int cdisp_step1, int cdisp_step2) +{ + for(int i = 0; i < nr_plane; i++) + { + short minimum = SHRT_MAX; + int id = 0; + for(int j = 0; j < nr_plane2; j++) + { + short cur = data_cost_new[j * cdisp_step1]; + if(cur < minimum) + { + minimum = cur; + id = j; + } + } + + data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1]; + disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2]; + + u_new[i * cdisp_step1] = u_cur[id * cdisp_step2]; + d_new[i * cdisp_step1] = d_cur[id * cdisp_step2]; + l_new[i * cdisp_step1] = l_cur[id * cdisp_step2]; + r_new[i * cdisp_step1] = r_cur[id * cdisp_step2]; + + data_cost_new[id * cdisp_step1] = SHRT_MAX; + } +} +void get_first_k_element_increase_1(__global float *u_new, __global float *d_new, __global float *l_new, + __global float *r_new, __global const float *u_cur, __global const float *d_cur, + __global const float *l_cur, __global const float *r_cur, + __global float *data_cost_selected, __global float *disparity_selected_new, + __global float *data_cost_new, __global const float *data_cost_cur, + __global const float *disparity_selected_cur, + int nr_plane, int nr_plane2, + int cdisp_step1, int cdisp_step2) +{ + for(int i = 0; i < nr_plane; i++) + { + float minimum = FLT_MAX; + int id = 0; + + for(int j = 0; 
j < nr_plane2; j++) + { + float cur = data_cost_new[j * cdisp_step1]; + if(cur < minimum) + { + minimum = cur; + id = j; + } + } + + data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1]; + disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2]; + + u_new[i * cdisp_step1] = u_cur[id * cdisp_step2]; + d_new[i * cdisp_step1] = d_cur[id * cdisp_step2]; + l_new[i * cdisp_step1] = l_cur[id * cdisp_step2]; + r_new[i * cdisp_step1] = r_cur[id * cdisp_step2]; + data_cost_new[id * cdisp_step1] = FLT_MAX; + + } +} +__kernel void init_message_0(__global short *u_new_, __global short *d_new_, __global short *l_new_, + __global short *r_new_, __global short *u_cur_, __global const short *d_cur_, + __global const short *l_cur_, __global const short *r_cur_, __global short *ctemp, + __global short *selected_disp_pyr_new, __global const short *selected_disp_pyr_cur, + __global short *data_cost_selected_, __global const short *data_cost_, + int h, int w, int nr_plane, int h2, int w2, int nr_plane2, + int cdisp_step1, int cdisp_step2, int cmsg_step1, int cmsg_step2) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + __global const short *u_cur = u_cur_ + min(h2-1, y/2 + 1) * cmsg_step2 + x/2; + __global const short *d_cur = d_cur_ + max(0, y/2 - 1) * cmsg_step2 + x/2; + __global const short *l_cur = l_cur_ + y/2 * cmsg_step2 + min(w2-1, x/2 + 1); + __global const short *r_cur = r_cur_ + y/2 * cmsg_step2 + max(0, x/2 - 1); + + __global short *data_cost_new = ctemp + y * cmsg_step1 + x; + + __global const short *disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step2 + x/2; + __global const short *data_cost = data_cost_ + y * cmsg_step1 + x; + + for(int d = 0; d < nr_plane2; d++) + { + int idx2 = d * cdisp_step2; + + short val = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2]; + data_cost_new[d * cdisp_step1] = val; + } + + __global short *data_cost_selected = 
data_cost_selected_ + y * cmsg_step1 + x; + __global short *disparity_selected_new = selected_disp_pyr_new + y * cmsg_step1 + x; + + __global short *u_new = u_new_ + y * cmsg_step1 + x; + __global short *d_new = d_new_ + y * cmsg_step1 + x; + __global short *l_new = l_new_ + y * cmsg_step1 + x; + __global short *r_new = r_new_ + y * cmsg_step1 + x; + + u_cur = u_cur_ + y/2 * cmsg_step2 + x/2; + d_cur = d_cur_ + y/2 * cmsg_step2 + x/2; + l_cur = l_cur_ + y/2 * cmsg_step2 + x/2; + r_cur = r_cur_ + y/2 * cmsg_step2 + x/2; + + get_first_k_element_increase_0(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur, + data_cost_selected, disparity_selected_new, data_cost_new, + data_cost, disparity_selected_cur, nr_plane, nr_plane2, + cdisp_step1, cdisp_step2); + } +} +__kernel void init_message_1(__global float *u_new_, __global float *d_new_, __global float *l_new_, + __global float *r_new_, __global float *u_cur_, __global const float *d_cur_, + __global const float *l_cur_, __global const float *r_cur_, __global float *ctemp, + __global float *selected_disp_pyr_new, __global const float *selected_disp_pyr_cur, + __global float *data_cost_selected_, __global const float *data_cost_, + int h, int w, int nr_plane, int h2, int w2, int nr_plane2, + int cdisp_step1, int cdisp_step2, int cmsg_step1, int cmsg_step2) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + __global const float *u_cur = u_cur_ + min(h2-1, y/2 + 1) * cmsg_step2 + x/2; + __global const float *d_cur = d_cur_ + max(0, y/2 - 1) * cmsg_step2 + x/2; + __global const float *l_cur = l_cur_ + y/2 * cmsg_step2 + min(w2-1, x/2 + 1); + __global const float *r_cur = r_cur_ + y/2 * cmsg_step2 + max(0, x/2 - 1); + + __global float *data_cost_new = ctemp + y * cmsg_step1 + x; + + __global const float *disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step2 + x/2; + __global const float *data_cost = data_cost_ + y * cmsg_step1 + x; + + for(int d = 0; d < nr_plane2; d++) + 
{ + int idx2 = d * cdisp_step2; + + float val = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2]; + data_cost_new[d * cdisp_step1] = val; + } + + __global float *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x; + __global float *disparity_selected_new = selected_disp_pyr_new + y * cmsg_step1 + x; + + __global float *u_new = u_new_ + y * cmsg_step1 + x; + __global float *d_new = d_new_ + y * cmsg_step1 + x; + __global float *l_new = l_new_ + y * cmsg_step1 + x; + __global float *r_new = r_new_ + y * cmsg_step1 + x; + + u_cur = u_cur_ + y/2 * cmsg_step2 + x/2; + d_cur = d_cur_ + y/2 * cmsg_step2 + x/2; + l_cur = l_cur_ + y/2 * cmsg_step2 + x/2; + r_cur = r_cur_ + y/2 * cmsg_step2 + x/2; + + get_first_k_element_increase_1(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur, + data_cost_selected, disparity_selected_new, data_cost_new, + data_cost, disparity_selected_cur, nr_plane, nr_plane2, + cdisp_step1, cdisp_step2); + } +} +/////////////////////////////////////////////////////////////// +//////////////////// calc all iterations ///////////////////// +/////////////////////////////////////////////////////////////// +void message_per_pixel_0(__global const short *data, __global short *msg_dst, __global const short *msg1, + __global const short *msg2, __global const short *msg3, + __global const short *dst_disp, __global const short *src_disp, + int nr_plane, __global short *temp, + float cmax_disc_term, int cdisp_step1, float cdisc_single_jump) +{ + short minimum = SHRT_MAX; + for(int d = 0; d < nr_plane; d++) + { + int idx = d * cdisp_step1; + short val = data[idx] + msg1[idx] + msg2[idx] + msg3[idx]; + + if(val < minimum) + minimum = val; + + msg_dst[idx] = val; + } + + float sum = 0; + for(int d = 0; d < nr_plane; d++) + { + float cost_min = minimum + cmax_disc_term; + short src_disp_reg = src_disp[d * cdisp_step1]; + + for(int d2 = 0; d2 < nr_plane; d2++) + cost_min = fmin(cost_min, (msg_dst[d2 * cdisp_step1] + + 
cdisc_single_jump * abs(dst_disp[d2 * cdisp_step1] - src_disp_reg))); + + temp[d * cdisp_step1] = convert_short_sat_rte(cost_min); + sum += cost_min; + } + sum /= nr_plane; + + for(int d = 0; d < nr_plane; d++) + msg_dst[d * cdisp_step1] = convert_short_sat_rte(temp[d * cdisp_step1] - sum); +} +void message_per_pixel_1(__global const float *data, __global float *msg_dst, __global const float *msg1, + __global const float *msg2, __global const float *msg3, + __global const float *dst_disp, __global const float *src_disp, + int nr_plane, __global float *temp, + float cmax_disc_term, int cdisp_step1, float cdisc_single_jump) +{ + float minimum = FLT_MAX; + for(int d = 0; d < nr_plane; d++) + { + int idx = d * cdisp_step1; + float val = data[idx] + msg1[idx] + msg2[idx] + msg3[idx]; + + if(val < minimum) + minimum = val; + + msg_dst[idx] = val; + } + + float sum = 0; + for(int d = 0; d < nr_plane; d++) + { + float cost_min = minimum + cmax_disc_term; + float src_disp_reg = src_disp[d * cdisp_step1]; + + for(int d2 = 0; d2 < nr_plane; d2++) + cost_min = fmin(cost_min, (msg_dst[d2 * cdisp_step1] + + cdisc_single_jump * fabs(dst_disp[d2 * cdisp_step1] - src_disp_reg))); + + temp[d * cdisp_step1] = cost_min; + sum += cost_min; + } + sum /= nr_plane; + + for(int d = 0; d < nr_plane; d++) + msg_dst[d * cdisp_step1] = temp[d * cdisp_step1] - sum; +} +__kernel void compute_message_0(__global short *u_, __global short *d_, __global short *l_, __global short *r_, + __global const short *data_cost_selected, __global const short *selected_disp_pyr_cur, + __global short *ctemp, int h, int w, int nr_plane, int i, + float cmax_disc_term, int cdisp_step1, int cmsg_step1, float cdisc_single_jump) +{ + int y = get_global_id(1); + int x = ((get_global_id(0)) << 1) + ((y + i) & 1); + + if (y > 0 && y < h - 1 && x > 0 && x < w - 1) + { + __global const short *data = data_cost_selected + y * cmsg_step1 + x; + + __global short *u = u_ + y * cmsg_step1 + x; + __global short *d = d_ + y * 
cmsg_step1 + x; + __global short *l = l_ + y * cmsg_step1 + x; + __global short *r = r_ + y * cmsg_step1 + x; + + __global const short *disp = selected_disp_pyr_cur + y * cmsg_step1 + x; + + __global short *temp = ctemp + y * cmsg_step1 + x; + + message_per_pixel_0(data, u, r - 1, u + cmsg_step1, l + 1, disp, disp - cmsg_step1, nr_plane, temp, + cmax_disc_term, cdisp_step1, cdisc_single_jump); + message_per_pixel_0(data, d, d - cmsg_step1, r - 1, l + 1, disp, disp + cmsg_step1, nr_plane, temp, + cmax_disc_term, cdisp_step1, cdisc_single_jump); + message_per_pixel_0(data, l, u + cmsg_step1, d - cmsg_step1, l + 1, disp, disp - 1, nr_plane, temp, + cmax_disc_term, cdisp_step1, cdisc_single_jump); + message_per_pixel_0(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp, + cmax_disc_term, cdisp_step1, cdisc_single_jump); + } +} +__kernel void compute_message_1(__global float *u_, __global float *d_, __global float *l_, __global float *r_, + __global const float *data_cost_selected, __global const float *selected_disp_pyr_cur, + __global float *ctemp, int h, int w, int nr_plane, int i, + float cmax_disc_term, int cdisp_step1, int cmsg_step1, float cdisc_single_jump) +{ + int y = get_global_id(1); + int x = ((get_global_id(0)) << 1) + ((y + i) & 1); + + if (y > 0 && y < h - 1 && x > 0 && x < w - 1) + { + __global const float *data = data_cost_selected + y * cmsg_step1 + x; + + __global float *u = u_ + y * cmsg_step1 + x; + __global float *d = d_ + y * cmsg_step1 + x; + __global float *l = l_ + y * cmsg_step1 + x; + __global float *r = r_ + y * cmsg_step1 + x; + + __global const float *disp = selected_disp_pyr_cur + y * cmsg_step1 + x; + __global float *temp = ctemp + y * cmsg_step1 + x; + + message_per_pixel_1(data, u, r - 1, u + cmsg_step1, l + 1, disp, disp - cmsg_step1, nr_plane, temp, + cmax_disc_term, cdisp_step1, cdisc_single_jump); + message_per_pixel_1(data, d, d - cmsg_step1, r - 1, l + 1, disp, disp + cmsg_step1, nr_plane, temp, + 
cmax_disc_term, cdisp_step1, cdisc_single_jump);
+        message_per_pixel_1(data, l, u + cmsg_step1, d - cmsg_step1, l + 1, disp, disp - 1, nr_plane, temp,
+                            cmax_disc_term, cdisp_step1, cdisc_single_jump);
+        message_per_pixel_1(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp,
+                            cmax_disc_term, cdisp_step1, cdisc_single_jump);
+    }
+}
+
+///////////////////////////////////////////////////////////////
+/////////////////////////// output ////////////////////////////
+///////////////////////////////////////////////////////////////
+// Picks the final disparity for every interior pixel (16-bit message variant).
+// For each of the nr_plane candidates the cost is the data term at (x, y) plus
+// the four messages read from the neighbouring pixels: u_ at row y+1, d_ at
+// row y-1, l_ at column x+1 and r_ at column x-1. The candidate disparity with
+// the smallest sum is written to disp[res_step * y + x].
+// Pixels on the outermost rows/columns are skipped and left unwritten.
+__kernel void compute_disp_0(__global const short *u_, __global const short *d_, __global const short *l_,
+                             __global const short *r_, __global const short * data_cost_selected,
+                             __global const short *disp_selected_pyr,
+                             __global short* disp,
+                             int res_step, int cols, int rows, int nr_plane,
+                             int cmsg_step1, int cdisp_step1)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (y > 0 && y < rows - 1 && x > 0 && x < cols - 1)
+    {
+        __global const short *data = data_cost_selected + y * cmsg_step1 + x;
+        __global const short *disp_selected = disp_selected_pyr + y * cmsg_step1 + x;
+
+        __global const short *u = u_ + (y+1) * cmsg_step1 + (x+0);
+        __global const short *d = d_ + (y-1) * cmsg_step1 + (x+0);
+        __global const short *l = l_ + (y+0) * cmsg_step1 + (x+1);
+        __global const short *r = r_ + (y+0) * cmsg_step1 + (x-1);
+
+        short best = 0;
+        short best_val = SHRT_MAX;
+
+        // Candidate planes are cdisp_step1 elements apart.
+        for (int i = 0; i < nr_plane; ++i)
+        {
+            int idx = i * cdisp_step1;
+            short val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx];
+
+            if (val < best_val)
+            {
+                best_val = val;
+                best = disp_selected[idx];
+            }
+        }
+        disp[res_step * y + x] = best;
+    }
+}
+// Float-message variant of compute_disp_0: same neighbour layout and the same
+// interior-only guard. Costs are accumulated in float and the winning
+// candidate disparity is converted to short with saturation before it is
+// stored in disp[res_step * y + x].
+__kernel void compute_disp_1(__global const float *u_, __global const float *d_, __global const float *l_,
+                             __global const float *r_, __global const float *data_cost_selected,
+                             __global const float *disp_selected_pyr,
+                             __global short *disp,
+                             int res_step, int cols, int rows, int nr_plane,
+                             int cmsg_step1, int cdisp_step1)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (y > 0 && y < rows - 1 && x > 0 && x < cols - 1)
+    {
+        __global const float *data = data_cost_selected + y * cmsg_step1 + x;
+        __global const float *disp_selected = disp_selected_pyr + y * cmsg_step1 + x;
+
+        __global const float *u = u_ + (y+1) * cmsg_step1 + (x+0);
+        __global const float *d = d_ + (y-1) * cmsg_step1 + (x+0);
+        __global const float *l = l_ + (y+0) * cmsg_step1 + (x+1);
+        __global const float *r = r_ + (y+0) * cmsg_step1 + (x-1);
+
+        short best = 0;
+        // The running minimum must have the same type as the cost: keeping it
+        // in a short truncated every stored minimum and made any cost above
+        // SHRT_MAX unselectable.
+        float best_val = FLT_MAX;
+        for (int i = 0; i < nr_plane; ++i)
+        {
+            int idx = i * cdisp_step1;
+            float val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx];
+
+            if (val < best_val)
+            {
+                best_val = val;
+                best = convert_short_sat_rte(disp_selected[idx]);
+            }
+        }
+        disp[res_step * y + x] = best;
+    }
+}
+
diff --git a/modules/ocl/src/stereo_csbp.cpp b/modules/ocl/src/stereo_csbp.cpp
new file mode 100644
index 0000000..f18b6ba
--- /dev/null
+++ b/modules/ocl/src/stereo_csbp.cpp
@@ -0,0 +1,792 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+// +// @Authors +// Jia Haipeng, jiahaipeng95@gmail.com +// Jin Ma, jin@multicorewareinc.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "precomp.hpp" + +using namespace cv; +using namespace cv::ocl; +using namespace std; + +#if !defined (HAVE_OPENCL) + +namespace cv +{ + namespace ocl + { + + void cv::ocl::StereoConstantSpaceBP::estimateRecommendedParams(int, int, int &, int &, int &, int &) + { + throw_nogpu(); + } + cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, int) + { + throw_nogpu(); + } + cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, float, float, + float, float, int, int) + { + throw_nogpu(); + } + + void cv::ocl::StereoConstantSpaceBP::operator()(const oclMat &, const oclMat &, oclMat &) + { + throw_nogpu(); + } + } +} + +#else /* !defined (HAVE_OPENCL) */ + +namespace cv +{ + namespace ocl + { + + ///////////////////////////OpenCL kernel strings/////////////////////////// + extern const char *stereocsbp; + } + +} +namespace cv +{ + namespace ocl + { + namespace stereoCSBP + { + ////////////////////////////////////////////////////////////////////////// + //////////////////////////////common//////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + static inline int divUp(int total, int grain) + { + return (total + grain - 1) / grain; + } + static string get_kernel_name(string kernel_name, int data_type) + { + stringstream idxStr; + if(data_type == CV_16S) + idxStr << "0"; + else + idxStr << "1"; + kernel_name += idxStr.str(); + + return kernel_name; + } + using cv::ocl::StereoConstantSpaceBP; + ////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////init_data_cost////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////////// + static void init_data_cost_caller(const oclMat &left, const oclMat &right, oclMat &temp, + StereoConstantSpaceBP &rthis, + int msg_step, int h, int w, int level) + { + Context *clCxt = left.clCxt; + int data_type = 
rthis.msg_type; + int channels = left.oclchannels(); + + string kernelName = get_kernel_name("init_data_cost_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + + //size_t blockSize = 256; + size_t localThreads[] = {32, 8 ,1}; + size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], + divUp(h, localThreads[1]) *localThreads[1], + 1 + }; + + int cdisp_step1 = msg_step * h; + openCLVerifyKernel(clCxt, kernel, localThreads); + openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&temp.data)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&left.data)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&right.data)); + openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h)); + openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&level)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&channels)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&msg_step)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&rthis.data_weight)); + openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_float), (void *)&rthis.max_data_term)); + openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&cdisp_step1)); + openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&rthis.min_disp_th)); + openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&left.step)); + openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&rthis.ndisp)); + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + globalThreads, localThreads, 0, NULL, NULL)); + + clFinish(*(cl_command_queue*)getoclCommandQueue()); + openCLSafeCall(clReleaseKernel(kernel)); + } + + static void init_data_cost_reduce_caller(const oclMat &left, const oclMat &right, oclMat &temp, + 
StereoConstantSpaceBP &rthis, + int msg_step, int h, int w, int level) + { + + Context *clCxt = left.clCxt; + int data_type = rthis.msg_type; + int channels = left.oclchannels(); + int win_size = (int)std::pow(2.f, level); + + string kernelName = get_kernel_name("init_data_cost_reduce_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + + const int threadsNum = 256; + //size_t blockSize = threadsNum; + size_t localThreads[3] = {win_size, 1, threadsNum / win_size}; + size_t globalThreads[3] = {w *localThreads[0], + h * divUp(rthis.ndisp, localThreads[2]) *localThreads[1], 1 * localThreads[2] + }; + + int local_mem_size = threadsNum * sizeof(float); + int cdisp_step1 = msg_step * h; + + openCLVerifyKernel(clCxt, kernel, localThreads); + + openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&temp.data)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&left.data)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&right.data)); + openCLSafeCall(clSetKernelArg(kernel, 3, local_mem_size, (void *)NULL)); + openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&level)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&left.rows)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&h)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&win_size)); + openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&channels)); + openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&rthis.ndisp)); + openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&left.step)); + openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_float), (void *)&rthis.data_weight)); + openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_float), (void *)&rthis.max_data_term)); + openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void 
*)&rthis.min_disp_th)); + openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&cdisp_step1)); + openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&msg_step)); + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL, + globalThreads, localThreads, 0, NULL, NULL)); + clFinish(*(cl_command_queue*)getoclCommandQueue()); + openCLSafeCall(clReleaseKernel(kernel)); + } + + static void get_first_initial_local_caller(uchar *data_cost_selected, uchar *disp_selected_pyr, + oclMat &temp, StereoConstantSpaceBP &rthis, + int h, int w, int nr_plane, int msg_step) + { + Context *clCxt = temp.clCxt; + int data_type = rthis.msg_type; + + string kernelName = get_kernel_name("get_first_k_initial_local_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + + //size_t blockSize = 256; + size_t localThreads[] = {32, 8 ,1}; + size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], + divUp(h, localThreads[1]) *localThreads[1], + 1 + }; + + int disp_step = msg_step * h; + openCLVerifyKernel(clCxt, kernel, localThreads); + openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_cost_selected)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&disp_selected_pyr)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&temp.data)); + openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h)); + openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&nr_plane)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp)); + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + globalThreads, localThreads, 
0, NULL, NULL)); + + clFinish(*(cl_command_queue*)getoclCommandQueue()); + openCLSafeCall(clReleaseKernel(kernel)); + } + static void get_first_initial_global_caller(uchar *data_cost_selected, uchar *disp_selected_pyr, + oclMat &temp, StereoConstantSpaceBP &rthis, + int h, int w, int nr_plane, int msg_step) + { + Context *clCxt = temp.clCxt; + int data_type = rthis.msg_type; + + string kernelName = get_kernel_name("get_first_k_initial_global_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + + //size_t blockSize = 256; + size_t localThreads[] = {32, 8, 1}; + size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], + divUp(h, localThreads[1]) *localThreads[1], + 1 + }; + + int disp_step = msg_step * h; + openCLVerifyKernel(clCxt, kernel, localThreads); + openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_cost_selected)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&disp_selected_pyr)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&temp.data)); + openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h)); + openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&nr_plane)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp)); + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + globalThreads, localThreads, 0, NULL, NULL)); + + clFinish(*(cl_command_queue*)getoclCommandQueue()); + openCLSafeCall(clReleaseKernel(kernel)); + } + + static void init_data_cost(const oclMat &left, const oclMat &right, oclMat &temp, StereoConstantSpaceBP &rthis, + uchar *disp_selected_pyr, uchar *data_cost_selected, + size_t msg_step, int h, 
int w, int level, int nr_plane) + { + + if(level <= 1) + init_data_cost_caller(left, right, temp, rthis, msg_step, h, w, level); + else + init_data_cost_reduce_caller(left, right, temp, rthis, msg_step, h, w, level); + + if(rthis.use_local_init_data_cost == true) + { + get_first_initial_local_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w, nr_plane, msg_step); + } + else + { + get_first_initial_global_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w, + nr_plane, msg_step); + } + } + + /////////////////////////////////////////////////////////////////////////////////////////////////// + ///////////////////////////////////compute_data_cost////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////////////////////// + static void compute_data_cost_caller(uchar *disp_selected_pyr, uchar *data_cost, + StereoConstantSpaceBP &rthis, int msg_step1, + int msg_step2, const oclMat &left, const oclMat &right, int h, + int w, int h2, int level, int nr_plane) + { + Context *clCxt = left.clCxt; + int channels = left.oclchannels(); + int data_type = rthis.msg_type; + + string kernelName = get_kernel_name("compute_data_cost_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + + //size_t blockSize = 256; + size_t localThreads[] = {32, 8, 1}; + size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], + divUp(h, localThreads[1]) *localThreads[1], + 1 + }; + + int disp_step1 = msg_step1 * h; + int disp_step2 = msg_step2 * h2; + openCLVerifyKernel(clCxt, kernel, localThreads); + openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disp_selected_pyr)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&data_cost)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&left.data)); + openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&right.data)); + 
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&h)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&w)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&level)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&nr_plane)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&channels)); + openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&msg_step1)); + openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&msg_step2)); + openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&disp_step1)); + openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step2)); + openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_float), (void *)&rthis.data_weight)); + openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.max_data_term)); + openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&left.step)); + openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&rthis.min_disp_th)); + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + globalThreads, localThreads, 0, NULL, NULL)); + + clFinish(*(cl_command_queue*)getoclCommandQueue()); + openCLSafeCall(clReleaseKernel(kernel)); + } + static void compute_data_cost_reduce_caller(uchar *disp_selected_pyr, uchar *data_cost, + StereoConstantSpaceBP &rthis, int msg_step1, + int msg_step2, const oclMat &left, const oclMat &right, int h, + int w, int h2, int level, int nr_plane) + { + Context *clCxt = left.clCxt; + int data_type = rthis.msg_type; + int channels = left.oclchannels(); + int win_size = (int)std::pow(2.f, level); + + string kernelName = get_kernel_name("compute_data_cost_reduce_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + + const size_t threadsNum = 256; + //size_t blockSize = threadsNum; + size_t localThreads[3] = {win_size, 1, threadsNum / 
win_size}; + size_t globalThreads[3] = {w *localThreads[0], + h * divUp(nr_plane, localThreads[2]) *localThreads[1], 1 * localThreads[2] + }; + + int disp_step1 = msg_step1 * h; + int disp_step2 = msg_step2 * h2; + size_t local_mem_size = threadsNum * sizeof(float); + openCLVerifyKernel(clCxt, kernel, localThreads); + openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disp_selected_pyr)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&data_cost)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&left.data)); + openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&right.data)); + openCLSafeCall(clSetKernelArg(kernel, 4, local_mem_size, (void *)NULL)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&level)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.rows)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&left.cols)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&h)); + openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&nr_plane)); + openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&channels)); + openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&win_size)); + openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&msg_step1)); + openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&msg_step2)); + openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&disp_step1)); + openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&disp_step2)); + openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_float), (void *)&rthis.data_weight)); + openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_float), (void *)&rthis.max_data_term)); + openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int), (void *)&left.step)); + openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int), (void *)&rthis.min_disp_th)); + 
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL, + globalThreads, localThreads, 0, NULL, NULL)); + + clFinish(*(cl_command_queue*)getoclCommandQueue()); + openCLSafeCall(clReleaseKernel(kernel)); + } + static void compute_data_cost(uchar *disp_selected_pyr, uchar *data_cost, StereoConstantSpaceBP &rthis, + int msg_step1, int msg_step2, const oclMat &left, const oclMat &right, int h, int w, + int h2, int level, int nr_plane) + { + if(level <= 1) + compute_data_cost_caller(disp_selected_pyr, data_cost, rthis, msg_step1, msg_step2, + left, right, h, w, h2, level, nr_plane); + else + compute_data_cost_reduce_caller(disp_selected_pyr, data_cost, rthis, msg_step1, msg_step2, + left, right, h, w, h2, level, nr_plane); + } + //////////////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////init message////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////////////////////// + static void init_message(uchar *u_new, uchar *d_new, uchar *l_new, uchar *r_new, + uchar *u_cur, uchar *d_cur, uchar *l_cur, uchar *r_cur, + uchar *disp_selected_pyr_new, uchar *disp_selected_pyr_cur, + uchar *data_cost_selected, uchar *data_cost, oclMat &temp, StereoConstantSpaceBP rthis, + size_t msg_step1, size_t msg_step2, int h, int w, int nr_plane, + int h2, int w2, int nr_plane2) + { + Context *clCxt = temp.clCxt; + int data_type = rthis.msg_type; + + string kernelName = get_kernel_name("init_message_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + + //size_t blockSize = 256; + size_t localThreads[] = {32, 8, 1}; + size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], + divUp(h, localThreads[1]) *localThreads[1], + 1 + }; + + int disp_step1 = msg_step1 * h; + int disp_step2 = msg_step2 * h2; + openCLVerifyKernel(clCxt, kernel, localThreads); 
+ openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u_new)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_new)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l_new)); + openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r_new)); + openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&u_cur)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&d_cur)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&l_cur)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *)&r_cur)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *)&temp.data)); + openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *)&disp_selected_pyr_new)); + openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *)&disp_selected_pyr_cur)); + openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *)&data_cost_selected)); + openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *)&data_cost)); + openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&h)); + openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&w)); + openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&nr_plane)); + openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&h2)); + openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_int), (void *)&w2)); + openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int), (void *)&nr_plane2)); + openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int), (void *)&disp_step1)); + openCLSafeCall(clSetKernelArg(kernel, 20, sizeof(cl_int), (void *)&disp_step2)); + openCLSafeCall(clSetKernelArg(kernel, 21, sizeof(cl_int), (void *)&msg_step1)); + openCLSafeCall(clSetKernelArg(kernel, 22, sizeof(cl_int), (void *)&msg_step2)); + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + globalThreads, localThreads, 0, NULL, NULL)); + + 
clFinish(*(cl_command_queue*)getoclCommandQueue()); + openCLSafeCall(clReleaseKernel(kernel)); + } + //////////////////////////////////////////////////////////////////////////////////////////////// + ///////////////////////////calc_all_iterations//////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////////////////////// + static void calc_all_iterations_caller(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected, + uchar *disp_selected_pyr, oclMat &temp, StereoConstantSpaceBP rthis, + int msg_step, int h, int w, int nr_plane, int i) + { + Context *clCxt = temp.clCxt; + int data_type = rthis.msg_type; + + string kernelName = get_kernel_name("compute_message_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + size_t localThreads[] = {32, 8, 1}; + size_t globalThreads[] = {divUp(w, (localThreads[0]) << 1) *localThreads[0], + divUp(h, localThreads[1]) *localThreads[1], + 1 + }; + + int disp_step = msg_step * h; + openCLVerifyKernel(clCxt, kernel, localThreads); + openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l)); + openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r)); + openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&data_cost_selected)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&disp_selected_pyr)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&temp.data)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&h)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&w)); + openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&nr_plane)); + openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&i)); + openCLSafeCall(clSetKernelArg(kernel, 11, 
sizeof(cl_float), (void *)&rthis.max_disc_term));
+                openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step));
+                openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&msg_step));
+                openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.disc_single_jump));
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
+                                                      globalThreads, localThreads, 0, NULL, NULL));
+
+                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                openCLSafeCall(clReleaseKernel(kernel));
+            }
+            // Runs calc_all_iterations_caller rthis.iters times for one pyramid level.
+            // Every argument is forwarded unchanged; the extra trailing argument is t & 1,
+            // so it alternates 0, 1, 0, 1, ... from one iteration to the next
+            // (presumably the parity of the update pass -- confirm against the kernel).
+            // NOTE(review): rthis is taken by value here, so the StereoConstantSpaceBP object
+            // is copied on every call, whereas compute_disp below takes it by reference.
+            static void calc_all_iterations(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
+                                            uchar *disp_selected_pyr, oclMat &temp, StereoConstantSpaceBP rthis,
+                                            int msg_step, int h, int w, int nr_plane)
+            {
+                for(int t = 0; t < rthis.iters; t++)
+                    calc_all_iterations_caller(u, d, l, r, data_cost_selected, disp_selected_pyr, temp, rthis,
+                                               msg_step, h, w, nr_plane, t & 1);
+            }
+
+            ///////////////////////////////////////////////////////////////////////////////////////////////
+            //////////////////////////compute_disp////////////////////////////////////////////////////////
+            /////////////////////////////////////////////////////////////////////////////////////////////
+            // Launches the "compute_disp_*" kernel with disp as its output matrix.
+            //   u, d, l, r          message buffers, forwarded as kernel args 0-3
+            //   data_cost_selected  forwarded as kernel arg 4
+            //   disp_selected_pyr   forwarded as kernel arg 5
+            //   rthis               only msg_type is read, to select the kernel name
+            //   msg_step            kernel arg 11; also multiplied by disp.rows to form arg 12
+            //   disp                its data, element stride, cols and rows become args 6-9
+            //   nr_plane            forwarded as kernel arg 10
+            // The function blocks on clFinish and then releases the kernel object.
+            // The uchar* parameters are handed to clSetKernelArg with sizeof(cl_mem); the caller
+            // in this file passes oclMat::data for each of them.
+            static void compute_disp(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
+                                     uchar *disp_selected_pyr, StereoConstantSpaceBP &rthis, size_t msg_step,
+                                     oclMat &disp, int nr_plane)
+            {
+                Context *clCxt = disp.clCxt;
+                int data_type = rthis.msg_type;
+
+                // Kernel name is derived by get_kernel_name from the prefix and the message type.
+                string kernelName = get_kernel_name("compute_disp_", data_type);
+
+                cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
+
+                //size_t blockSize = 256;
+                // 32x8 work-groups; the global size is a whole number of groups per dimension
+                // (assuming divUp is a ceiling division -- TODO confirm).
+                size_t localThreads[] = {32, 8, 1};
+                size_t globalThreads[] = {divUp(disp.cols, localThreads[0]) *localThreads[0],
+                                          divUp(disp.rows, localThreads[1]) *localThreads[1],
+                                          1
+                                         };
+
+                // Row stride of disp expressed in elements instead of bytes.
+                int step_size = disp.step / disp.elemSize();
+                // NOTE(review): size_t product narrowed to int.
+                int disp_step = disp.rows * msg_step;
+                openCLVerifyKernel(clCxt, kernel, localThreads);
+                openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u));
+                openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d));
+                openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l));
+                openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r));
+                openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&data_cost_selected));
+                openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&disp_selected_pyr));
+                openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&disp.data));
+                openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&step_size));
+                openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&disp.cols));
+                openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&disp.rows));
+                openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&nr_plane));
+                // NOTE(review): msg_step is a size_t but is passed with sizeof(cl_int), so only its
+                // first sizeof(cl_int) bytes reach the kernel (the low half on little-endian hosts only).
+                openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&msg_step));
+                openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step));
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
+                                                      globalThreads, localThreads, 0, NULL, NULL));
+
+                // Wait for the queue to drain before releasing the kernel; the clFinish status is not checked.
+                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                openCLSafeCall(clReleaseKernel(kernel));
+            }
+        }
+    }
+}
+namespace
+{
+    // Defaults used by the five-argument StereoConstantSpaceBP constructor below.
+    const float DEFAULT_MAX_DATA_TERM = 30.0f;
+    const float DEFAULT_DATA_WEIGHT = 1.0f;
+    const float DEFAULT_MAX_DISC_TERM = 160.0f;
+    const float DEFAULT_DISC_SINGLE_JUMP = 10.0f;
+
+    // Debug helper: copies rows * cols * channels elements of type T from mat.data to the host
+    // with a blocking read and prints the matrix metadata plus a fixed 100 x 30 window of values.
+    // NOTE(review): the template parameter list is missing on the next line although T is used
+    // in the body -- an angle-bracket span appears to have been lost; restore it from upstream.
+    // NOTE(review): data_1 is allocated with new[] and never released.
+    template
+    void print_gpu_mat(const oclMat &mat)
+    {
+        T *data_1 = new T[mat.rows * mat.cols * mat.channels()];
+        Context *clCxt = mat.clCxt;
+        // CL_TRUE makes the read blocking, so data_1 is filled when the call returns.
+        int status = clEnqueueReadBuffer(clCxt -> impl->clCmdQueue, (cl_mem)mat.data, CL_TRUE, 0,
+                                         mat.rows * mat.cols * mat.channels() * sizeof(T), data_1, 0, NULL, NULL);
+
+        // A failed read is only reported; printing continues with whatever data_1 holds.
+        if(status != CL_SUCCESS)
+            cout << "error " << status << endl;
+
+        cout << ".........................................................." << endl;
+        cout << "elemSize() " << mat.elemSize() << endl;
+        // NOTE(review): this label repeats "elemSize()" but the value printed is elemSize1().
+        cout << "elemSize() " << mat.elemSize1() << endl;
+        cout << "channels: " << mat.channels() << endl;
+        cout << "rows: " << mat.rows << endl;
+        cout << "cols: " << mat.cols << endl;
+
+        // NOTE(review): the 100 x 30 bounds are hard-coded; for a matrix with fewer than 100 rows
+        // (or fewer than 30 values per row in the last row) the index runs past the end of data_1.
+        for(int i = 0; i < 100; i++)
+        {
+            for(int j = 0; j < 30; j++)
+            {
+                cout << (int)data_1[i * mat.cols * mat.channels() + j] << " ";
+            }
+            cout << endl;
+        }
+    }
+}
+
+
+// Derives suggested solver parameters from the image size and writes them to the four
+// output references:
+//   ndisp     width / 3.14 truncated to int, then incremented if odd
+//   iters     max(width, height) / 100, minus 4 when that maximum exceeds 1200, otherwise plus 4
+//   levels    see the note on the expression below; never left at 0
+//   nr_plane  ndisp / 2^(levels + 1), truncated to int
+void cv::ocl::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane)
+{
+    ndisp = (int) ((float) width / 3.14f);
+    if ((ndisp & 1) != 0)
+        ndisp++;
+
+    int mm = ::max(width, height);
+    iters = mm / 100 + ((mm > 1200) ? - 4 : 4);
+
+    // The (int) cast binds to the log() result, so the logarithm is truncated first and
+    // "* 2 / 3" is then integer arithmetic.
+    // NOTE(review): static_cast has no target type here -- the template argument appears to
+    // have been stripped (likely a floating-point type); restore it from upstream.
+    levels = (int)::log(static_cast(mm)) * 2 / 3;
+    if (levels == 0) levels++;
+
+    // NOTE(review): evaluates to 0 whenever ndisp < 2^(levels + 1).
+    nr_plane = (int) ((float) ndisp / std::pow(2.0, levels + 1));
+}
+
+// Constructs a solver from the four structural parameters and the message type.
+// The cost terms take the DEFAULT_* constants defined above, min_disp_th is 0 and
+// use_local_init_data_cost is true. Asserts that msg_type_ is CV_32F or CV_16S.
+cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
+        int msg_type_)
+
+    : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
+      max_data_term(DEFAULT_MAX_DATA_TERM), data_weight(DEFAULT_DATA_WEIGHT),
+      max_disc_term(DEFAULT_MAX_DISC_TERM), disc_single_jump(DEFAULT_DISC_SINGLE_JUMP), min_disp_th(0),
+      msg_type(msg_type_), use_local_init_data_cost(true)
+{
+    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
+}
+
+
+// Constructs a solver with every parameter given explicitly; use_local_init_data_cost is
+// still set to true. Asserts that msg_type_ is CV_32F or CV_16S.
+cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
+        float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_,
+        int min_disp_th_, int msg_type_)
+    : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
+      max_data_term(max_data_term_), data_weight(data_weight_),
+      max_disc_term(max_disc_term_), disc_single_jump(disc_single_jump_), min_disp_th(min_disp_th_),
+      msg_type(msg_type_), use_local_init_data_cost(true)
+{
+    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
+}
+
+// Full solver pipeline for one message element type T.
+//   rthis                       solver parameters; rthis.levels is overwritten (clamped) here
+//   u, d, l, r                  two sets of message buffers, (re)allocated and zeroed
+//   disp_selected_pyr           two buffers, (re)allocated and zeroed
+//   data_cost, data_cost_selected, temp, out
+//                               scratch matrices owned by the caller, (re)allocated here
+//   left, right                 input pair; must be CV_8UC1 or CV_8UC3
+//   disp                        result; created as CV_16S when empty, otherwise its type is kept
+// Processing goes from the last pyramid level (index levels - 1) down to level 0 and then
+// calls compute_disp on the buffers of level 0.
+// NOTE(review): the template parameter list is missing on the next line although T is used
+// in the body -- an angle-bracket span appears to have been lost; restore it from upstream.
+template
+static void csbp_operator(StereoConstantSpaceBP &rthis, oclMat u[2], oclMat d[2], oclMat l[2], oclMat r[2],
+                          oclMat disp_selected_pyr[2], oclMat &data_cost, oclMat &data_cost_selected,
+                          oclMat &temp, oclMat &out, const oclMat &left, const oclMat &right, oclMat &disp)
+{
+    // Parameter sanity and left/right agreement are checked with the debug-only assert;
+    // the level cap and the accepted input types are checked with CV_Assert.
+    CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
+                 && left.rows == right.rows && left.cols == right.cols && left.type() == right.type());
+
+    CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3));
+
+    const Scalar zero = Scalar::all(0);
+
+    ////////////////////////////////////Init///////////////////////////////////////////////////
+    int rows = left.rows;
+    int cols = left.cols;
+
+    // Clamp the level count to the truncated base-2 logarithm of ndisp. This writes back into
+    // the caller's object, so the public field changes as a side effect of running the solver.
+    rthis.levels = min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0)));
+    int levels = rthis.levels;
+
+    // One allocation of levels * 4 ints, split below into four per-level arrays.
+    // NOTE(review): AutoBuffer has no element type here -- template argument appears stripped.
+    AutoBuffer buf(levels * 4);
+
+    int *cols_pyr = buf;
+    int *rows_pyr = cols_pyr + levels;
+    int *nr_plane_pyr = rows_pyr + levels;
+    int *step_pyr = nr_plane_pyr + levels;
+
+    cols_pyr[0] = cols;
+    rows_pyr[0] = rows;
+    nr_plane_pyr[0] = rthis.nr_plane;
+
+    // Row width in elements of T after alignSize is applied to the row size in bytes with n = 64
+    // (presumably rounding up to a multiple of 64 bytes -- TODO confirm alignSize semantics).
+    const int n = 64;
+    step_pyr[0] = alignSize(cols * sizeof(T), n) / sizeof(T);
+    // Each further level halves cols and rows (integer division) and doubles the plane count.
+    for (int i = 1; i < levels; i++)
+    {
+        cols_pyr[i] = cols_pyr[i - 1] / 2;
+        rows_pyr[i] = rows_pyr[i - 1]/ 2;
+
+        nr_plane_pyr[i] = nr_plane_pyr[i - 1] * 2;
+
+        // Computed for every level, but only step_pyr[0] is passed to the calls below.
+        step_pyr[i] = alignSize(cols_pyr[i] * sizeof(T), n) / sizeof(T);
+    }
+
+    // All buffers are sized from level 0; data_cost gets twice the height of a message buffer.
+    Size msg_size(step_pyr[0], rows * nr_plane_pyr[0]);
+    Size data_cost_size(step_pyr[0], rows * nr_plane_pyr[0] * 2);
+
+    // NOTE(review): every DataType::type below lacks its template argument (appears stripped).
+    u[0].create(msg_size, DataType::type);
+    d[0].create(msg_size, DataType::type);
+    l[0].create(msg_size, DataType::type);
+    r[0].create(msg_size, DataType::type);
+
+    u[1].create(msg_size, DataType::type);
+    d[1].create(msg_size, DataType::type);
+    l[1].create(msg_size, DataType::type);
+    r[1].create(msg_size, DataType::type);
+
+    disp_selected_pyr[0].create(msg_size, DataType::type);
+    disp_selected_pyr[1].create(msg_size, DataType::type);
+
+    data_cost.create(data_cost_size, DataType::type);
+    data_cost_selected.create(msg_size, DataType::type);
+
+    // temp takes the larger of data_cost_size and step_pyr[0] x (rows of the last level * ndisp).
+    Size temp_size = data_cost_size;
+    if (data_cost_size.width * data_cost_size.height < step_pyr[0] * rows_pyr[levels - 1] * rthis.ndisp)
+        temp_size = Size(step_pyr[0], rows_pyr[levels - 1] * rthis.ndisp);
+
+    temp.create(temp_size, DataType::type);
+    temp = zero;
+
+    ///////////////////////////////// Compute////////////////////////////////////////////////
+
+    //csbp::load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight,
+    //    rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);
+
+    l[0] = zero;
+    d[0] = zero;
+    r[0] = zero;
+    u[0] = zero;
+    disp_selected_pyr[0] = zero;
+
+    l[1] = zero;
+    d[1] = zero;
+    r[1] = zero;
+    u[1] = zero;
+    disp_selected_pyr[1] = zero;
+
+    data_cost = zero;
+
+    data_cost_selected = zero;
+
+    // Index (0 or 1) of the buffer set that holds the current level's state.
+    int cur_idx = 0;
+
+    for (int i = levels - 1; i >= 0; i--)
+    {
+        if (i == levels - 1)
+        {
+            // First processed level: initialise the data cost directly from the images.
+            cv::ocl::stereoCSBP::init_data_cost(left, right, temp, rthis, disp_selected_pyr[cur_idx].data,
+                                                data_cost_selected.data, step_pyr[0], rows_pyr[i], cols_pyr[i],
+                                                i, nr_plane_pyr[i]);
+        }
+        else
+        {
+            // Later levels: compute the data cost using the rows and plane count of level i + 1 ...
+            cv::ocl::stereoCSBP::compute_data_cost(
+                disp_selected_pyr[cur_idx].data, data_cost.data, rthis, step_pyr[0],
+                step_pyr[0], left, right, rows_pyr[i], cols_pyr[i], rows_pyr[i + 1], i,
+                nr_plane_pyr[i + 1]);
+
+            int new_idx = (cur_idx + 1) & 1;
+
+            // ... then initialise the other buffer set from the current one and switch to it.
+            cv::ocl::stereoCSBP::init_message(u[new_idx].data, d[new_idx].data, l[new_idx].data, r[new_idx].data,
+                                              u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
+                                              disp_selected_pyr[new_idx].data, disp_selected_pyr[cur_idx].data,
+                                              data_cost_selected.data, data_cost.data, temp, rthis, step_pyr[0],
+                                              step_pyr[0], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rows_pyr[i + 1],
+                                              cols_pyr[i + 1], nr_plane_pyr[i + 1]);
+            cur_idx = new_idx;
+        }
+        cv::ocl::stereoCSBP::calc_all_iterations(u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
+                data_cost_selected.data, disp_selected_pyr[cur_idx].data, temp,
+                rthis, step_pyr[0], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i]);
+    }
+
+    if (disp.empty())
+        disp.create(rows, cols, CV_16S);
+
+    // out is assigned from disp when disp is already CV_16S; otherwise out is created as a
+    // separate CV_16S matrix and converted into disp after the kernel has run
+    // (presumably the oclMat assignment shares the buffer rather than copying -- confirm).
+    out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
+    out = zero;
+
+    stereoCSBP::compute_disp(u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
+                             data_cost_selected.data, disp_selected_pyr[cur_idx].data, rthis, step_pyr[0],
+                             out, nr_plane_pyr[0]);
+    if (disp.type() != CV_16S)
+        out.convertTo(disp, disp.type());
+}
+
+
+// Signature shared by the csbp_operator instantiations stored in the dispatch table below.
+typedef void (*csbp_operator_t)(StereoConstantSpaceBP &rthis, oclMat u[2], oclMat d[2], oclMat l[2], oclMat r[2],
+                                oclMat disp_selected_pyr[2], oclMat &data_cost, oclMat &data_cost_selected,
+                                oclMat &temp, oclMat &out, const oclMat &left, const oclMat &right, oclMat &disp);
+
+// Dispatch table indexed by msg_type; only slots 3 and 5 are populated.
+// NOTE(review): both entries read plain csbp_operator -- their template arguments appear to
+// have been stripped (presumably the CV_16S and CV_32F element types); restore from upstream.
+const static csbp_operator_t operators[] = {0, 0, 0, csbp_operator, 0, csbp_operator, 0, 0};
+
+// Computes the disparity map for a stereo pair by dispatching to the csbp_operator entry
+// selected by msg_type, passing this object's own buffers as scratch space.
+// The assert keeps msg_type on one of the two values the constructors accept before indexing.
+void cv::ocl::StereoConstantSpaceBP::operator()(const oclMat &left, const oclMat &right, oclMat &disp)
+{
+
+    CV_Assert(msg_type == CV_32F || msg_type == CV_16S);
+    operators[msg_type](*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out,
+                        left, right, disp);
+}
+
+#endif /* !defined (HAVE_OPENCL) */
diff --git a/modules/ocl/test/test_calib3d.cpp b/modules/ocl/test/test_calib3d.cpp
index 179829e..befc059 100644
--- a/modules/ocl/test/test_calib3d.cpp
+++ b/modules/ocl/test/test_calib3d.cpp
@@ -59,7 +59,7 @@ PARAM_TEST_CASE(StereoMatchBM, int, int)
     virtual void SetUp()
     {
         n_disp = GET_PARAM(0);
-        winSize = GET_PARAM(1);
+        winSize = GET_PARAM(1);
     }
 };
 
@@ -69,27 +69,27 @@ TEST_P(StereoMatchBM, Regression)
     Mat left_image = readImage("stereobm/aloe-L.png", IMREAD_GRAYSCALE);
     Mat right_image = readImage("stereobm/aloe-R.png", IMREAD_GRAYSCALE);
     Mat disp_gold = readImage("stereobm/aloe-disp.png", IMREAD_GRAYSCALE);
-    ocl::oclMat d_left, d_right;
-    ocl::oclMat d_disp(left_image.size(), CV_8U);
-    Mat disp;
+    ocl::oclMat d_left, d_right;
+    ocl::oclMat d_disp(left_image.size(), CV_8U);
+    Mat disp;
 
     ASSERT_FALSE(left_image.empty());
     ASSERT_FALSE(right_image.empty());
     ASSERT_FALSE(disp_gold.empty());
 
-    d_left.upload(left_image);
-    d_right.upload(right_image);
+    d_left.upload(left_image);
+    d_right.upload(right_image);
 
     ocl::StereoBM_OCL bm(0, n_disp, winSize);
 
     bm(d_left, d_right, d_disp);
-    d_disp.download(disp);
+    d_disp.download(disp);
 
     EXPECT_MAT_SIMILAR(disp_gold, disp, 1e-3);
 }
 
 INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchBM, testing::Combine(testing::Values(128),
-                        testing::Values(19)));
+                        testing::Values(19)));
 
 PARAM_TEST_CASE(StereoMatchBP, int, int, int, float, float, float, float)
 {
@@ -134,4 +134,64 @@ TEST_P(StereoMatchBP, Regression)
 INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchBP, testing::Combine(testing::Values(64),
                         testing::Values(8),testing::Values(2),testing::Values(25.0f),
                         testing::Values(0.1f),testing::Values(15.0f),testing::Values(1.0f)));
+
+//////////////////////////////////////////////////////////////////////////
+// ConstSpaceBeliefPropagation
+// Fixture for the StereoConstantSpaceBP regression test. Parameter order:
+// ndisp, iters, levels, nr_plane, max_data_term, data_weight, max_disc_term,
+// disc_single_jump, min_disp_th, msg_type.
+PARAM_TEST_CASE(StereoMatchConstSpaceBP, int, int, int, int, float, float, float, float, int, int)
+{
+    int ndisp_;
+    int iters_;
+    int levels_;
+    int nr_plane_;
+    float max_data_term_;
+    float data_weight_;
+    float max_disc_term_;
+    float disc_single_jump_;
+    int min_disp_th_;
+    int msg_type_;
+
+    virtual void SetUp()
+    {
+        ndisp_ = GET_PARAM(0);
+        iters_ = GET_PARAM(1);
+        levels_ = GET_PARAM(2);
+        nr_plane_ = GET_PARAM(3);
+        max_data_term_ = GET_PARAM(4);
+        data_weight_ = GET_PARAM(5);
+        max_disc_term_ = GET_PARAM(6);
+        disc_single_jump_ = GET_PARAM(7);
+        min_disp_th_ = GET_PARAM(8);
+        msg_type_ = GET_PARAM(9);
+    }
+};
+// Runs the solver on the aloe pair and compares the result with the stored reference image.
+TEST_P(StereoMatchConstSpaceBP, Regression)
+{
+    Mat left_image = readImage("csstereobp/aloe-L.png");
+    Mat right_image = readImage("csstereobp/aloe-R.png");
+    Mat disp_gold = readImage("csstereobp/aloe-disp.png", IMREAD_GRAYSCALE);
+
+    ocl::oclMat d_left, d_right;
+    ocl::oclMat d_disp;
+
+    Mat disp;
+    ASSERT_FALSE(left_image.empty());
+    ASSERT_FALSE(right_image.empty());
+    ASSERT_FALSE(disp_gold.empty());
+
+    d_left.upload(left_image);
+    d_right.upload(right_image);
+
+    // NOTE(review): the last two arguments are the literals 0 and CV_32F, not the fixture's
+    // min_disp_th_ and msg_type_; they coincide with the only instantiation below.
+    ocl::StereoConstantSpaceBP bp(ndisp_, iters_, levels_, nr_plane_, max_data_term_, data_weight_,
+                                  max_disc_term_, disc_single_jump_, 0, CV_32F);
+    bp(d_left, d_right, d_disp);
+    d_disp.download(disp);
+    // Bring the result to the depth of the reference image before comparing.
+    disp.convertTo(disp, disp_gold.depth());
+
+    EXPECT_MAT_SIMILAR(disp_gold, disp, 1e-4);
+    //EXPECT_MAT_NEAR(disp_gold, disp, 1.0, "");
+}
+INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchConstSpaceBP, testing::Combine(testing::Values(128),
+                        testing::Values(16),testing::Values(4), testing::Values(4), testing::Values(30.0f),
+                        testing::Values(1.0f),testing::Values(160.0f),
+                        testing::Values(10.0f), testing::Values(0), testing::Values(CV_32F)));
 #endif // HAVE_OPENCL
-- 
2.7.4