1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
12 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
18 // Jia Haipeng, jiahaipeng95@gmail.com
19 // Peng Xiao, pengxiao@outlook.com
21 // Redistribution and use in source and binary forms, with or without modification,
22 // are permitted provided that the following conditions are met:
24 // * Redistribution's of source code must retain the above copyright notice,
25 // this list of conditions and the following disclaimer.
27 // * Redistribution's in binary form must reproduce the above copyright notice,
28 // this list of conditions and the following disclaimer in the documentation
29 // and/or other materials provided with the distribution.
31 // * The name of the copyright holders may not be used to endorse or promote products
32 // derived from this software without specific prior written permission.
34 // This software is provided by the copyright holders and contributors as is and
35 // any express or implied warranties, including, but not limited to, the implied
36 // warranties of merchantability and fitness for a particular purpose are disclaimed.
37 // In no event shall the Intel Corporation or contributors be liable for any direct,
38 // indirect, incidental, special, exemplary, or consequential damages
39 // (including, but not limited to, procurement of substitute goods or services;
40 // loss of use, data, or profits; or business interruption) however caused
41 // and on any theory of liability, whether in contract, strict liability,
42 // or tort (including negligence or otherwise) arising in any way out of
43 // the use of this software, even if advised of the possibility of such damage.
47 #if defined (DOUBLE_SUPPORT)
50 #pragma OPENCL EXTENSION cl_khr_fp64:enable
51 #elif defined (cl_amd_fp64)
52 #pragma OPENCL EXTENSION cl_amd_fp64:enable
65 ///////////////////////////////////////////////////////////////
66 /////////////////common///////////////////////////////////////
67 /////////////////////////////////////////////////////////////
68 inline T saturate_cast(float v){
70 return convert_short_sat_rte(v);
76 inline T4 saturate_cast4(float4 v){
78 return convert_short4_sat_rte(v);
84 #define FLOAT_MAX 3.402823466e+38f
91 float cdisc_single_jump;
93 ///////////////////////////////////////////////////////////////
94 ////////////////////////// comp data //////////////////////////
95 ///////////////////////////////////////////////////////////////
97 inline float pix_diff_1(const uchar4 l, __global const uchar *rs)
99 return abs((int)(l.x) - *rs);
102 static float pix_diff_4(const uchar4 l, __global const uchar *rs)
105 r = *((__global uchar4 *)rs);
107 const float tr = 0.299f;
108 const float tg = 0.587f;
109 const float tb = 0.114f;
113 val = tb * abs((int)l.x - r.x);
114 val += tg * abs((int)l.y - r.y);
115 val += tr * abs((int)l.z - r.z);
120 inline float pix_diff_3(const uchar4 l, __global const uchar *rs)
122 return pix_diff_4(l, rs);
133 #define CAT(X,Y) X##Y
134 #define CAT2(X,Y) CAT(X,Y)
136 #define PIX_DIFF CAT2(pix_diff_, CN)
138 __kernel void comp_data(__global uchar *left, int left_rows, int left_cols, int left_step,
139 __global uchar *right, int right_step,
140 __global T *data, int data_step,
141 __constant con_srtuct_t *con_st)
143 int x = get_global_id(0);
144 int y = get_global_id(1);
146 if (y > 0 && y < (left_rows - 1) && x > 0 && x < (left_cols - 1))
148 data_step /= sizeof(T);
149 const __global uchar* ls = left + y * left_step + x * CN;
150 const __global uchar* rs = right + y * right_step + x * CN;
152 __global T *ds = data + y * data_step + x;
154 const unsigned int disp_step = data_step * left_rows;
155 const float weightXterm = con_st -> cdata_weight * con_st -> cmax_data_term;
156 const uchar4 ls_data = vload4(0, ls);
158 for (int disp = 0; disp < con_st -> cndisp; disp++)
163 val = PIX_DIFF(ls_data, rs - disp * CN);
164 ds[disp * disp_step] = saturate_cast(fmin(con_st -> cdata_weight * val, weightXterm));
168 ds[disp * disp_step] = saturate_cast(weightXterm);
174 ///////////////////////////////////////////////////////////////
175 //////////////////////// data step down ///////////////////////
176 ///////////////////////////////////////////////////////////////
177 __kernel void data_step_down(__global T *src, int src_rows,
178 __global T *dst, int dst_rows, int dst_cols,
179 int src_step, int dst_step,
182 const int x = get_global_id(0);
183 const int y = get_global_id(1);
185 if (x < dst_cols && y < dst_rows)
187 src_step /= sizeof(T);
188 dst_step /= sizeof(T);
189 int4 coor_step = (int4)(src_rows * src_step);
190 int4 coor = (int4)(min(2*y+0, src_rows-1) * src_step + 2*x+0,
191 min(2*y+1, src_rows-1) * src_step + 2*x+0,
192 min(2*y+0, src_rows-1) * src_step + 2*x+1,
193 min(2*y+1, src_rows-1) * src_step + 2*x+1);
195 for (int d = 0; d < cndisp; ++d)
198 dst_reg = src[coor.x];
199 dst_reg += src[coor.y];
200 dst_reg += src[coor.z];
201 dst_reg += src[coor.w];
204 dst[(d * dst_rows + y) * dst_step + x] = saturate_cast(dst_reg);
209 ///////////////////////////////////////////////////////////////
210 /////////////////// level up messages ////////////////////////
211 ///////////////////////////////////////////////////////////////
212 __kernel void level_up_message(__global T *src, int src_rows, int src_step,
213 __global T *dst, int dst_rows, int dst_cols, int dst_step,
216 const int x = get_global_id(0);
217 const int y = get_global_id(1);
219 if (x < dst_cols && y < dst_rows)
221 src_step /= sizeof(T);
222 dst_step /= sizeof(T);
224 const int dst_disp_step = dst_step * dst_rows;
225 const int src_disp_step = src_step * src_rows;
227 __global T *dstr = dst + y * dst_step + x;
228 __global const T *srcr = src + (y / 2 * src_step) + (x / 2);
230 for (int d = 0; d < cndisp; ++d)
231 dstr[d * dst_disp_step] = srcr[d * src_disp_step];
235 ///////////////////////////////////////////////////////////////
236 //////////////////// calc all iterations /////////////////////
237 ///////////////////////////////////////////////////////////////
238 static void message(__global T *us_, __global T *ds_, __global T *ls_, __global T *rs_,
239 const __global T *dt,
240 int u_step, int msg_disp_step, int data_disp_step,
241 float4 cmax_disc_term, float4 cdisc_single_jump)
243 __global T *us = us_ + u_step;
244 __global T *ds = ds_ - u_step;
245 __global T *ls = ls_ + 1;
246 __global T *rs = rs_ - 1;
248 float4 minimum = (float4)(FLOAT_MAX);
261 prev = (float4)(t_us + t_ls + t_rs + t_dt,
262 t_ds + t_ls + t_rs + t_dt,
263 t_us + t_ds + t_rs + t_dt,
264 t_us + t_ds + t_ls + t_dt);
266 minimum = min(prev, minimum);
268 t_dst[0] = saturate_cast4(prev);
270 for(int i = 1, idx = msg_disp_step; i < CNDISP; ++i, idx+=msg_disp_step)
276 t_dt = dt[data_disp_step * i];
278 dst_reg = (float4)(t_us + t_ls + t_rs + t_dt,
279 t_ds + t_ls + t_rs + t_dt,
280 t_us + t_ds + t_rs + t_dt,
281 t_us + t_ds + t_ls + t_dt);
283 minimum = min(dst_reg, minimum);
285 prev += cdisc_single_jump;
286 prev = min(prev, dst_reg);
288 t_dst[i] = saturate_cast4(prev);
291 minimum += cmax_disc_term;
293 float4 sum = (float4)(0);
294 prev = convert_float4(t_dst[CNDISP - 1]);
295 for (int disp = CNDISP - 2; disp >= 0; disp--)
297 prev += cdisc_single_jump;
298 cur = convert_float4(t_dst[disp]);
299 prev = min(prev, cur);
300 cur = min(prev, minimum);
303 t_dst[disp] = saturate_cast4(cur);
306 dst_reg = convert_float4(t_dst[CNDISP - 1]);
307 dst_reg = min(dst_reg, minimum);
308 t_dst[CNDISP - 1] = saturate_cast4(dst_reg);
311 sum /= (float4)(CNDISP);
313 for(int i = 0, idx = 0; i < CNDISP; ++i, idx+=msg_disp_step)
316 us_[idx] = dst.x - sum.x;
317 ds_[idx] = dst.y - sum.y;
318 rs_[idx] = dst.z - sum.z;
319 ls_[idx] = dst.w - sum.w;
322 __kernel void one_iteration(__global T *u, int u_step,
323 __global T *data, int data_step,
324 __global T *d, __global T *l, __global T *r,
325 int t, int cols, int rows,
326 float cmax_disc_term, float cdisc_single_jump)
328 const int y = get_global_id(1);
329 const int x = ((get_global_id(0)) << 1) + ((y + t) & 1);
331 if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
334 data_step /= sizeof(T);
336 __global T *us = u + y * u_step + x;
337 __global T *ds = d + y * u_step + x;
338 __global T *ls = l + y * u_step + x;
339 __global T *rs = r + y * u_step + x;
340 const __global T *dt = data + y * data_step + x;
342 int msg_disp_step = u_step * rows;
343 int data_disp_step = data_step * rows;
345 message(us, ds, ls, rs, dt,
346 u_step, msg_disp_step, data_disp_step,
347 (float4)(cmax_disc_term), (float4)(cdisc_single_jump));
351 ///////////////////////////////////////////////////////////////
352 /////////////////////////// output ////////////////////////////
353 ///////////////////////////////////////////////////////////////
354 __kernel void output(const __global T *u, int u_step,
355 const __global T *d, const __global T *l,
356 const __global T *r, const __global T *data,
357 __global T *disp, int disp_rows, int disp_cols, int disp_step,
360 const int x = get_global_id(0);
361 const int y = get_global_id(1);
363 if (y > 0 && y < disp_rows - 1 && x > 0 && x < disp_cols - 1)
366 disp_step /= sizeof(T);
367 const __global T *us = u + (y + 1) * u_step + x;
368 const __global T *ds = d + (y - 1) * u_step + x;
369 const __global T *ls = l + y * u_step + (x + 1);
370 const __global T *rs = r + y * u_step + (x - 1);
371 const __global T *dt = data + y * u_step + x;
373 int disp_steps = disp_rows * u_step;
376 float best_val = FLOAT_MAX;
377 for (int d = 0; d < cndisp; ++d)
380 val = us[d * disp_steps];
381 val += ds[d * disp_steps];
382 val += ls[d * disp_steps];
383 val += rs[d * disp_steps];
384 val += dt[d * disp_steps];
393 (disp + y * disp_step)[x] = convert_short_sat(best);