modules/ocl/src/opencl/stereobp.cl

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Jia Haipeng, jiahaipeng95@gmail.com
  19 //    Peng Xiao,   pengxiao@outlook.com
  20 //
  21 // Redistribution and use in source and binary forms, with or without modification,
  22 // are permitted provided that the following conditions are met:
  23 //
  24 //   * Redistribution's of source code must retain the above copyright notice,
  25 //     this list of conditions and the following disclaimer.
  26 //
  27 //   * Redistribution's in binary form must reproduce the above copyright notice,
  28 //     this list of conditions and the following disclaimer in the documentation
  29 //     and/or other materials provided with the distribution.
  30 //
  31 //   * The name of the copyright holders may not be used to endorse or promote products
  32 //     derived from this software without specific prior written permission.
  33 //
  34 // This software is provided by the copyright holders and contributors as is and
  35 // any express or implied warranties, including, but not limited to, the implied
  36 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  37 // In no event shall the Intel Corporation or contributors be liable for any direct,
  38 // indirect, incidental, special, exemplary, or consequential damages
  39 // (including, but not limited to, procurement of substitute goods or services;
  40 // loss of use, data, or profits; or business interruption) however caused
  41 // and on any theory of liability, whether in contract, strict liability,
  42 // or tort (including negligence or otherwise) arising in any way out of
  43 // the use of this software, even if advised of the possibility of such damage.
  44 //
  45 //M*/
  46
  47 #if defined (DOUBLE_SUPPORT)
  48
  49 #ifdef cl_khr_fp64
  50 #pragma OPENCL EXTENSION cl_khr_fp64:enable
  51 #elif defined (cl_amd_fp64)
  52 #pragma OPENCL EXTENSION cl_amd_fp64:enable
  53 #endif
  54
  55 #endif
  56
  57 #ifdef T_FLOAT
  58 #define T float
  59 #define T4 float4
  60 #else
  61 #define T short
  62 #define T4 short4
  63 #endif
  64
  65 ///////////////////////////////////////////////////////////////
  66 /////////////////common///////////////////////////////////////
  67 /////////////////////////////////////////////////////////////
  68 inline T saturate_cast(float v){
  69 #ifdef T_SHORT
  70     return convert_short_sat_rte(v);
  71 #else
  72     return v;
  73 #endif
  74 }
  75
  76 inline T4 saturate_cast4(float4 v){
  77 #ifdef T_SHORT
  78     return convert_short4_sat_rte(v);
  79 #else
  80     return v;
  81 #endif
  82 }
  83
  84 #define FLOAT_MAX 3.402823466e+38f
  85 typedef struct
  86 {
  87     int   cndisp;
  88     float cmax_data_term;
  89     float cdata_weight;
  90     float cmax_disc_term;
  91     float cdisc_single_jump;
  92 }con_srtuct_t;
  93 ///////////////////////////////////////////////////////////////
  94 ////////////////////////// comp data //////////////////////////
  95 ///////////////////////////////////////////////////////////////
  96
  97 inline float pix_diff_1(const uchar4 l, __global const uchar *rs)
  98 {
  99     return abs((int)(l.x) - *rs);
 100 }
 101
 102 static float pix_diff_4(const uchar4 l, __global const uchar *rs)
 103 {
 104     uchar4 r;
 105     r = *((__global uchar4 *)rs);
 106
 107     const float tr = 0.299f;
 108     const float tg = 0.587f;
 109     const float tb = 0.114f;
 110
 111     float val;
 112
 113     val  = tb * abs((int)l.x - r.x);
 114     val += tg * abs((int)l.y - r.y);
 115     val += tr * abs((int)l.z - r.z);
 116
 117     return val;
 118 }
 119
 120 inline float pix_diff_3(const uchar4 l, __global const uchar *rs)
 121 {
 122     return pix_diff_4(l, rs);
 123 }
 124
 125 #ifndef CN
 126 #define CN 4
 127 #endif
 128
 129 #ifndef CNDISP
 130 #define CNDISP 64
 131 #endif
 132
 133 #define CAT(X,Y) X##Y
 134 #define CAT2(X,Y) CAT(X,Y)
 135
 136 #define PIX_DIFF CAT2(pix_diff_, CN)
 137
 138 __kernel void comp_data(__global uchar *left,  int left_rows,  int left_cols,  int left_step,
 139                         __global uchar *right, int right_step,
 140                         __global T *data, int data_step,
 141                         __constant con_srtuct_t *con_st)
 142 {
 143     int x = get_global_id(0);
 144     int y = get_global_id(1);
 145
 146     if (y > 0 && y < (left_rows - 1) && x > 0 && x < (left_cols - 1))
 147     {
 148         data_step /= sizeof(T);
 149         const __global uchar* ls = left  + y * left_step  + x * CN;
 150         const __global uchar* rs = right + y * right_step + x * CN;
 151
 152         __global T *ds = data + y * data_step + x;
 153
 154         const unsigned int disp_step = data_step * left_rows;
 155         const float weightXterm = con_st -> cdata_weight * con_st -> cmax_data_term;
 156         const uchar4 ls_data = vload4(0, ls);
 157
 158         for (int disp = 0; disp < con_st -> cndisp; disp++)
 159         {
 160             if (x - disp >= 1)
 161             {
 162                 float val = 0;
 163                 val = PIX_DIFF(ls_data, rs - disp * CN);
 164                 ds[disp * disp_step] =  saturate_cast(fmin(con_st -> cdata_weight * val, weightXterm));
 165             }
 166             else
 167             {
 168                 ds[disp * disp_step] =  saturate_cast(weightXterm);
 169             }
 170         }
 171     }
 172 }
 173
 174 ///////////////////////////////////////////////////////////////
 175 //////////////////////// data step down ///////////////////////
 176 ///////////////////////////////////////////////////////////////
 177 __kernel void data_step_down(__global T *src, int src_rows,
 178                              __global T *dst, int dst_rows, int dst_cols,
 179                              int src_step, int dst_step,
 180                              int cndisp)
 181 {
 182     const int x = get_global_id(0);
 183     const int y = get_global_id(1);
 184
 185     if (x < dst_cols && y < dst_rows)
 186     {
 187         src_step /= sizeof(T);
 188         dst_step /= sizeof(T);
 189         int4 coor_step = (int4)(src_rows * src_step);
 190         int4 coor = (int4)(min(2*y+0, src_rows-1) * src_step + 2*x+0,
 191                            min(2*y+1, src_rows-1) * src_step + 2*x+0,
 192                            min(2*y+0, src_rows-1) * src_step + 2*x+1,
 193                            min(2*y+1, src_rows-1) * src_step + 2*x+1);
 194
 195         for (int d = 0; d < cndisp; ++d)
 196         {
 197             float dst_reg;
 198             dst_reg  = src[coor.x];
 199             dst_reg += src[coor.y];
 200             dst_reg += src[coor.z];
 201             dst_reg += src[coor.w];
 202             coor += coor_step;
 203
 204             dst[(d * dst_rows + y) * dst_step + x] = saturate_cast(dst_reg);
 205         }
 206     }
 207 }
 208
 209 ///////////////////////////////////////////////////////////////
 210 /////////////////// level up messages  ////////////////////////
 211 ///////////////////////////////////////////////////////////////
 212 __kernel void level_up_message(__global T *src, int src_rows, int src_step,
 213                                __global T *dst, int dst_rows, int dst_cols, int dst_step,
 214                                int cndisp)
 215 {
 216     const int x = get_global_id(0);
 217     const int y = get_global_id(1);
 218
 219     if (x < dst_cols && y < dst_rows)
 220     {
 221         src_step /= sizeof(T);
 222         dst_step /= sizeof(T);
 223
 224         const int dst_disp_step = dst_step * dst_rows;
 225         const int src_disp_step = src_step * src_rows;
 226
 227         __global T       *dstr = dst + y * dst_step + x;
 228         __global const T *srcr = src + (y / 2 * src_step) + (x / 2);
 229
 230         for (int d = 0; d < cndisp; ++d)
 231             dstr[d * dst_disp_step] = srcr[d * src_disp_step];
 232     }
 233 }
 234
 235 ///////////////////////////////////////////////////////////////
 236 ////////////////////  calc all iterations /////////////////////
 237 ///////////////////////////////////////////////////////////////
 238 static void message(__global T *us_, __global T *ds_, __global T *ls_, __global T *rs_,
 239               const __global T *dt,
 240               int u_step, int msg_disp_step, int data_disp_step,
 241               float4 cmax_disc_term, float4 cdisc_single_jump)
 242 {
 243     __global T *us = us_ + u_step;
 244     __global T *ds = ds_ - u_step;
 245     __global T *ls = ls_ + 1;
 246     __global T *rs = rs_ - 1;
 247
 248     float4 minimum = (float4)(FLOAT_MAX);
 249
 250     T4 t_dst[CNDISP];
 251     float4 dst_reg;
 252     float4 prev;
 253     float4 cur;
 254
 255     T t_us = us[0];
 256     T t_ds = ds[0];
 257     T t_ls = ls[0];
 258     T t_rs = rs[0];
 259     T t_dt = dt[0];
 260
 261     prev = (float4)(t_us + t_ls + t_rs + t_dt,
 262                     t_ds + t_ls + t_rs + t_dt,
 263                     t_us + t_ds + t_rs + t_dt,
 264                     t_us + t_ds + t_ls + t_dt);
 265
 266     minimum = min(prev, minimum);
 267
 268     t_dst[0] = saturate_cast4(prev);
 269
 270     for(int i = 1, idx = msg_disp_step; i < CNDISP; ++i, idx+=msg_disp_step)
 271     {
 272         t_us = us[idx];
 273         t_ds = ds[idx];
 274         t_ls = ls[idx];
 275         t_rs = rs[idx];
 276         t_dt = dt[data_disp_step * i];
 277
 278         dst_reg = (float4)(t_us + t_ls + t_rs + t_dt,
 279                            t_ds + t_ls + t_rs + t_dt,
 280                            t_us + t_ds + t_rs + t_dt,
 281                            t_us + t_ds + t_ls + t_dt);
 282
 283         minimum = min(dst_reg, minimum);
 284
 285         prev += cdisc_single_jump;
 286         prev = min(prev, dst_reg);
 287
 288         t_dst[i] = saturate_cast4(prev);
 289     }
 290
 291     minimum += cmax_disc_term;
 292
 293     float4 sum = (float4)(0);
 294     prev = convert_float4(t_dst[CNDISP - 1]);
 295     for (int disp = CNDISP - 2; disp >= 0; disp--)
 296     {
 297         prev += cdisc_single_jump;
 298         cur = convert_float4(t_dst[disp]);
 299         prev = min(prev, cur);
 300         cur = min(prev, minimum);
 301         sum += cur;
 302
 303         t_dst[disp] = saturate_cast4(cur);
 304     }
 305
 306     dst_reg = convert_float4(t_dst[CNDISP - 1]);
 307     dst_reg = min(dst_reg, minimum);
 308     t_dst[CNDISP - 1] = saturate_cast4(dst_reg);
 309     sum += dst_reg;
 310
 311     sum /= (float4)(CNDISP);
 312 #pragma unroll
 313     for(int i = 0, idx = 0; i < CNDISP; ++i, idx+=msg_disp_step)
 314     {
 315         T4 dst = t_dst[i];
 316         us_[idx] = dst.x - sum.x;
 317         ds_[idx] = dst.y - sum.y;
 318         rs_[idx] = dst.z - sum.z;
 319         ls_[idx] = dst.w - sum.w;
 320     }
 321 }
 322 __kernel void one_iteration(__global T *u,    int u_step,
 323                             __global T *data, int data_step,
 324                             __global T *d,    __global T *l, __global T *r,
 325                             int t, int cols, int rows,
 326                             float cmax_disc_term, float cdisc_single_jump)
 327 {
 328     const int y = get_global_id(1);
 329     const int x = ((get_global_id(0)) << 1) + ((y + t) & 1);
 330
 331     if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
 332     {
 333         u_step    /= sizeof(T);
 334         data_step /= sizeof(T);
 335
 336         __global T *us = u + y * u_step + x;
 337         __global T *ds = d + y * u_step + x;
 338         __global T *ls = l + y * u_step + x;
 339         __global T *rs = r + y * u_step + x;
 340         const __global  T *dt = data + y * data_step + x;
 341
 342         int msg_disp_step = u_step * rows;
 343         int data_disp_step = data_step * rows;
 344
 345         message(us, ds, ls, rs, dt,
 346                 u_step, msg_disp_step, data_disp_step,
 347                 (float4)(cmax_disc_term), (float4)(cdisc_single_jump));
 348     }
 349 }
 350
 351 ///////////////////////////////////////////////////////////////
 352 /////////////////////////// output ////////////////////////////
 353 ///////////////////////////////////////////////////////////////
 354 __kernel void output(const __global T *u, int u_step,
 355                      const __global T *d, const __global T *l,
 356                      const __global T *r, const __global T *data,
 357                      __global T *disp, int disp_rows, int disp_cols, int disp_step,
 358                      int cndisp)
 359 {
 360     const int x = get_global_id(0);
 361     const int y = get_global_id(1);
 362
 363     if (y > 0 && y < disp_rows - 1 && x > 0 && x < disp_cols - 1)
 364     {
 365         u_step    /= sizeof(T);
 366         disp_step /= sizeof(T);
 367         const __global T *us = u + (y + 1) * u_step + x;
 368         const __global T *ds = d + (y - 1) * u_step + x;
 369         const __global T *ls = l + y * u_step + (x + 1);
 370         const __global T *rs = r + y * u_step + (x - 1);
 371         const __global T *dt = data + y * u_step + x;
 372
 373         int disp_steps = disp_rows * u_step;
 374
 375         int best = 0;
 376         float best_val = FLOAT_MAX;
 377         for (int d = 0; d < cndisp; ++d)
 378         {
 379             float val;
 380             val  = us[d * disp_steps];
 381             val += ds[d * disp_steps];
 382             val += ls[d * disp_steps];
 383             val += rs[d * disp_steps];
 384             val += dt[d * disp_steps];
 385
 386             if (val < best_val)
 387             {
 388                 best_val = val;
 389                 best = d;
 390             }
 391         }
 392
 393         (disp + y * disp_step)[x] = convert_short_sat(best);
 394     }
 395 }