modules/ocl/src/opencl/optical_flow_farneback.cl

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Sen Liu, swjtuls1987@126.com
  19 //
  20 // Redistribution and use in source and binary forms, with or without modification,
  21 // are permitted provided that the following conditions are met:
  22 //
  23 //   * Redistribution's of source code must retain the above copyright notice,
  24 //     this list of conditions and the following disclaimer.
  25 //
  26 //   * Redistribution's in binary form must reproduce the above copyright notice,
  27 //     this list of conditions and the following disclaimer in the documentation
  28 //     and/or other oclMaterials provided with the distribution.
  29 //
  30 //   * The name of the copyright holders may not be used to endorse or promote products
  31 //     derived from this software without specific prior written permission.
  32 //
  33 // This software is provided by the copyright holders and contributors as is and
  34 // any express or implied warranties, including, but not limited to, the implied
  35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  36 // In no event shall the Intel Corporation or contributors be liable for any direct,
  37 // indirect, incidental, special, exemplary, or consequential damages
  38 // (including, but not limited to, procurement of substitute goods or services;
  39 // loss of use, data, or profits; or business interruption) however caused
  40 // and on any theory of liability, whether in contract, strict liability,
  41 // or tort (including negligence or otherwise) arising in any way out of
  42 // the use of this software, even if advised of the possibility of such damage.
  43 //
  44 //M*/
  45
  46
  47 #define tx  get_local_id(0)
  48 #define ty  get_local_id(1)
  49 #define bx  get_group_id(0)
  50 #define bdx get_local_size(0)
  51
  52 #define BORDER_SIZE 5
  53 #define MAX_KSIZE_HALF 100
  54
  55 #ifndef polyN
  56 #define polyN 5
  57 #endif
  58
  59 __kernel void polynomialExpansion(__global float * dst,
  60                                   __global __const float * src,
  61                                   __global __const float * c_g,
  62                                   __global __const float * c_xg,
  63                                   __global __const float * c_xxg,
  64                                   __local float * smem,
  65                                   const float4 ig,
  66                                   const int height, const int width,
  67                                   int dstStep, int srcStep)
  68 {
  69     const int y = get_global_id(1);
  70     const int x = bx * (bdx - 2*polyN) + tx - polyN;
  71
  72     dstStep /= sizeof(*dst);
  73     srcStep /= sizeof(*src);
  74
  75     int xWarped;
  76     __local float *row = smem + tx;
  77
  78     if (y < height && y >= 0)
  79     {
  80         xWarped = min(max(x, 0), width - 1);
  81
  82         row[0] = src[mad24(y, srcStep, xWarped)] * c_g[0];
  83         row[bdx] = 0.f;
  84         row[2*bdx] = 0.f;
  85
  86 #pragma unroll
  87         for (int k = 1; k <= polyN; ++k)
  88         {
  89             float t0 = src[mad24(max(y - k, 0), srcStep, xWarped)];
  90             float t1 = src[mad24(min(y + k, height - 1), srcStep, xWarped)];
  91
  92             row[0] += c_g[k] * (t0 + t1);
  93             row[bdx] += c_xg[k] * (t1 - t0);
  94             row[2*bdx] += c_xxg[k] * (t0 + t1);
  95         }
  96     }
  97
  98     barrier(CLK_LOCAL_MEM_FENCE);
  99
 100     if (y < height && y >= 0 && tx >= polyN && tx + polyN < bdx && x < width)
 101     {
 102         float b1 = c_g[0] * row[0];
 103         float b3 = c_g[0] * row[bdx];
 104         float b5 = c_g[0] * row[2*bdx];
 105         float b2 = 0, b4 = 0, b6 = 0;
 106
 107 #pragma unroll
 108         for (int k = 1; k <= polyN; ++k)
 109         {
 110             b1 += (row[k] + row[-k]) * c_g[k];
 111             b4 += (row[k] + row[-k]) * c_xxg[k];
 112             b2 += (row[k] - row[-k]) * c_xg[k];
 113             b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];
 114             b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];
 115             b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];
 116         }
 117
 118         dst[mad24(y, dstStep, xWarped)] = b3*ig.s0;
 119         dst[mad24(height + y, dstStep, xWarped)] = b2*ig.s0;
 120         dst[mad24(2*height + y, dstStep, xWarped)] = b1*ig.s1 + b5*ig.s2;
 121         dst[mad24(3*height + y, dstStep, xWarped)] = b1*ig.s1 + b4*ig.s2;
 122         dst[mad24(4*height + y, dstStep, xWarped)] = b6*ig.s3;
 123     }
 124 }
 125
 126 inline int idx_row_low(const int y, const int last_row)
 127 {
 128     return abs(y) % (last_row + 1);
 129 }
 130
 131 inline int idx_row_high(const int y, const int last_row)
 132 {
 133     return abs(last_row - abs(last_row - y)) % (last_row + 1);
 134 }
 135
 136 inline int idx_row(const int y, const int last_row)
 137 {
 138     return idx_row_low(idx_row_high(y, last_row), last_row);
 139 }
 140
 141 inline int idx_col_low(const int x, const int last_col)
 142 {
 143     return abs(x) % (last_col + 1);
 144 }
 145
 146 inline int idx_col_high(const int x, const int last_col)
 147 {
 148     return abs(last_col - abs(last_col - x)) % (last_col + 1);
 149 }
 150
 151 inline int idx_col(const int x, const int last_col)
 152 {
 153     return idx_col_low(idx_col_high(x, last_col), last_col);
 154 }
 155
 156 __kernel void gaussianBlur(__global float * dst,
 157                            __global const float * src,
 158                            __global const float * c_gKer,
 159                            __local float * smem,
 160                            const int height,  const int width,
 161                            int dstStep, int srcStep,
 162                            const int ksizeHalf)
 163 {
 164     const int y = get_global_id(1);
 165     const int x = get_global_id(0);
 166
 167     dstStep /= sizeof(*dst);
 168     srcStep /= sizeof(*src);
 169
 170     __local float *row = smem + ty * (bdx + 2*ksizeHalf);
 171
 172     if (y < height)
 173     {
 174         // Vertical pass
 175         for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
 176         {
 177             int xExt = (int)(bx * bdx) + i - ksizeHalf;
 178             xExt = idx_col(xExt, width - 1);
 179             row[i] = src[mad24(y, srcStep, xExt)] * c_gKer[0];
 180             for (int j = 1; j <= ksizeHalf; ++j)
 181                 row[i] += (src[mad24(idx_row_low(y - j, height - 1), srcStep, xExt)]
 182                            + src[mad24(idx_row_high(y + j, height - 1), srcStep, xExt)]) * c_gKer[j];
 183         }
 184     }
 185
 186     barrier(CLK_LOCAL_MEM_FENCE);
 187
 188     if (y < height && y >= 0 && x < width && x >= 0)
 189     {
 190         // Horizontal pass
 191         row += tx + ksizeHalf;
 192         float res = row[0] * c_gKer[0];
 193         for (int i = 1; i <= ksizeHalf; ++i)
 194             res += (row[-i] + row[i]) * c_gKer[i];
 195
 196         dst[mad24(y, dstStep, x)] = res;
 197     }
 198 }
 199
 200 __constant float c_border[BORDER_SIZE + 1] = { 0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f };
 201
 202 __kernel void updateMatrices(__global float * M,
 203                              __global const float * flowx, __global const float * flowy,
 204                              __global const float * R0, __global const float * R1,
 205                              const int height, const int width,
 206                              int mStep, int xStep,  int yStep, int R0Step, int R1Step)
 207 {
 208     const int y = get_global_id(1);
 209     const int x = get_global_id(0);
 210
 211     mStep /= sizeof(*M);
 212     xStep /= sizeof(*flowx);
 213     yStep /= sizeof(*flowy);
 214     R0Step /= sizeof(*R0);
 215     R1Step /= sizeof(*R1);
 216
 217     if (y < height && y >= 0 && x < width && x >= 0)
 218     {
 219         float dx = flowx[mad24(y, xStep, x)];
 220         float dy = flowy[mad24(y, yStep, x)];
 221         float fx = x + dx;
 222         float fy = y + dy;
 223
 224         int x1 = convert_int(floor(fx));
 225         int y1 = convert_int(floor(fy));
 226         fx -= x1;
 227         fy -= y1;
 228
 229         float r2, r3, r4, r5, r6;
 230
 231         if (x1 >= 0 && y1 >= 0 && x1 < width - 1 && y1 < height - 1)
 232         {
 233             float a00 = (1.f - fx) * (1.f - fy);
 234             float a01 = fx * (1.f - fy);
 235             float a10 = (1.f - fx) * fy;
 236             float a11 = fx * fy;
 237
 238             r2 = a00 * R1[mad24(y1, R1Step, x1)] +
 239                  a01 * R1[mad24(y1, R1Step, x1 + 1)] +
 240                  a10 * R1[mad24(y1 + 1, R1Step, x1)] +
 241                  a11 * R1[mad24(y1 + 1, R1Step, x1 + 1)];
 242
 243             r3 = a00 * R1[mad24(height + y1, R1Step, x1)] +
 244                  a01 * R1[mad24(height + y1, R1Step, x1 + 1)] +
 245                  a10 * R1[mad24(height + y1 + 1, R1Step, x1)] +
 246                  a11 * R1[mad24(height + y1 + 1, R1Step, x1 + 1)];
 247
 248             r4 = a00 * R1[mad24(2*height + y1, R1Step, x1)] +
 249                  a01 * R1[mad24(2*height + y1, R1Step, x1 + 1)] +
 250                  a10 * R1[mad24(2*height + y1 + 1, R1Step, x1)] +
 251                  a11 * R1[mad24(2*height + y1 + 1, R1Step, x1 + 1)];
 252
 253             r5 = a00 * R1[mad24(3*height + y1, R1Step, x1)] +
 254                  a01 * R1[mad24(3*height + y1, R1Step, x1 + 1)] +
 255                  a10 * R1[mad24(3*height + y1 + 1, R1Step, x1)] +
 256                  a11 * R1[mad24(3*height + y1 + 1, R1Step, x1 + 1)];
 257
 258             r6 = a00 * R1[mad24(4*height + y1, R1Step, x1)] +
 259                  a01 * R1[mad24(4*height + y1, R1Step, x1 + 1)] +
 260                  a10 * R1[mad24(4*height + y1 + 1, R1Step, x1)] +
 261                  a11 * R1[mad24(4*height + y1 + 1, R1Step, x1 + 1)];
 262
 263             r4 = (R0[mad24(2*height + y, R0Step, x)] + r4) * 0.5f;
 264             r5 = (R0[mad24(3*height + y, R0Step, x)] + r5) * 0.5f;
 265             r6 = (R0[mad24(4*height + y, R0Step, x)] + r6) * 0.25f;
 266         }
 267         else
 268         {
 269             r2 = r3 = 0.f;
 270             r4 = R0[mad24(2*height + y, R0Step, x)];
 271             r5 = R0[mad24(3*height + y, R0Step, x)];
 272             r6 = R0[mad24(4*height + y, R0Step, x)] * 0.5f;
 273         }
 274
 275         r2 = (R0[mad24(y, R0Step, x)] - r2) * 0.5f;
 276         r3 = (R0[mad24(height + y, R0Step, x)] - r3) * 0.5f;
 277
 278         r2 += r4*dy + r6*dx;
 279         r3 += r6*dy + r5*dx;
 280
 281         float scale =
 282             c_border[min(x, BORDER_SIZE)] *
 283             c_border[min(y, BORDER_SIZE)] *
 284             c_border[min(width - x - 1, BORDER_SIZE)] *
 285             c_border[min(height - y - 1, BORDER_SIZE)];
 286
 287         r2 *= scale;
 288         r3 *= scale;
 289         r4 *= scale;
 290         r5 *= scale;
 291         r6 *= scale;
 292
 293         M[mad24(y, mStep, x)] = r4*r4 + r6*r6;
 294         M[mad24(height + y, mStep, x)] = (r4 + r5)*r6;
 295         M[mad24(2*height + y, mStep, x)] = r5*r5 + r6*r6;
 296         M[mad24(3*height + y, mStep, x)] = r4*r2 + r6*r3;
 297         M[mad24(4*height + y, mStep, x)] = r6*r2 + r5*r3;
 298     }
 299 }
 300
 301 __kernel void boxFilter5(__global float * dst,
 302                          __global const float * src,
 303                          __local float * smem,
 304                          const int height,  const int width,
 305                          int dstStep, int srcStep,
 306                          const int ksizeHalf)
 307 {
 308     const int y = get_global_id(1);
 309     const int x = get_global_id(0);
 310
 311     const float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
 312     const int smw = bdx + 2*ksizeHalf; // shared memory "width"
 313     __local float *row = smem + 5 * ty * smw;
 314
 315     dstStep /= sizeof(*dst);
 316     srcStep /= sizeof(*src);
 317
 318     if (y < height)
 319     {
 320         // Vertical pass
 321         for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
 322         {
 323             int xExt = (int)(bx * bdx) + i - ksizeHalf;
 324             xExt = min(max(xExt, 0), width - 1);
 325
 326 #pragma unroll
 327             for (int k = 0; k < 5; ++k)
 328                 row[k*smw + i] = src[mad24(k*height + y, srcStep, xExt)];
 329
 330             for (int j = 1; j <= ksizeHalf; ++j)
 331 #pragma unroll
 332                 for (int k = 0; k < 5; ++k)
 333                     row[k*smw + i] +=
 334                         src[mad24(k*height + max(y - j, 0), srcStep, xExt)] +
 335                         src[mad24(k*height + min(y + j, height - 1), srcStep, xExt)];
 336         }
 337     }
 338
 339     barrier(CLK_LOCAL_MEM_FENCE);
 340
 341     if (y < height && y >= 0 && x < width && x >= 0)
 342     {
 343         // Horizontal pass
 344
 345         row += tx + ksizeHalf;
 346         float res[5];
 347
 348 #pragma unroll
 349         for (int k = 0; k < 5; ++k)
 350             res[k] = row[k*smw];
 351
 352         for (int i = 1; i <= ksizeHalf; ++i)
 353 #pragma unroll
 354             for (int k = 0; k < 5; ++k)
 355                 res[k] += row[k*smw - i] + row[k*smw + i];
 356
 357 #pragma unroll
 358         for (int k = 0; k < 5; ++k)
 359             dst[mad24(k*height + y, dstStep, x)] = res[k] * boxAreaInv;
 360     }
 361 }
 362
 363 __kernel void updateFlow(__global float4 * flowx, __global float4 * flowy,
 364                          __global const float4 * M,
 365                          const int height, const int width,
 366                          int xStep, int yStep, int mStep)
 367 {
 368     const int y = get_global_id(1);
 369     const int x = get_global_id(0);
 370
 371     xStep /= sizeof(*flowx);
 372     yStep /= sizeof(*flowy);
 373     mStep /= sizeof(*M);
 374
 375     if (y < height && y >= 0 && x < width && x >= 0)
 376     {
 377         float4 g11 = M[mad24(y, mStep, x)];
 378         float4 g12 = M[mad24(height + y, mStep, x)];
 379         float4 g22 = M[mad24(2*height + y, mStep, x)];
 380         float4 h1 =  M[mad24(3*height + y, mStep, x)];
 381         float4 h2 =  M[mad24(4*height + y, mStep, x)];
 382
 383         float4 detInv = (float4)(1.f) / (g11*g22 - g12*g12 + (float4)(1e-3f));
 384
 385         flowx[mad24(y, xStep, x)] = (g11*h2 - g12*h1) * detInv;
 386         flowy[mad24(y, yStep, x)] = (g22*h1 - g12*h2) * detInv;
 387     }
 388 }
 389
 390 __kernel void gaussianBlur5(__global float * dst,
 391                             __global const float * src,
 392                             __global const float * c_gKer,
 393                             __local float * smem,
 394                             const int height,  const int width,
 395                             int dstStep, int srcStep,
 396                             const int ksizeHalf)
 397 {
 398     const int y = get_global_id(1);
 399     const int x = get_global_id(0);
 400
 401     const int smw = bdx + 2*ksizeHalf; // shared memory "width"
 402     __local volatile float *row = smem + 5 * ty * smw;
 403
 404     dstStep /= sizeof(*dst);
 405     srcStep /= sizeof(*src);
 406
 407     if (y < height)
 408     {
 409         // Vertical pass
 410         for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
 411         {
 412             int xExt = (int)(bx * bdx) + i - ksizeHalf;
 413             xExt = idx_col(xExt, width - 1);
 414
 415 #pragma unroll
 416             for (int k = 0; k < 5; ++k)
 417                 row[k*smw + i] = src[mad24(k*height + y, srcStep, xExt)] * c_gKer[0];
 418
 419             for (int j = 1; j <= ksizeHalf; ++j)
 420 #pragma unroll
 421                 for (int k = 0; k < 5; ++k)
 422                     row[k*smw + i] +=
 423                         (src[mad24(k*height + idx_row_low(y - j, height - 1), srcStep, xExt)] +
 424                          src[mad24(k*height + idx_row_high(y + j, height - 1), srcStep, xExt)]) * c_gKer[j];
 425         }
 426     }
 427
 428     barrier(CLK_LOCAL_MEM_FENCE);
 429
 430     if (y < height && y >= 0 && x < width && x >= 0)
 431     {
 432         // Horizontal pass
 433
 434         row += tx + ksizeHalf;
 435         float res[5];
 436
 437 #pragma unroll
 438         for (int k = 0; k < 5; ++k)
 439             res[k] = row[k*smw] * c_gKer[0];
 440
 441         for (int i = 1; i <= ksizeHalf; ++i)
 442 #pragma unroll
 443             for (int k = 0; k < 5; ++k)
 444                 res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];
 445
 446 #pragma unroll
 447         for (int k = 0; k < 5; ++k)
 448             dst[mad24(k*height + y, dstStep, x)] = res[k];
 449     }
 450 }