modules/ocl/src/opencl/objdetect_hog.cl

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Wenju He, wenju@multicorewareinc.com
  19 //
  20 // Redistribution and use in source and binary forms, with or without modification,
  21 // are permitted provided that the following conditions are met:
  22 //
  23 //   * Redistribution's of source code must retain the above copyright notice,
  24 //     this list of conditions and the following disclaimer.
  25 //
  26 //   * Redistribution's in binary form must reproduce the above copyright notice,
  27 //     this list of conditions and the following disclaimer in the documentation
  28 //     and/or other materials provided with the distribution.
  29 //
  30 //   * The name of the copyright holders may not be used to endorse or promote products
  31 //     derived from this software without specific prior written permission.
  32 //
  33 // This software is provided by the copyright holders and contributors as is and
  34 // any express or implied warranties, including, but not limited to, the implied
  35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  36 // In no event shall the Intel Corporation or contributors be liable for any direct,
  37 // indirect, incidental, special, exemplary, or consequential damages
  38 // (including, but not limited to, procurement of substitute goods or services;
  39 // loss of use, data, or profits; or business interruption) however caused
  40 // and on any theory of liability, whether in contract, strict liability,
  41 // or tort (including negligence or otherwise) arising in any way out of
  42 // the use of this software, even if advised of the possibility of such damage.
  43 //
  44 //M*/
  45
  46
  47 #define CELL_WIDTH 8
  48 #define CELL_HEIGHT 8
  49 #define CELLS_PER_BLOCK_X 2
  50 #define CELLS_PER_BLOCK_Y 2
  51 #define NTHREADS 256
  52 #define CV_PI_F 3.1415926535897932384626433832795f
  53
  54 //----------------------------------------------------------------------------
  55 // Histogram computation
  56 // 12 threads for a cell, 12x4 threads per block
  57 __kernel void compute_hists_kernel(
  58     const int cblock_stride_x, const int cblock_stride_y,
  59     const int cnbins, const int cblock_hist_size, const int img_block_width,
  60     const int blocks_in_group, const int blocks_total,
  61     const int grad_quadstep, const int qangle_step,
  62     __global const float* grad, __global const uchar* qangle,
  63     const float scale, __global float* block_hists, __local float* smem)
  64 {
  65     const int lx = get_local_id(0);
  66     const int lp = lx / 24; /* local group id */
  67     const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
  68     const int gidY = gid / img_block_width;
  69     const int gidX = gid - gidY * img_block_width;
  70
  71     const int lidX = lx - lp * 24;
  72     const int lidY = get_local_id(1);
  73
  74     const int cell_x = lidX / 12;
  75     const int cell_y = lidY;
  76     const int cell_thread_x = lidX - cell_x * 12;
  77
  78     __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *
  79         CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
  80     __local float* final_hist = hists + cnbins *
  81         (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
  82
  83     const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
  84     const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
  85
  86     __global const float* grad_ptr = (gid < blocks_total) ?
  87         grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
  88     __global const uchar* qangle_ptr = (gid < blocks_total) ?
  89         qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
  90
  91     __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
  92         cell_thread_x;
  93     for (int bin_id = 0; bin_id < cnbins; ++bin_id)
  94         hist[bin_id * 48] = 0.f;
  95
  96     const int dist_x = -4 + cell_thread_x - 4 * cell_x;
  97     const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
  98
  99     const int dist_y_begin = -4 - 4 * lidY;
 100     for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
 101     {
 102         float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
 103         uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
 104
 105         grad_ptr += grad_quadstep;
 106         qangle_ptr += qangle_step;
 107
 108         int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
 109
 110         float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x *
 111             dist_center_x) * scale);
 112         float interp_weight = (8.f - fabs(dist_y + 0.5f)) *
 113             (8.f - fabs(dist_x + 0.5f)) / 64.f;
 114
 115         hist[bin.x * 48] += gaussian * interp_weight * vote.x;
 116         hist[bin.y * 48] += gaussian * interp_weight * vote.y;
 117     }
 118     barrier(CLK_LOCAL_MEM_FENCE);
 119
 120     volatile __local float* hist_ = hist;
 121     for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
 122     {
 123         if (cell_thread_x < 6)
 124             hist_[0] += hist_[6];
 125         barrier(CLK_LOCAL_MEM_FENCE);
 126         if (cell_thread_x < 3)
 127             hist_[0] += hist_[3];
 128 #ifdef WAVE_SIZE_1
 129         barrier(CLK_LOCAL_MEM_FENCE);
 130 #endif
 131         if (cell_thread_x == 0)
 132             final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
 133                 hist_[0] + hist_[1] + hist_[2];
 134     }
 135 #ifdef WAVE_SIZE_1
 136     barrier(CLK_LOCAL_MEM_FENCE);
 137 #endif
 138
 139     int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
 140     if ((tid < cblock_hist_size) && (gid < blocks_total))
 141     {
 142         __global float* block_hist = block_hists +
 143             (gidY * img_block_width + gidX) * cblock_hist_size;
 144         block_hist[tid] = final_hist[tid];
 145     }
 146 }
 147
 148 //-------------------------------------------------------------
 149 //  Normalization of histograms via L2Hys_norm
 150 //
 151 float reduce_smem(volatile __local float* smem, int size)
 152 {
 153     unsigned int tid = get_local_id(0);
 154     float sum = smem[tid];
 155
 156     if (size >= 512)
 157     {
 158         if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
 159         barrier(CLK_LOCAL_MEM_FENCE);
 160     }
 161     if (size >= 256)
 162     {
 163         if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
 164         barrier(CLK_LOCAL_MEM_FENCE);
 165     }
 166     if (size >= 128)
 167     {
 168         if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
 169         barrier(CLK_LOCAL_MEM_FENCE);
 170     }
 171
 172     if (tid < 32)
 173     {
 174         if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
 175 #if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
 176     }
 177     barrier(CLK_LOCAL_MEM_FENCE);
 178     if (tid < 16)
 179     {
 180 #endif
 181         if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
 182 #ifdef WAVE_SIZE_1
 183     }
 184     barrier(CLK_LOCAL_MEM_FENCE);
 185     if (tid < 8)
 186     {
 187 #endif
 188         if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
 189 #ifdef WAVE_SIZE_1
 190     }
 191     barrier(CLK_LOCAL_MEM_FENCE);
 192     if (tid < 4)
 193     {
 194 #endif
 195         if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
 196 #ifdef WAVE_SIZE_1
 197     }
 198     barrier(CLK_LOCAL_MEM_FENCE);
 199     if (tid < 2)
 200     {
 201 #endif
 202         if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
 203 #ifdef WAVE_SIZE_1
 204     }
 205     barrier(CLK_LOCAL_MEM_FENCE);
 206     if (tid < 1)
 207     {
 208 #endif
 209         if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
 210     }
 211
 212     barrier(CLK_LOCAL_MEM_FENCE);
 213     sum = smem[0];
 214
 215     return sum;
 216 }
 217
 218 __kernel void normalize_hists_kernel(const int nthreads, const int block_hist_size, const int img_block_width,
 219                                      __global float* block_hists, const float threshold, __local float *squares)
 220 {
 221     const int tid = get_local_id(0);
 222     const int gidX = get_group_id(0);
 223     const int gidY = get_group_id(1);
 224
 225     __global float* hist = block_hists + (gidY * img_block_width + gidX) * block_hist_size + tid;
 226
 227     float elem = 0.f;
 228     if (tid < block_hist_size)
 229         elem = hist[0];
 230
 231     squares[tid] = elem * elem;
 232
 233     barrier(CLK_LOCAL_MEM_FENCE);
 234     float sum = reduce_smem(squares, nthreads);
 235
 236     float scale = 1.0f / (sqrt(sum) + 0.1f * block_hist_size);
 237     elem = min(elem * scale, threshold);
 238
 239     barrier(CLK_LOCAL_MEM_FENCE);
 240     squares[tid] = elem * elem;
 241
 242     barrier(CLK_LOCAL_MEM_FENCE);
 243     sum = reduce_smem(squares, nthreads);
 244     scale = 1.0f / (sqrt(sum) + 1e-3f);
 245
 246     if (tid < block_hist_size)
 247         hist[0] = elem * scale;
 248 }
 249
 250 //---------------------------------------------------------------------
 251 //  Linear SVM based classification
 252 //
 253 __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr_size, const int cdescr_width,
 254                                     const int img_win_width, const int img_block_width,
 255                                     const int win_block_stride_x, const int win_block_stride_y,
 256                                     __global const float * block_hists, __global const float* coefs,
 257                                     float free_coef, float threshold, __global uchar* labels)
 258 {
 259     const int tid = get_local_id(0);
 260     const int gidX = get_group_id(0);
 261     const int gidY = get_group_id(1);
 262
 263     __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 264
 265     float product = 0.f;
 266     for (int i = tid; i < cdescr_size; i += NTHREADS)
 267     {
 268         int offset_y = i / cdescr_width;
 269         int offset_x = i - offset_y * cdescr_width;
 270         product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
 271     }
 272
 273     __local float products[NTHREADS];
 274
 275     products[tid] = product;
 276
 277     barrier(CLK_LOCAL_MEM_FENCE);
 278
 279     if (tid < 128) products[tid] = product = product + products[tid + 128];
 280     barrier(CLK_LOCAL_MEM_FENCE);
 281
 282     if (tid < 64) products[tid] = product = product + products[tid + 64];
 283     barrier(CLK_LOCAL_MEM_FENCE);
 284
 285     volatile __local float* smem = products;
 286     if (tid < 32)
 287     {
 288         smem[tid] = product = product + smem[tid + 32];
 289 #if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
 290     }
 291     barrier(CLK_LOCAL_MEM_FENCE);
 292     if (tid < 16)
 293     {
 294 #endif
 295         smem[tid] = product = product + smem[tid + 16];
 296 #ifdef WAVE_SIZE_1
 297     }
 298     barrier(CLK_LOCAL_MEM_FENCE);
 299     if (tid < 8)
 300     {
 301 #endif
 302         smem[tid] = product = product + smem[tid + 8];
 303 #ifdef WAVE_SIZE_1
 304     }
 305     barrier(CLK_LOCAL_MEM_FENCE);
 306     if (tid < 4)
 307     {
 308 #endif
 309         smem[tid] = product = product + smem[tid + 4];
 310 #ifdef WAVE_SIZE_1
 311     }
 312     barrier(CLK_LOCAL_MEM_FENCE);
 313     if (tid < 2)
 314     {
 315 #endif
 316         smem[tid] = product = product + smem[tid + 2];
 317 #ifdef WAVE_SIZE_1
 318     }
 319     barrier(CLK_LOCAL_MEM_FENCE);
 320     if (tid < 1)
 321     {
 322 #endif
 323         smem[tid] = product = product + smem[tid + 1];
 324     }
 325
 326     if (tid == 0)
 327         labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
 328 }
 329
 330 //----------------------------------------------------------------------------
 331 // Extract descriptors
 332
 333 __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width,
 334         const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
 335         __global const float* block_hists, __global float* descriptors)
 336 {
 337     int tid = get_local_id(0);
 338     int gidX = get_group_id(0);
 339     int gidY = get_group_id(1);
 340
 341     // Get left top corner of the window in src
 342     __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 343
 344     // Get left top corner of the window in dst
 345     __global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
 346
 347     // Copy elements from src to dst
 348     for (int i = tid; i < cdescr_size; i += NTHREADS)
 349     {
 350         int offset_y = i / cdescr_width;
 351         int offset_x = i - offset_y * cdescr_width;
 352         descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];
 353     }
 354 }
 355
 356 __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
 357         const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x,
 358         const int win_block_stride_y, __global const float* block_hists, __global float* descriptors)
 359 {
 360     int tid = get_local_id(0);
 361     int gidX = get_group_id(0);
 362     int gidY = get_group_id(1);
 363
 364     // Get left top corner of the window in src
 365     __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 366
 367     // Get left top corner of the window in dst
 368     __global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
 369
 370     // Copy elements from src to dst
 371     for (int i = tid; i < cdescr_size; i += NTHREADS)
 372     {
 373         int block_idx = i / cblock_hist_size;
 374         int idx_in_block = i - block_idx * cblock_hist_size;
 375
 376         int y = block_idx / cnblocks_win_x;
 377         int x = block_idx - y * cnblocks_win_x;
 378
 379         descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = hist[(y * img_block_width  + x) * cblock_hist_size + idx_in_block];
 380     }
 381 }
 382
 383 //----------------------------------------------------------------------------
 384 // Gradients computation
 385
 386 __kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
 387         const __global uchar4 * img, __global float * grad, __global uchar * qangle,
 388         const float angle_scale, const char correct_gamma, const int cnbins)
 389 {
 390     const int x = get_global_id(0);
 391     const int tid = get_local_id(0);
 392     const int gSizeX = get_local_size(0);
 393     const int gidX = get_group_id(0);
 394     const int gidY = get_group_id(1);
 395
 396     __global const uchar4* row = img + gidY * img_step;
 397
 398     __local float sh_row[(NTHREADS + 2) * 3];
 399
 400     uchar4 val;
 401     if (x < width)
 402         val = row[x];
 403     else
 404         val = row[width - 2];
 405
 406     sh_row[tid + 1] = val.x;
 407     sh_row[tid + 1 + (NTHREADS + 2)] = val.y;
 408     sh_row[tid + 1 + 2 * (NTHREADS + 2)] = val.z;
 409
 410     if (tid == 0)
 411     {
 412         val = row[max(x - 1, 1)];
 413         sh_row[0] = val.x;
 414         sh_row[(NTHREADS + 2)] = val.y;
 415         sh_row[2 * (NTHREADS + 2)] = val.z;
 416     }
 417
 418     if (tid == gSizeX - 1)
 419     {
 420         val = row[min(x + 1, width - 2)];
 421         sh_row[gSizeX + 1] = val.x;
 422         sh_row[gSizeX + 1 + (NTHREADS + 2)] = val.y;
 423         sh_row[gSizeX + 1 + 2 * (NTHREADS + 2)] = val.z;
 424     }
 425
 426     barrier(CLK_LOCAL_MEM_FENCE);
 427     if (x < width)
 428     {
 429         float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], sh_row[tid + 2 * (NTHREADS + 2)]);
 430         float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
 431
 432         float3 dx;
 433         if (correct_gamma == 1)
 434             dx = sqrt(b) - sqrt(a);
 435         else
 436             dx = b - a;
 437
 438         float3 dy = (float3) 0.f;
 439
 440         if (gidY > 0 && gidY < height - 1)
 441         {
 442             a = convert_float3(img[(gidY - 1) * img_step + x].xyz);
 443             b = convert_float3(img[(gidY + 1) * img_step + x].xyz);
 444
 445             if (correct_gamma == 1)
 446                 dy = sqrt(b) - sqrt(a);
 447             else
 448                 dy = b - a;
 449         }
 450
 451         float best_dx = dx.x;
 452         float best_dy = dy.x;
 453
 454         float mag0 = dx.x * dx.x + dy.x * dy.x;
 455         float mag1 = dx.y * dx.y + dy.y * dy.y;
 456         if (mag0 < mag1)
 457         {
 458             best_dx = dx.y;
 459             best_dy = dy.y;
 460             mag0 = mag1;
 461         }
 462
 463         mag1 = dx.z * dx.z + dy.z * dy.z;
 464         if (mag0 < mag1)
 465         {
 466             best_dx = dx.z;
 467             best_dy = dy.z;
 468             mag0 = mag1;
 469         }
 470
 471         mag0 = sqrt(mag0);
 472
 473         float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
 474         int hidx = (int)floor(ang);
 475         ang -= hidx;
 476         hidx = (hidx + cnbins) % cnbins;
 477
 478         qangle[(gidY * qangle_step + x) << 1] = hidx;
 479         qangle[((gidY * qangle_step + x) << 1) + 1] = (hidx + 1) % cnbins;
 480         grad[(gidY * grad_quadstep + x) << 1] = mag0 * (1.f - ang);
 481         grad[((gidY * grad_quadstep + x) << 1) + 1] = mag0 * ang;
 482     }
 483 }
 484
 485 __kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
 486         __global const uchar * img, __global float * grad, __global uchar * qangle,
 487         const float angle_scale, const char correct_gamma, const int cnbins)
 488 {
 489     const int x = get_global_id(0);
 490     const int tid = get_local_id(0);
 491     const int gSizeX = get_local_size(0);
 492     const int gidX = get_group_id(0);
 493     const int gidY = get_group_id(1);
 494
 495     __global const uchar* row = img + gidY * img_step;
 496
 497     __local float sh_row[NTHREADS + 2];
 498
 499     if (x < width)
 500         sh_row[tid + 1] = row[x];
 501     else
 502         sh_row[tid + 1] = row[width - 2];
 503
 504     if (tid == 0)
 505         sh_row[0] = row[max(x - 1, 1)];
 506
 507     if (tid == gSizeX - 1)
 508         sh_row[gSizeX + 1] = row[min(x + 1, width - 2)];
 509
 510     barrier(CLK_LOCAL_MEM_FENCE);
 511     if (x < width)
 512     {
 513         float dx;
 514
 515         if (correct_gamma == 1)
 516             dx = sqrt(sh_row[tid + 2]) - sqrt(sh_row[tid]);
 517         else
 518             dx = sh_row[tid + 2] - sh_row[tid];
 519
 520         float dy = 0.f;
 521         if (gidY > 0 && gidY < height - 1)
 522         {
 523             float a = (float) img[ (gidY + 1) * img_step + x ];
 524             float b = (float) img[ (gidY - 1) * img_step + x ];
 525             if (correct_gamma == 1)
 526                 dy = sqrt(a) - sqrt(b);
 527             else
 528                 dy = a - b;
 529         }
 530         float mag = sqrt(dx * dx + dy * dy);
 531
 532         float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
 533         int hidx = (int)floor(ang);
 534         ang -= hidx;
 535         hidx = (hidx + cnbins) % cnbins;
 536
 537         qangle[ (gidY * qangle_step + x) << 1 ]     = hidx;
 538         qangle[ ((gidY * qangle_step + x) << 1) + 1 ] = (hidx + 1) % cnbins;
 539         grad[ (gidY * grad_quadstep + x) << 1 ]       = mag * (1.f - ang);
 540         grad[ ((gidY * grad_quadstep + x) << 1) + 1 ]   = mag * ang;
 541     }
 542 }
 543
 544 //----------------------------------------------------------------------------
 545 // Resize
 546
 547 __kernel void resize_8UC4_kernel(__global uchar4 * dst, __global const uchar4 * src,
 548                                  int dst_offset, int src_offset, int dst_step, int src_step,
 549                                  int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
 550 {
 551     int dx = get_global_id(0);
 552     int dy = get_global_id(1);
 553
 554     int sx = (int)floor(dx*ifx+0.5f);
 555     int sy = (int)floor(dy*ify+0.5f);
 556     sx = min(sx, src_cols-1);
 557     sy = min(sy, src_rows-1);
 558     int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx;
 559     int spos = (src_offset>>2) + sy * (src_step>>2) + sx;
 560
 561     if(dx<dst_cols && dy<dst_rows)
 562         dst[dpos] = src[spos];
 563 }
 564
 565 __kernel void resize_8UC1_kernel(__global uchar * dst, __global const uchar * src,
 566                                  int dst_offset, int src_offset, int dst_step, int src_step,
 567                                  int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
 568 {
 569     int dx = get_global_id(0);
 570     int dy = get_global_id(1);
 571
 572     int sx = (int)floor(dx*ifx+0.5f);
 573     int sy = (int)floor(dy*ify+0.5f);
 574     sx = min(sx, src_cols-1);
 575     sy = min(sy, src_rows-1);
 576     int dpos = dst_offset + dy * dst_step + dx;
 577     int spos = src_offset + sy * src_step + sx;
 578
 579     if(dx<dst_cols && dy<dst_rows)
 580         dst[dpos] = src[spos];
 581 }