// modules/gpu/src/cuda/match_template.cu
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/vec_math.hpp"

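// CUDA kernels behind the GPU matchTemplate implementation. The sections below cover the
// naive CCORR/SQDIFF kernels that score every result pixel directly, the "prepared" kernels
// that refine a precomputed cross-correlation map using integral images (window sums and
// sums of squares), and small helpers for normalization and channel extraction.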
namespace cv { namespace gpu { namespace device
{
    namespace match_template
    {
        __device__ __forceinline__ float sum(float v) { return v; }
        __device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
        __device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }
        __device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }

        __device__ __forceinline__ float first(float v) { return v; }
        __device__ __forceinline__ float first(float2 v) { return v.x; }
        __device__ __forceinline__ float first(float3 v) { return v.x; }
        __device__ __forceinline__ float first(float4 v) { return v.x; }

        __device__ __forceinline__ float mul(float a, float b) { return a * b; }
        __device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
        __device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
        __device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }

        __device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }
        __device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
        __device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
        __device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }

        __device__ __forceinline__ float sub(float a, float b) { return a - b; }
        __device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
        __device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
        __device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }

        __device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }
        __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
        __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
        __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }

        //////////////////////////////////////////////////////////////////////
        // Naive_CCORR

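        // One thread per result pixel: slide the w x h template over the image window
        // anchored at (x, y), accumulate the per-channel products, and collapse the
        // channels with sum() to obtain the plain cross-correlation score.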
        template <typename T, int cn>
        __global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
        {
            typedef typename TypeVec<T, cn>::vec_type Type;
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef res = VecTraits<Typef>::all(0);

                for (int i = 0; i < h; ++i)
                {
                    const Type* image_ptr = (const Type*)image.ptr(y + i);
                    const Type* templ_ptr = (const Type*)templ.ptr(i);
                    for (int j = 0; j < w; ++j)
                        res = res + mul(image_ptr[x + j], templ_ptr[j]);
                }

                result.ptr(y)[x] = sum(res);
            }
        }

        template <typename T, int cn>
        void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
            };

            callers[cn](image, templ, result, stream);
        }


        void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Naive_SQDIFF

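        // Same one-thread-per-result-pixel layout as the CCORR kernel, but accumulating
        // the squared per-channel differences between the image window and the template.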
        template <typename T, int cn>
        __global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
        {
            typedef typename TypeVec<T, cn>::vec_type Type;
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef res = VecTraits<Typef>::all(0);
                Typef delta;

                for (int i = 0; i < h; ++i)
                {
                    const Type* image_ptr = (const Type*)image.ptr(y + i);
                    const Type* templ_ptr = (const Type*)templ.ptr(i);
                    for (int j = 0; j < w; ++j)
                    {
                        delta = sub(image_ptr[x + j], templ_ptr[j]);
                        res = res + delta * delta;
                    }
                }

                result.ptr(y)[x] = sum(res);
            }
        }

        template <typename T, int cn>
        void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_SQDIFF

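        // "Prepared" SQDIFF: result already holds the cross-correlation term, so the
        // squared difference is assembled as sum(I^2) - 2*ccorr + sum(T^2). The window
        // sum of squared image values is read from the integral image image_sqsum in
        // O(1) via four corner lookups.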
        template <int cn>
        __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum;
            }
        }

        template <int cn>
        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn,
                                             cudaStream_t stream)
        {
            typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
            };

            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_SQDIFF_NORMED

        // normAcc* are numerically careful normalization routines that keep the GPU
        // matchTemplate results consistent with the CPU implementation.
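        // If rounding pushes |num| slightly above the denominator (by up to 12.5%), the
        // ratio is clamped to +/-1; beyond that the denominator is treated as degenerate
        // and a neutral value is returned (0 for correlation scores, 1 for SQDIFF).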

        __device__ float normAcc(float num, float denum)
        {
            if (::fabs(num) < denum)
                return num / denum;
            if (::fabs(num) < denum * 1.125f)
                return num > 0 ? 1 : -1;
            return 0;
        }


        __device__ float normAcc_SQDIFF(float num, float denum)
        {
            if (::fabs(num) < denum)
                return num / denum;
            if (::fabs(num) < denum * 1.125f)
                return num > 0 ? 1 : -1;
            return 1;
        }

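        // Normalized variant of the prepared SQDIFF: the same numerator is divided by
        // sqrt(sum(I^2) * sum(T^2)), with normAcc_SQDIFF guarding against rounding and
        // near-zero denominators.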
        template <int cn>
        __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
                int w, int h, const PtrStep<unsigned long long> image_sqsum,
                unsigned long long templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,
                                                  sqrtf(image_sqsum_ * templ_sqsum));
            }
        }

        template <int cn>
        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
                                                    PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }


        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
                                                    PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
            static const caller_t callers[] =
            {
                0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
            };

            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_CCOFF

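        // CCOEFF for single-channel 8U data: subtract mean(T) * sum(I over the window)
        // from the precomputed cross-correlation. templ_sum_scale is sum(T) / (w*h) and
        // the window sum of image values comes from the integral image image_sum.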
        __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_ = (float)(
                        (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
                        (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;
            }
        }

        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
                int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
                const PtrStep<unsigned int> image_sum_r,
                const PtrStep<unsigned int> image_sum_g,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g;
            }
        }

        void matchTemplatePrepared_CCOFF_8UC2(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r,
                const PtrStepSz<unsigned int> image_sum_g,
                unsigned int templ_sum_r, unsigned int templ_sum_g,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
                    w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
                    image_sum_r, image_sum_g, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
                int w, int h,
                float templ_sum_scale_r,
                float templ_sum_scale_g,
                float templ_sum_scale_b,
                const PtrStep<unsigned int> image_sum_r,
                const PtrStep<unsigned int> image_sum_g,
                const PtrStep<unsigned int> image_sum_b,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g
                                         - image_sum_b_ * templ_sum_scale_b;
            }
        }

        void matchTemplatePrepared_CCOFF_8UC3(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r,
                const PtrStepSz<unsigned int> image_sum_g,
                const PtrStepSz<unsigned int> image_sum_b,
                unsigned int templ_sum_r,
                unsigned int templ_sum_g,
                unsigned int templ_sum_b,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
                    w, h,
                    (float)templ_sum_r / (w * h),
                    (float)templ_sum_g / (w * h),
                    (float)templ_sum_b / (w * h),
                    image_sum_r, image_sum_g, image_sum_b, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
                int w, int h,
                float templ_sum_scale_r,
                float templ_sum_scale_g,
                float templ_sum_scale_b,
                float templ_sum_scale_a,
                const PtrStep<unsigned int> image_sum_r,
                const PtrStep<unsigned int> image_sum_g,
                const PtrStep<unsigned int> image_sum_b,
                const PtrStep<unsigned int> image_sum_a,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float image_sum_a_ = (float)(
                        (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
                        (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g
                                         - image_sum_b_ * templ_sum_scale_b
                                         - image_sum_a_ * templ_sum_scale_a;
            }
        }

        void matchTemplatePrepared_CCOFF_8UC4(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r,
                const PtrStepSz<unsigned int> image_sum_g,
                const PtrStepSz<unsigned int> image_sum_b,
                const PtrStepSz<unsigned int> image_sum_a,
                unsigned int templ_sum_r,
                unsigned int templ_sum_g,
                unsigned int templ_sum_b,
                unsigned int templ_sum_a,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
                    w, h,
                    (float)templ_sum_r / (w * h),
                    (float)templ_sum_g / (w * h),
                    (float)templ_sum_b / (w * h),
                    (float)templ_sum_a / (w * h),
                    image_sum_r, image_sum_g, image_sum_b, image_sum_a,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_CCOFF_NORMED

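        // Normalized CCOEFF: the numerator is ccorr - sum(I) * mean(T) as above; the
        // denominator is sqrt(templ_sqsum_scale * (sum(I^2) - sum(I)^2 / (w*h))), where
        // templ_sqsum_scale = sum(T^2) - sum(T)^2 / (w*h) is the template's variance
        // term computed once on the host. The multi-channel variants below repeat the
        // same corner lookups per channel and pool the variance terms.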
        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
                int w, int h, float weight,
                float templ_sum_scale, float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum,
                const PtrStep<unsigned long long> image_sqsum,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float ccorr = result.ptr(y)[x];
                float image_sum_ = (float)(
                        (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
                        (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
                        (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
                result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,
                                           sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8U(
                    int w, int h, const PtrStepSz<unsigned int> image_sum,
                    const PtrStepSz<unsigned long long> image_sqsum,
                    unsigned int templ_sum, unsigned long long templ_sqsum,
                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale = templ_sum * weight;
            float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;

            matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
                    w, h, weight, templ_sum_scale, templ_sqsum_scale,
                    image_sum, image_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g,
                float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sqsum_r_ = (float)(
                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sqsum_g_ = (float)(
                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
                                             - image_sum_g_ * templ_sum_scale_g;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
                    int w, int h,
                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                       + templ_sqsum_g - weight * templ_sum_g * templ_sum_g;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
                float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sqsum_r_ = (float)(
                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sqsum_g_ = (float)(
                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float image_sqsum_b_ = (float)(
                        (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
                        (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
                                             - image_sum_g_ * templ_sum_scale_g
                                             - image_sum_b_ * templ_sum_scale_b;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
                                                         + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
                    int w, int h,
                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                    const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                    unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sum_scale_b = templ_sum_b * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
                                      + templ_sqsum_b - weight * templ_sum_b * templ_sum_b;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    image_sum_b, image_sqsum_b,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
                float templ_sum_scale_a, float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
                const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sqsum_r_ = (float)(
                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sqsum_g_ = (float)(
                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float image_sqsum_b_ = (float)(
                        (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
                        (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
                float image_sum_a_ = (float)(
                        (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
                        (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
                float image_sqsum_a_ = (float)(
                        (image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) -
                        (image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x]));

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g
                                             - image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
                                                         + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_
                                                         + image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
                    int w, int h,
                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                    const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
                    const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                    unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
                    unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sum_scale_b = templ_sum_b * weight;
            float templ_sum_scale_a = templ_sum_a * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
                                      + templ_sqsum_b - weight * templ_sum_b * templ_sum_b
                                      + templ_sqsum_a - weight * templ_sum_a * templ_sum_a;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    image_sum_b, image_sqsum_b,
                    image_sum_a, image_sqsum_a,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // normalize

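        // Divides a raw cross-correlation map by sqrt(sum(I^2 over the window) * sum(T^2))
        // to produce CCORR_NORMED, again reading the window sum from the squared integral
        // image; normAcc clamps any rounding overshoot.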
        template <int cn>
        __global__ void normalizeKernel_8U(
                int w, int h, const PtrStep<unsigned long long> image_sqsum,
                unsigned long long templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
                result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));
            }
        }

        void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
                          unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            switch (cn)
            {
            case 1:
                normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 2:
                normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 3:
                normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 4:
                normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            }

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // extractFirstChannel

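        // Copies the first channel of a multi-channel 32F image into the single-channel
        // result buffer, with the host wrapper dispatching on the channel count.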
        template <int cn>
        __global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result)
        {
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef val = ((const Typef*)image.ptr(y))[x];
                result.ptr(y)[x] = first(val);
            }
        }

        void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            switch (cn)
            {
            case 1:
                extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 2:
                extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 3:
                extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 4:
                extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
                break;
            }
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    } //namespace match_template
}}} // namespace cv { namespace gpu { namespace device


#endif /* CUDA_DISABLER */