1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
43 #if !defined CUDA_DISABLER
45 #include "internal_shared.hpp"
46 #include "opencv2/gpu/device/vec_math.hpp"
48 namespace cv { namespace gpu { namespace device
50 namespace match_template
// Collapse a per-channel accumulator into one scalar by summing its
// components (identity for plain float). Used to write the final result
// pixel after channel-wise accumulation.
52 __device__ __forceinline__ float sum(float v) { return v; }
53 __device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
54 __device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }
55 __device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }
// Extract the first (x) component of a vector value; identity for plain
// float. Used by extractFirstChannel_32F below.
57 __device__ __forceinline__ float first(float v) { return v; }
58 __device__ __forceinline__ float first(float2 v) { return v.x; }
59 __device__ __forceinline__ float first(float3 v) { return v.x; }
60 __device__ __forceinline__ float first(float4 v) { return v.x; }
// Component-wise product. The uchar overloads compute the product in int
// arithmetic and return float vectors, so 8-bit products cannot overflow.
62 __device__ __forceinline__ float mul(float a, float b) { return a * b; }
63 __device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
64 __device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
65 __device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
67 __device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }
68 __device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
69 __device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
70 __device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
// Component-wise difference. The uchar overloads subtract in int
// arithmetic and widen to float, so negative differences are preserved
// (needed for the SQDIFF kernels).
72 __device__ __forceinline__ float sub(float a, float b) { return a - b; }
73 __device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
74 __device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
75 __device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
77 __device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }
78 __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
79 __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
80 __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
82 //////////////////////////////////////////////////////////////////////
// Naive cross-correlation: each thread computes one result pixel by
// sliding the w x h template over the image at (x, y) and accumulating
// the per-channel product, then collapsing channels with sum().
// Launched with one thread per result pixel (bounds-checked below).
template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
{
    typedef typename TypeVec<T, cn>::vec_type Type;
    typedef typename TypeVec<float, cn>::vec_type Typef;

    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;

    if (x < result.cols && y < result.rows)
    {
        Typef res = VecTraits<Typef>::all(0);

        for (int row = 0; row < h; ++row)
        {
            const Type* image_ptr = (const Type*)image.ptr(y + row);
            const Type* templ_ptr = (const Type*)templ.ptr(row);
            for (int col = 0; col < w; ++col)
                res = res + mul(image_ptr[x + col], templ_ptr[col]);
        }

        result.ptr(y)[x] = sum(res);
    }
}
// Host launcher for the naive CCORR kernel: 32x8 blocks, one thread per
// result pixel. Fix: only synchronize on the default stream — the
// original synchronized unconditionally, which serializes callers that
// pass a non-default stream for async overlap.
template <typename T, int cn>
void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
{
    const dim3 threads(32, 8);
    const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

    matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// Dispatch the 32F naive CCORR launcher by channel count.
// cn must be in [1, 4]; slot 0 is a deliberate null entry.
void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
    typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

    static const caller_t callers[] =
    {
        0,
        matchTemplateNaive_CCORR<float, 1>,
        matchTemplateNaive_CCORR<float, 2>,
        matchTemplateNaive_CCORR<float, 3>,
        matchTemplateNaive_CCORR<float, 4>
    };

    callers[cn](image, templ, result, stream);
}
// Dispatch the 8U naive CCORR launcher by channel count.
// cn must be in [1, 4]; slot 0 is a deliberate null entry.
void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
    typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

    static const caller_t callers[] =
    {
        0,
        matchTemplateNaive_CCORR<uchar, 1>,
        matchTemplateNaive_CCORR<uchar, 2>,
        matchTemplateNaive_CCORR<uchar, 3>,
        matchTemplateNaive_CCORR<uchar, 4>
    };

    callers[cn](image, templ, result, stream);
}
148 //////////////////////////////////////////////////////////////////////
// Naive squared-difference: each thread computes one result pixel as the
// sum over the w x h window of (image - templ)^2, accumulated per channel
// and collapsed with sum(). Fix: `delta` was used without a declaration —
// declare it as the per-channel float vector type.
template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
{
    typedef typename TypeVec<T, cn>::vec_type Type;
    typedef typename TypeVec<float, cn>::vec_type Typef;

    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;

    if (x < result.cols && y < result.rows)
    {
        Typef res = VecTraits<Typef>::all(0);
        Typef delta;  // per-channel difference, reused across the window

        for (int i = 0; i < h; ++i)
        {
            const Type* image_ptr = (const Type*)image.ptr(y + i);
            const Type* templ_ptr = (const Type*)templ.ptr(i);
            for (int j = 0; j < w; ++j)
            {
                delta = sub(image_ptr[x + j], templ_ptr[j]);
                res = res + delta * delta;
            }
        }

        result.ptr(y)[x] = sum(res);
    }
}
// Host launcher for the naive SQDIFF kernel: 32x8 blocks, one thread per
// result pixel. Fix: only synchronize on the default stream — the
// original synchronized unconditionally, which serializes callers that
// pass a non-default stream for async overlap.
template <typename T, int cn>
void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
{
    const dim3 threads(32, 8);
    const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

    matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// Dispatch the 32F naive SQDIFF launcher by channel count.
// cn must be in [1, 4]; slot 0 is a deliberate null entry.
void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
    typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

    static const caller_t callers[] =
    {
        0,
        matchTemplateNaive_SQDIFF<float, 1>,
        matchTemplateNaive_SQDIFF<float, 2>,
        matchTemplateNaive_SQDIFF<float, 3>,
        matchTemplateNaive_SQDIFF<float, 4>
    };

    callers[cn](image, templ, result, stream);
}
// Dispatch the 8U naive SQDIFF launcher by channel count.
// cn must be in [1, 4]; slot 0 is a deliberate null entry.
void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
    typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

    static const caller_t callers[] =
    {
        0,
        matchTemplateNaive_SQDIFF<uchar, 1>,
        matchTemplateNaive_SQDIFF<uchar, 2>,
        matchTemplateNaive_SQDIFF<uchar, 3>,
        matchTemplateNaive_SQDIFF<uchar, 4>
    };

    callers[cn](image, templ, result, stream);
}
217 //////////////////////////////////////////////////////////////////////
// Rewrites a precomputed CCORR response into SQDIFF using the identity
// sqdiff = sum(I^2) - 2*ccorr + sum(T^2). The windowed sum of squared
// image pixels comes from the four-corner difference on image_sqsum
// (assumed to be an integral image of squares, channel-interleaved with
// stride cn — confirm against the caller).
// Fix: the kernel uses `cn` and is instantiated as <cn> by its caller,
// but the `template <int cn>` header was missing.
template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < result.cols && y < result.rows)
    {
        float image_sqsum_ = (float)(
                (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
        float ccorr = result.ptr(y)[x];
        result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum;
    }
}
// Templated launcher for the prepared SQDIFF kernel (one instantiation
// per channel count). Fixes: restore the missing `template <int cn>`
// header (the body launches the kernel as <cn>), and synchronize only on
// the default stream.
template <int cn>
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream)
{
    const dim3 threads(32, 8);
    const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

    matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// Runtime dispatcher: selects the templated launcher matching cn.
// Fix: the signature was truncated — `stream` is forwarded to the
// launcher but the `cudaStream_t stream` parameter was missing.
// cn must be in [1, 4]; slot 0 is a deliberate null entry.
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn,
                                     cudaStream_t stream)
{
    typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);

    static const caller_t callers[] =
    {
        0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
    };

    callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
}
262 //////////////////////////////////////////////////////////////////////
263 // Prepared_SQDIFF_NORMED
265 // normAcc* are accurate normalization routines which make GPU matchTemplate
266 // consistent with CPU one
// Accurate normalization num / denum, clamped so float rounding noise
// cannot push the ratio outside [-1, 1]: ratios only slightly past the
// bound (within 12.5%) saturate to +/-1, anything further is treated as a
// degenerate (near-zero-variance) window and mapped to 0.
// Fix: the in-range `return num / denum;` and the trailing `return 0;`
// were missing — the first `if` had no body and the function fell off the
// end.
__device__ float normAcc(float num, float denum)
{
    if (::fabs(num) < denum)
        return num / denum;
    if (::fabs(num) < denum * 1.125f)
        return num > 0 ? 1 : -1;
    return 0;
}
// SQDIFF_NORMED variant of normAcc: same clamping, but the degenerate
// fallback is 1 (worst possible normalized squared difference) instead
// of 0.
// Fix: the in-range `return num / denum;` and the trailing `return 1;`
// were missing — the first `if` had no body and the function fell off the
// end.
__device__ float normAcc_SQDIFF(float num, float denum)
{
    if (::fabs(num) < denum)
        return num / denum;
    if (::fabs(num) < denum * 1.125f)
        return num > 0 ? 1 : -1;
    return 1;
}
// Converts a precomputed CCORR response into normalized SQDIFF:
// (sum(I^2) - 2*ccorr + sum(T^2)) / sqrt(sum(I^2) * sum(T^2)), with
// normAcc_SQDIFF handling clamping/degenerate windows. image_sqsum is
// read via the four-corner difference (assumed integral image of squares,
// channel-interleaved with stride cn — confirm against the caller).
// Fix: the kernel uses `cn` and is instantiated as <cn> by its caller,
// but the `template <int cn>` header was missing.
template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
        int w, int h, const PtrStep<unsigned long long> image_sqsum,
        unsigned long long templ_sqsum, PtrStepSzf result)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < result.cols && y < result.rows)
    {
        float image_sqsum_ = (float)(
                (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
        float ccorr = result.ptr(y)[x];
        result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,
                                          sqrtf(image_sqsum_ * templ_sqsum));
    }
}
// Templated launcher for the prepared SQDIFF_NORMED kernel. Fixes:
// restore the missing `template <int cn>` header (the body launches the
// kernel as <cn>), and synchronize only on the default stream.
template <int cn>
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
                                            PtrStepSzf result, cudaStream_t stream)
{
    const dim3 threads(32, 8);
    const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

    matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// Runtime dispatcher: selects the templated SQDIFF_NORMED launcher
// matching cn. cn must be in [1, 4]; slot 0 is a deliberate null entry.
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
                                            PtrStepSzf result, int cn, cudaStream_t stream)
{
    typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);

    static const caller_t callers[] =
    {
        0,
        matchTemplatePrepared_SQDIFF_NORMED_8U<1>,
        matchTemplatePrepared_SQDIFF_NORMED_8U<2>,
        matchTemplatePrepared_SQDIFF_NORMED_8U<3>,
        matchTemplatePrepared_SQDIFF_NORMED_8U<4>
    };

    callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
}
334 //////////////////////////////////////////////////////////////////////
// Converts a precomputed CCORR response into CCOEFF by subtracting
// mean(T) * windowSum(I): templ_sum_scale is sum(T)/(w*h) and the
// windowed image sum comes from the four-corner difference on image_sum.
__global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < result.cols && y < result.rows)
    {
        const float window_sum = (float)(
                (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
                (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
        const float ccorr = result.ptr(y)[x];
        result.ptr(y)[x] = ccorr - window_sum * templ_sum_scale;
    }
}
// Host launcher for the single-channel CCOEFF kernel. Fixes: restore the
// missing `dim3 threads(32, 8);` declaration (`threads` was used
// undeclared), and synchronize only on the default stream.
void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream)
{
    dim3 threads(32, 8);
    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

    // (float)templ_sum / (w * h) == mean of the template.
    matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// Two-channel CCOEFF conversion: subtracts per-channel
// mean(T_c) * windowSum(I_c) from the precomputed CCORR response.
// Fix: the kernel writes to `result` but the `PtrStepSzf result`
// parameter was missing from the signature.
__global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
        int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
        const PtrStep<unsigned int> image_sum_r,
        const PtrStep<unsigned int> image_sum_g,
        PtrStepSzf result)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < result.cols && y < result.rows)
    {
        float image_sum_r_ = (float)(
                (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
        float image_sum_g_ = (float)(
                (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
        float ccorr = result.ptr(y)[x];
        result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                 - image_sum_g_ * templ_sum_scale_g;
    }
}
// Host launcher for the two-channel CCOEFF kernel. Fixes: restore the
// missing `int w, int h` parameters (both are used in the body), the
// missing `dim3 threads(32, 8);` declaration, and synchronize only on
// the default stream.
void matchTemplatePrepared_CCOFF_8UC2(
        int w, int h,
        const PtrStepSz<unsigned int> image_sum_r,
        const PtrStepSz<unsigned int> image_sum_g,
        unsigned int templ_sum_r, unsigned int templ_sum_g,
        PtrStepSzf result, cudaStream_t stream)
{
    dim3 threads(32, 8);
    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

    // Per-channel template means.
    matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
            w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
            image_sum_r, image_sum_g, result);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// Three-channel CCOEFF conversion: subtracts per-channel
// mean(T_c) * windowSum(I_c) from the precomputed CCORR response.
// Fix: the kernel uses `w`, `h` and writes `result`, but the
// `int w, int h,` and `PtrStepSzf result` parameters were missing.
__global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
        int w, int h,
        float templ_sum_scale_r,
        float templ_sum_scale_g,
        float templ_sum_scale_b,
        const PtrStep<unsigned int> image_sum_r,
        const PtrStep<unsigned int> image_sum_g,
        const PtrStep<unsigned int> image_sum_b,
        PtrStepSzf result)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < result.cols && y < result.rows)
    {
        float image_sum_r_ = (float)(
                (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
        float image_sum_g_ = (float)(
                (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
        float image_sum_b_ = (float)(
                (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
        float ccorr = result.ptr(y)[x];
        result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                 - image_sum_g_ * templ_sum_scale_g
                                 - image_sum_b_ * templ_sum_scale_b;
    }
}
// Host launcher for the three-channel CCOEFF kernel. Fixes: restore the
// missing `int w, int h` parameters, the missing `dim3 threads(32, 8);`
// declaration, the missing `w, h,` kernel arguments, and synchronize
// only on the default stream.
void matchTemplatePrepared_CCOFF_8UC3(
        int w, int h,
        const PtrStepSz<unsigned int> image_sum_r,
        const PtrStepSz<unsigned int> image_sum_g,
        const PtrStepSz<unsigned int> image_sum_b,
        unsigned int templ_sum_r,
        unsigned int templ_sum_g,
        unsigned int templ_sum_b,
        PtrStepSzf result, cudaStream_t stream)
{
    dim3 threads(32, 8);
    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

    // Per-channel template means.
    matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
            w, h,
            (float)templ_sum_r / (w * h),
            (float)templ_sum_g / (w * h),
            (float)templ_sum_b / (w * h),
            image_sum_r, image_sum_g, image_sum_b, result);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// Four-channel CCOEFF conversion: subtracts per-channel
// mean(T_c) * windowSum(I_c) from the precomputed CCORR response.
// Fix: the kernel uses `w`, `h` and writes `result`, but the
// `int w, int h,` and `PtrStepSzf result` parameters were missing.
__global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
        int w, int h,
        float templ_sum_scale_r,
        float templ_sum_scale_g,
        float templ_sum_scale_b,
        float templ_sum_scale_a,
        const PtrStep<unsigned int> image_sum_r,
        const PtrStep<unsigned int> image_sum_g,
        const PtrStep<unsigned int> image_sum_b,
        const PtrStep<unsigned int> image_sum_a,
        PtrStepSzf result)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < result.cols && y < result.rows)
    {
        float image_sum_r_ = (float)(
                (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
        float image_sum_g_ = (float)(
                (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
        float image_sum_b_ = (float)(
                (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
        float image_sum_a_ = (float)(
                (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
                (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
        float ccorr = result.ptr(y)[x];
        result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                 - image_sum_g_ * templ_sum_scale_g
                                 - image_sum_b_ * templ_sum_scale_b
                                 - image_sum_a_ * templ_sum_scale_a;
    }
}
// Host launcher for the four-channel CCOEFF kernel. Fixes: restore the
// missing `int w, int h` parameters, the missing `dim3 threads(32, 8);`
// declaration, the missing `w, h,` and `result` kernel arguments, and
// synchronize only on the default stream.
void matchTemplatePrepared_CCOFF_8UC4(
        int w, int h,
        const PtrStepSz<unsigned int> image_sum_r,
        const PtrStepSz<unsigned int> image_sum_g,
        const PtrStepSz<unsigned int> image_sum_b,
        const PtrStepSz<unsigned int> image_sum_a,
        unsigned int templ_sum_r,
        unsigned int templ_sum_g,
        unsigned int templ_sum_b,
        unsigned int templ_sum_a,
        PtrStepSzf result, cudaStream_t stream)
{
    dim3 threads(32, 8);
    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

    // Per-channel template means.
    matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
            w, h,
            (float)templ_sum_r / (w * h),
            (float)templ_sum_g / (w * h),
            (float)templ_sum_b / (w * h),
            (float)templ_sum_a / (w * h),
            image_sum_r, image_sum_g, image_sum_b, image_sum_a,
            result);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
534 //////////////////////////////////////////////////////////////////////
535 // Prepared_CCOFF_NORMED
// Normalized single-channel CCOEFF: numerator is ccorr - mean(T)*sum(I),
// denominator is sqrt(varT_total * (sum(I^2) - sum(I)^2 / (w*h))), both
// built from the precomputed integral sums; normAcc handles clamping and
// degenerate windows. weight is 1/(w*h).
// Fix: the kernel writes `result` but the `PtrStepSzf result` parameter
// was missing from the signature.
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
        int w, int h, float weight,
        float templ_sum_scale, float templ_sqsum_scale,
        const PtrStep<unsigned int> image_sum,
        const PtrStep<unsigned long long> image_sqsum,
        PtrStepSzf result)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < result.cols && y < result.rows)
    {
        float ccorr = result.ptr(y)[x];
        float image_sum_ = (float)(
                (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
                (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
        float image_sqsum_ = (float)(
                (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
                (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
        result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,
                                   sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));
    }
}
// Host launcher for normalized single-channel CCOEFF: precomputes the
// template mean (templ_sum_scale) and the template variance-like term
// (templ_sqsum_scale = sum(T^2) - sum(T)^2/(w*h)) on the host. Fixes:
// restore the missing `dim3 threads(32, 8);` declaration and synchronize
// only on the default stream.
void matchTemplatePrepared_CCOFF_NORMED_8U(
        int w, int h, const PtrStepSz<unsigned int> image_sum,
        const PtrStepSz<unsigned long long> image_sqsum,
        unsigned int templ_sum, unsigned long long templ_sqsum,
        PtrStepSzf result, cudaStream_t stream)
{
    dim3 threads(32, 8);
    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

    float weight = 1.f / (w * h);
    float templ_sum_scale = templ_sum * weight;
    float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;

    matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
            w, h, weight, templ_sum_scale, templ_sqsum_scale,
            image_sum, image_sqsum, result);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// Normalized two-channel CCOEFF: per-channel window sums/sq-sums are
// combined into one numerator and one pooled denominator, then clamped
// via normAcc. weight is 1/(w*h).
// Fix: the kernel writes `result` but the `PtrStepSzf result` parameter
// was missing from the signature.
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
        int w, int h, float weight,
        float templ_sum_scale_r, float templ_sum_scale_g,
        float templ_sqsum_scale,
        const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
        const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
        PtrStepSzf result)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < result.cols && y < result.rows)
    {
        float image_sum_r_ = (float)(
                (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
        float image_sqsum_r_ = (float)(
                (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
        float image_sum_g_ = (float)(
                (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
        float image_sqsum_g_ = (float)(
                (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));

        float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
                                     - image_sum_g_ * templ_sum_scale_g;
        float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                 + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));
        result.ptr(y)[x] = normAcc(num, denum);
    }
}
// Host launcher for normalized two-channel CCOEFF. Fixes: restore the
// missing `int w, int h` parameters (used throughout the body), the
// missing `dim3 threads(32, 8);` declaration, the missing kernel-call
// argument lines (`w, h, weight`, `templ_sqsum_scale`, `result`), and
// synchronize only on the default stream.
void matchTemplatePrepared_CCOFF_NORMED_8UC2(
        int w, int h,
        const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
        const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
        unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
        unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
        PtrStepSzf result, cudaStream_t stream)
{
    dim3 threads(32, 8);
    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

    float weight = 1.f / (w * h);
    float templ_sum_scale_r = templ_sum_r * weight;
    float templ_sum_scale_g = templ_sum_g * weight;
    // Pooled template variance-like term across both channels.
    float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                              + templ_sqsum_g - weight * templ_sum_g * templ_sum_g;

    matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
            w, h, weight,
            templ_sum_scale_r, templ_sum_scale_g,
            templ_sqsum_scale,
            image_sum_r, image_sqsum_r,
            image_sum_g, image_sqsum_g,
            result);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// Normalized three-channel CCOEFF: per-channel window sums/sq-sums are
// combined into one numerator and one pooled denominator, then clamped
// via normAcc. weight is 1/(w*h).
// Fix: the kernel writes `result` but the `PtrStepSzf result` parameter
// was missing from the signature.
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
        int w, int h, float weight,
        float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
        float templ_sqsum_scale,
        const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
        const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
        const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
        PtrStepSzf result)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < result.cols && y < result.rows)
    {
        float image_sum_r_ = (float)(
                (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
        float image_sqsum_r_ = (float)(
                (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
        float image_sum_g_ = (float)(
                (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
        float image_sqsum_g_ = (float)(
                (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
        float image_sum_b_ = (float)(
                (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
        float image_sqsum_b_ = (float)(
                (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
                (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));

        float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
                                     - image_sum_g_ * templ_sum_scale_g
                                     - image_sum_b_ * templ_sum_scale_b;
        float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                 + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
                                                 + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));
        result.ptr(y)[x] = normAcc(num, denum);
    }
}
// Host launcher for normalized three-channel CCOEFF. Fixes: restore the
// missing `int w, int h` parameters, the missing `dim3 threads(32, 8);`
// declaration, the missing kernel-call argument lines (`w, h, weight`,
// `templ_sqsum_scale`, `result`), and synchronize only on the default
// stream.
void matchTemplatePrepared_CCOFF_NORMED_8UC3(
        int w, int h,
        const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
        const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
        const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
        unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
        unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
        unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
        PtrStepSzf result, cudaStream_t stream)
{
    dim3 threads(32, 8);
    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

    float weight = 1.f / (w * h);
    float templ_sum_scale_r = templ_sum_r * weight;
    float templ_sum_scale_g = templ_sum_g * weight;
    float templ_sum_scale_b = templ_sum_b * weight;
    // Pooled template variance-like term across all three channels.
    float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                              + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
                              + templ_sqsum_b - weight * templ_sum_b * templ_sum_b;

    matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
            w, h, weight,
            templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
            templ_sqsum_scale,
            image_sum_r, image_sqsum_r,
            image_sum_g, image_sqsum_g,
            image_sum_b, image_sqsum_b,
            result);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// Normalized four-channel CCOEFF: per-channel window sums/sq-sums are
// combined into one numerator and one pooled denominator, then clamped
// via normAcc. weight is 1/(w*h).
// Fix: the kernel writes `result` but the `PtrStepSzf result` parameter
// was missing from the signature.
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
        int w, int h, float weight,
        float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
        float templ_sum_scale_a, float templ_sqsum_scale,
        const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
        const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
        const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
        const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,
        PtrStepSzf result)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < result.cols && y < result.rows)
    {
        float image_sum_r_ = (float)(
                (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
        float image_sqsum_r_ = (float)(
                (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
        float image_sum_g_ = (float)(
                (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
        float image_sqsum_g_ = (float)(
                (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
        float image_sum_b_ = (float)(
                (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
        float image_sqsum_b_ = (float)(
                (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
                (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
        float image_sum_a_ = (float)(
                (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
                (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
        float image_sqsum_a_ = (float)(
                (image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) -
                (image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x]));

        float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g
                                     - image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a;
        float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                 + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
                                                 + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_
                                                 + image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));
        result.ptr(y)[x] = normAcc(num, denum);
    }
}
// Host launcher for normalized four-channel CCOEFF. Fixes: restore the
// missing `int w, int h` parameters, the missing `dim3 threads(32, 8);`
// declaration, the missing kernel-call argument lines (`w, h, weight`,
// `templ_sqsum_scale`, `result`), and synchronize only on the default
// stream.
void matchTemplatePrepared_CCOFF_NORMED_8UC4(
        int w, int h,
        const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
        const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
        const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
        const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
        unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
        unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
        unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
        unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
        PtrStepSzf result, cudaStream_t stream)
{
    dim3 threads(32, 8);
    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

    float weight = 1.f / (w * h);
    float templ_sum_scale_r = templ_sum_r * weight;
    float templ_sum_scale_g = templ_sum_g * weight;
    float templ_sum_scale_b = templ_sum_b * weight;
    float templ_sum_scale_a = templ_sum_a * weight;
    // Pooled template variance-like term across all four channels.
    float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                              + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
                              + templ_sqsum_b - weight * templ_sum_b * templ_sum_b
                              + templ_sqsum_a - weight * templ_sum_a * templ_sum_a;

    matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
            w, h, weight,
            templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
            templ_sqsum_scale,
            image_sum_r, image_sqsum_r,
            image_sum_g, image_sqsum_g,
            image_sum_b, image_sqsum_b,
            image_sum_a, image_sqsum_a,
            result);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
821 //////////////////////////////////////////////////////////////////////
// Divides a CCORR response by sqrt(windowSum(I^2) * sum(T^2)) to produce
// CCORR_NORMED, with normAcc clamping. The windowed sum of squares comes
// from the four-corner difference on image_sqsum (channel-interleaved
// with stride cn — confirm layout against the caller).
// Fix: the kernel uses `cn` and is instantiated as <1>..<4> by
// normalize_8U, but the `template <int cn>` header was missing.
template <int cn>
__global__ void normalizeKernel_8U(
        int w, int h, const PtrStep<unsigned long long> image_sqsum,
        unsigned long long templ_sqsum, PtrStepSzf result)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < result.cols && y < result.rows)
    {
        float image_sqsum_ = (float)(
                (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
        result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));
    }
}
// Host launcher that normalizes a CCORR response in place, selecting the
// kernel instantiation by channel count. Fixes: restore the missing
// `dim3 threads(32, 8);` declaration and the missing `switch (cn)`
// wrapping the four templated launches (they appeared as bare unguarded
// statements); synchronize only on the default stream.
void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
                  unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
{
    dim3 threads(32, 8);
    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

    switch (cn)
    {
    case 1:
        normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
        break;
    case 2:
        normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
        break;
    case 3:
        normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
        break;
    case 4:
        normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
        break;
    }

    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
869 //////////////////////////////////////////////////////////////////////
870 // extractFirstChannel
// Copies the first channel of a cn-channel float image into a
// single-channel result, one thread per pixel.
// Fix: the kernel uses `cn` and is instantiated as <1>..<4> by the host
// wrapper, but the `template <int cn>` header was missing.
template <int cn>
__global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result)
{
    typedef typename TypeVec<float, cn>::vec_type Typef;

    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;

    if (x < result.cols && y < result.rows)
    {
        Typef val = ((const Typef*)image.ptr(y))[x];
        result.ptr(y)[x] = first(val);
    }
}
// Host launcher: extracts the first channel of a 32F image, selecting
// the kernel instantiation by channel count. Fixes: restore the missing
// `dim3 threads(32, 8);` declaration and the missing `switch (cn)`
// wrapping the four templated launches; synchronize only on the default
// stream.
void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream)
{
    dim3 threads(32, 8);
    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

    switch (cn)
    {
    case 1:
        extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
        break;
    case 2:
        extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
        break;
    case 3:
        extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
        break;
    case 4:
        extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
        break;
    }

    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
912 } //namespace match_template
913 }}} // namespace cv { namespace gpu { namespace device
916 #endif /* CUDA_DISABLER */