// modules/gpu/src/cuda/match_template.cu
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/vec_math.hpp"

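// CUDA kernels behind the GPU matchTemplate implementation. The sections below cover the
// naive CCORR/SQDIFF kernels that score every result pixel directly, the "prepared" kernels
// that refine a precomputed cross-correlation map using integral images (window sums and
// sums of squares), and small helpers for normalization and channel extraction.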
namespace cv { namespace gpu { namespace device
{
    namespace match_template
    {
        __device__ __forceinline__ float sum(float v) { return v; }
        __device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
        __device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }
        __device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }

        __device__ __forceinline__ float first(float v) { return v; }
        __device__ __forceinline__ float first(float2 v) { return v.x; }
        __device__ __forceinline__ float first(float3 v) { return v.x; }
        __device__ __forceinline__ float first(float4 v) { return v.x; }

        __device__ __forceinline__ float mul(float a, float b) { return a * b; }
        __device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
        __device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
        __device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }

        __device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }
        __device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
        __device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
        __device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }

        __device__ __forceinline__ float sub(float a, float b) { return a - b; }
        __device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
        __device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
        __device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }

        __device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }
        __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
        __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
        __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }

        //////////////////////////////////////////////////////////////////////
        // Naive_CCORR

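        // One thread per result pixel: slide the w x h template over the image window
        // anchored at (x, y), accumulate the per-channel products, and collapse the
        // channels with sum() to obtain the plain cross-correlation score.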
        template <typename T, int cn>
        __global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
        {
            typedef typename TypeVec<T, cn>::vec_type Type;
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef res = VecTraits<Typef>::all(0);

                for (int i = 0; i < h; ++i)
                {
                    const Type* image_ptr = (const Type*)image.ptr(y + i);
                    const Type* templ_ptr = (const Type*)templ.ptr(i);
                    for (int j = 0; j < w; ++j)
                        res = res + mul(image_ptr[x + j], templ_ptr[j]);
                }

                result.ptr(y)[x] = sum(res);
            }
        }

        template <typename T, int cn>
        void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
            };

            callers[cn](image, templ, result, stream);
        }


        void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Naive_SQDIFF

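        // Same one-thread-per-result-pixel layout as the CCORR kernel, but accumulating
        // the squared per-channel differences between the image window and the template.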
        template <typename T, int cn>
        __global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
        {
            typedef typename TypeVec<T, cn>::vec_type Type;
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef res = VecTraits<Typef>::all(0);
                Typef delta;

                for (int i = 0; i < h; ++i)
                {
                    const Type* image_ptr = (const Type*)image.ptr(y + i);
                    const Type* templ_ptr = (const Type*)templ.ptr(i);
                    for (int j = 0; j < w; ++j)
                    {
                        delta = sub(image_ptr[x + j], templ_ptr[j]);
                        res = res + delta * delta;
                    }
                }

                result.ptr(y)[x] = sum(res);
            }
        }

        template <typename T, int cn>
        void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_SQDIFF

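        // "Prepared" SQDIFF: result already holds the cross-correlation term, so the
        // squared difference is assembled as sum(I^2) - 2*ccorr + sum(T^2). The window
        // sum of squared image values is read from the integral image image_sqsum in
        // O(1) via four corner lookups.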
        template <int cn>
        __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum;
            }
        }

        template <int cn>
        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn,
                                             cudaStream_t stream)
        {
            typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
            };

            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_SQDIFF_NORMED

        // normAcc* are numerically careful normalization routines that keep the GPU
        // matchTemplate results consistent with the CPU implementation.
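        // If rounding pushes |num| slightly above the denominator (by up to 12.5%), the
        // ratio is clamped to +/-1; beyond that the denominator is treated as degenerate
        // and a neutral value is returned (0 for correlation scores, 1 for SQDIFF).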

        __device__ float normAcc(float num, float denum)
        {
            if (::fabs(num) < denum)
                return num / denum;
            if (::fabs(num) < denum * 1.125f)
                return num > 0 ? 1 : -1;
            return 0;
        }


        __device__ float normAcc_SQDIFF(float num, float denum)
        {
            if (::fabs(num) < denum)
                return num / denum;
            if (::fabs(num) < denum * 1.125f)
                return num > 0 ? 1 : -1;
            return 1;
        }

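        // Normalized variant of the prepared SQDIFF: the same numerator is divided by
        // sqrt(sum(I^2) * sum(T^2)), with normAcc_SQDIFF guarding against rounding and
        // near-zero denominators.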
        template <int cn>
        __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
                int w, int h, const PtrStep<unsigned long long> image_sqsum,
                unsigned long long templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,
                                                  sqrtf(image_sqsum_ * templ_sqsum));
            }
        }

        template <int cn>
        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
                                                    PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }


        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
                                                    PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
            static const caller_t callers[] =
            {
                0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
            };

            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_CCOFF

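        // CCOEFF for single-channel 8U data: subtract mean(T) * sum(I over the window)
        // from the precomputed cross-correlation. templ_sum_scale is sum(T) / (w*h) and
        // the window sum of image values comes from the integral image image_sum.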
        __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_ = (float)(
                        (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
                        (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;
            }
        }

        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
                int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
                const PtrStep<unsigned int> image_sum_r,
                const PtrStep<unsigned int> image_sum_g,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g;
            }
        }

        void matchTemplatePrepared_CCOFF_8UC2(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r,
                const PtrStepSz<unsigned int> image_sum_g,
                unsigned int templ_sum_r, unsigned int templ_sum_g,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
                    w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
                    image_sum_r, image_sum_g, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
                int w, int h,
                float templ_sum_scale_r,
                float templ_sum_scale_g,
                float templ_sum_scale_b,
                const PtrStep<unsigned int> image_sum_r,
                const PtrStep<unsigned int> image_sum_g,
                const PtrStep<unsigned int> image_sum_b,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g
                                         - image_sum_b_ * templ_sum_scale_b;
            }
        }

        void matchTemplatePrepared_CCOFF_8UC3(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r,
                const PtrStepSz<unsigned int> image_sum_g,
                const PtrStepSz<unsigned int> image_sum_b,
                unsigned int templ_sum_r,
                unsigned int templ_sum_g,
                unsigned int templ_sum_b,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
                    w, h,
                    (float)templ_sum_r / (w * h),
                    (float)templ_sum_g / (w * h),
                    (float)templ_sum_b / (w * h),
                    image_sum_r, image_sum_g, image_sum_b, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
                int w, int h,
                float templ_sum_scale_r,
                float templ_sum_scale_g,
                float templ_sum_scale_b,
                float templ_sum_scale_a,
                const PtrStep<unsigned int> image_sum_r,
                const PtrStep<unsigned int> image_sum_g,
                const PtrStep<unsigned int> image_sum_b,
                const PtrStep<unsigned int> image_sum_a,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float image_sum_a_ = (float)(
                        (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
                        (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g
                                         - image_sum_b_ * templ_sum_scale_b
                                         - image_sum_a_ * templ_sum_scale_a;
            }
        }

        void matchTemplatePrepared_CCOFF_8UC4(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r,
                const PtrStepSz<unsigned int> image_sum_g,
                const PtrStepSz<unsigned int> image_sum_b,
                const PtrStepSz<unsigned int> image_sum_a,
                unsigned int templ_sum_r,
                unsigned int templ_sum_g,
                unsigned int templ_sum_b,
                unsigned int templ_sum_a,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
                    w, h,
                    (float)templ_sum_r / (w * h),
                    (float)templ_sum_g / (w * h),
                    (float)templ_sum_b / (w * h),
                    (float)templ_sum_a / (w * h),
                    image_sum_r, image_sum_g, image_sum_b, image_sum_a,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_CCOFF_NORMED

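        // Normalized CCOEFF: the numerator is ccorr - sum(I) * mean(T) as above; the
        // denominator is sqrt(templ_sqsum_scale * (sum(I^2) - sum(I)^2 / (w*h))), where
        // templ_sqsum_scale = sum(T^2) - sum(T)^2 / (w*h) is the template's variance
        // term computed once on the host. The multi-channel variants below repeat the
        // same corner lookups per channel and pool the variance terms.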
        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
                int w, int h, float weight,
                float templ_sum_scale, float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum,
                const PtrStep<unsigned long long> image_sqsum,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float ccorr = result.ptr(y)[x];
                float image_sum_ = (float)(
                        (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
                        (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
                        (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
                result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,
                                           sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8U(
                    int w, int h, const PtrStepSz<unsigned int> image_sum,
                    const PtrStepSz<unsigned long long> image_sqsum,
                    unsigned int templ_sum, unsigned long long templ_sqsum,
                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale = templ_sum * weight;
            float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;

            matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
                    w, h, weight, templ_sum_scale, templ_sqsum_scale,
                    image_sum, image_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g,
                float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sqsum_r_ = (float)(
                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sqsum_g_ = (float)(
                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
                                             - image_sum_g_ * templ_sum_scale_g;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
                    int w, int h,
                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                       + templ_sqsum_g - weight * templ_sum_g * templ_sum_g;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
                float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sqsum_r_ = (float)(
                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sqsum_g_ = (float)(
                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float image_sqsum_b_ = (float)(
                        (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
                        (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
                                             - image_sum_g_ * templ_sum_scale_g
                                             - image_sum_b_ * templ_sum_scale_b;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
                                                         + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
                    int w, int h,
                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                    const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                    unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sum_scale_b = templ_sum_b * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
                                      + templ_sqsum_b - weight * templ_sum_b * templ_sum_b;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    image_sum_b, image_sqsum_b,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
                float templ_sum_scale_a, float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
                const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sqsum_r_ = (float)(
                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sqsum_g_ = (float)(
                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float image_sqsum_b_ = (float)(
                        (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
                        (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
                float image_sum_a_ = (float)(
                        (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
                        (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
                float image_sqsum_a_ = (float)(
                        (image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) -
                        (image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x]));

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g
                                             - image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
                                                         + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_
                                                         + image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
                    int w, int h,
                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                    const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
                    const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                    unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
                    unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sum_scale_b = templ_sum_b * weight;
            float templ_sum_scale_a = templ_sum_a * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
                                      + templ_sqsum_b - weight * templ_sum_b * templ_sum_b
                                      + templ_sqsum_a - weight * templ_sum_a * templ_sum_a;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    image_sum_b, image_sqsum_b,
                    image_sum_a, image_sqsum_a,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // normalize

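        // Divides a raw cross-correlation map by sqrt(sum(I^2 over the window) * sum(T^2))
        // to produce CCORR_NORMED, again reading the window sum from the squared integral
        // image; normAcc clamps any rounding overshoot.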
        template <int cn>
        __global__ void normalizeKernel_8U(
                int w, int h, const PtrStep<unsigned long long> image_sqsum,
                unsigned long long templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
                result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));
            }
        }

        void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
                          unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            switch (cn)
            {
            case 1:
                normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 2:
                normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 3:
                normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 4:
                normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            }

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // extractFirstChannel

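        // Copies the first channel of a multi-channel 32F image into the single-channel
        // result buffer, with the host wrapper dispatching on the channel count.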
        template <int cn>
        __global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result)
        {
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef val = ((const Typef*)image.ptr(y))[x];
                result.ptr(y)[x] = first(val);
            }
        }

        void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            switch (cn)
            {
            case 1:
                extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 2:
                extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 3:
                extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 4:
                extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
                break;
            }
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    } //namespace match_template
}}} // namespace cv { namespace gpu { namespace device


#endif /* CUDA_DISABLER */