/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/type_traits.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;
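
// Device-side implementations of the matrix reduction primitives of the gpu module:
// sum / absSum / sqrSum, minMax, minMaxLoc, countNonZero and the row / column
// reductions used by reduce. Each section below pairs one or two __global__ kernels
// with small host-side helpers (getLaunchCfg, getBufSize, run) that configure the
// launch and copy the final result back to the CPU.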

namespace detail
{
    __device__ __forceinline__ int cvAtomicAdd(int* address, int val)
    {
        return ::atomicAdd(address, val);
    }
    __device__ __forceinline__ unsigned int cvAtomicAdd(unsigned int* address, unsigned int val)
    {
        return ::atomicAdd(address, val);
    }
    __device__ __forceinline__ float cvAtomicAdd(float* address, float val)
    {
    #if __CUDA_ARCH__ >= 200
        return ::atomicAdd(address, val);
    #else
        int* address_as_i = (int*) address;
        int old = *address_as_i, assumed;
        do {
            assumed = old;
            old = ::atomicCAS(address_as_i, assumed, __float_as_int(val + __int_as_float(assumed)));
        } while (assumed != old);
        return __int_as_float(old);
    #endif
    }
    __device__ __forceinline__ double cvAtomicAdd(double* address, double val)
    {
    #if __CUDA_ARCH__ >= 130
        unsigned long long int* address_as_ull = (unsigned long long int*) address;
        unsigned long long int old = *address_as_ull, assumed;
        do {
            assumed = old;
            old = ::atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
        } while (assumed != old);
        return __longlong_as_double(old);
    #else
        (void) address; (void) val;
        return 0.0;
    #endif
    }

    __device__ __forceinline__ int cvAtomicMin(int* address, int val)
    {
        return ::atomicMin(address, val);
    }
    __device__ __forceinline__ float cvAtomicMin(float* address, float val)
    {
    #if __CUDA_ARCH__ >= 120
        int* address_as_i = (int*) address;
        int old = *address_as_i, assumed;
        do {
            assumed = old;
            old = ::atomicCAS(address_as_i, assumed, __float_as_int(::fminf(val, __int_as_float(assumed))));
        } while (assumed != old);
        return __int_as_float(old);
    #else
        (void) address; (void) val;
        return 0.0f;
    #endif
    }
    __device__ __forceinline__ double cvAtomicMin(double* address, double val)
    {
    #if __CUDA_ARCH__ >= 130
        unsigned long long int* address_as_ull = (unsigned long long int*) address;
        unsigned long long int old = *address_as_ull, assumed;
        do {
            assumed = old;
            old = ::atomicCAS(address_as_ull, assumed, __double_as_longlong(::fmin(val, __longlong_as_double(assumed))));
        } while (assumed != old);
        return __longlong_as_double(old);
    #else
        (void) address; (void) val;
        return 0.0;
    #endif
    }

    __device__ __forceinline__ int cvAtomicMax(int* address, int val)
    {
        return ::atomicMax(address, val);
    }
    __device__ __forceinline__ float cvAtomicMax(float* address, float val)
    {
    #if __CUDA_ARCH__ >= 120
        int* address_as_i = (int*) address;
        int old = *address_as_i, assumed;
        do {
            assumed = old;
            old = ::atomicCAS(address_as_i, assumed, __float_as_int(::fmaxf(val, __int_as_float(assumed))));
        } while (assumed != old);
        return __int_as_float(old);
    #else
        (void) address; (void) val;
        return 0.0f;
    #endif
    }
    __device__ __forceinline__ double cvAtomicMax(double* address, double val)
    {
    #if __CUDA_ARCH__ >= 130
        unsigned long long int* address_as_ull = (unsigned long long int*) address;
        unsigned long long int old = *address_as_ull, assumed;
        do {
            assumed = old;
            old = ::atomicCAS(address_as_ull, assumed, __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
        } while (assumed != old);
        return __longlong_as_double(old);
    #else
        (void) address; (void) val;
        return 0.0;
    #endif
    }
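
    // Illustrative note (not part of the original code): the cvAtomic* overloads above
    // fall back to a compare-and-swap loop on architectures that lack a native atomic
    // for the given type. A caller could, for example, accumulate a global double total
    // from any kernel with
    //
    //   __global__ void accumulate(const double* data, int n, double* total)
    //   {
    //       const int i = blockIdx.x * blockDim.x + threadIdx.x;
    //       if (i < n)
    //           detail::cvAtomicAdd(total, data[i]);
    //   }
    //
    // The CAS loop simply retries until no other thread has changed the word in the
    // meantime, which keeps the update atomic at the cost of retries under contention.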

    template <int cn> struct Unroll;
    template <> struct Unroll<1>
    {
        template <int BLOCK_SIZE, typename R>
        static __device__ __forceinline__ volatile R* smem_tuple(R* smem)
        {
            return smem;
        }
        template <typename R>
        static __device__ __forceinline__ R& tie(R& val)
        {
            return val;
        }
        template <class Op>
        static __device__ __forceinline__ const Op& op(const Op& op)
        {
            return op;
        }
    };
    template <> struct Unroll<2>
    {
        template <int BLOCK_SIZE, typename R>
        static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*> smem_tuple(R* smem)
        {
            return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE);
        }
        template <typename R>
        static __device__ __forceinline__ thrust::tuple<typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&> tie(R& val)
        {
            return thrust::tie(val.x, val.y);
        }
        template <class Op>
        static __device__ __forceinline__ const thrust::tuple<Op, Op> op(const Op& op)
        {
            return thrust::make_tuple(op, op);
        }
    };
    template <> struct Unroll<3>
    {
        template <int BLOCK_SIZE, typename R>
        static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
        {
            return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
        }
        template <typename R>
        static __device__ __forceinline__ thrust::tuple<typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&> tie(R& val)
        {
            return thrust::tie(val.x, val.y, val.z);
        }
        template <class Op>
        static __device__ __forceinline__ const thrust::tuple<Op, Op, Op> op(const Op& op)
        {
            return thrust::make_tuple(op, op, op);
        }
    };
    template <> struct Unroll<4>
    {
        template <int BLOCK_SIZE, typename R>
        static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
        {
            return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
        }
        template <typename R>
        static __device__ __forceinline__ thrust::tuple<typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&> tie(R& val)
        {
            return thrust::tie(val.x, val.y, val.z, val.w);
        }
        template <class Op>
        static __device__ __forceinline__ const thrust::tuple<Op, Op, Op, Op> op(const Op& op)
        {
            return thrust::make_tuple(op, op, op, op);
        }
    };
}
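
// detail::Unroll<cn> (above) flattens a cn-channel accumulator into a thrust::tuple of
// its scalar components, together with matching shared-memory lanes and one operator per
// component, so the generic device::reduce can reduce all channels of a block in one call.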

/////////////////////////////////////////////////////////////
// Sum

namespace sum
{
    __device__ unsigned int blocks_finished = 0;

    template <typename R, int cn> struct AtomicAdd;
    template <typename R> struct AtomicAdd<R, 1>
    {
        static __device__ void run(R* ptr, R val)
        {
            detail::cvAtomicAdd(ptr, val);
        }
    };
    template <typename R> struct AtomicAdd<R, 2>
    {
        typedef typename TypeVec<R, 2>::vec_type val_type;

        static __device__ void run(R* ptr, val_type val)
        {
            detail::cvAtomicAdd(ptr, val.x);
            detail::cvAtomicAdd(ptr + 1, val.y);
        }
    };
    template <typename R> struct AtomicAdd<R, 3>
    {
        typedef typename TypeVec<R, 3>::vec_type val_type;

        static __device__ void run(R* ptr, val_type val)
        {
            detail::cvAtomicAdd(ptr, val.x);
            detail::cvAtomicAdd(ptr + 1, val.y);
            detail::cvAtomicAdd(ptr + 2, val.z);
        }
    };
    template <typename R> struct AtomicAdd<R, 4>
    {
        typedef typename TypeVec<R, 4>::vec_type val_type;

        static __device__ void run(R* ptr, val_type val)
        {
            detail::cvAtomicAdd(ptr, val.x);
            detail::cvAtomicAdd(ptr + 1, val.y);
            detail::cvAtomicAdd(ptr + 2, val.z);
            detail::cvAtomicAdd(ptr + 3, val.w);
        }
    };

    template <int BLOCK_SIZE, typename R, int cn>
    struct GlobalReduce
    {
        typedef typename TypeVec<R, cn>::vec_type result_type;

        static __device__ void run(result_type& sum, result_type* result, int tid, int bid, R* smem)
        {
        #if __CUDA_ARCH__ >= 200
            if (tid == 0)
                AtomicAdd<R, cn>::run((R*) result, sum);
        #else
            __shared__ bool is_last;

            if (tid == 0)
            {
                result[bid] = sum;

                __threadfence();

                unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
                is_last = (ticket == gridDim.x * gridDim.y - 1);
            }

            __syncthreads();

            if (is_last)
            {
                sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<result_type>::all(0);

                device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));

                if (tid == 0)
                {
                    result[0] = sum;

                    blocks_finished = 0;
                }
            }
        #endif
        }
    };
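
    // Two-level reduction strategy: on Fermi and newer (__CUDA_ARCH__ >= 200) every block
    // atomically adds its block total straight into the output buffer. On older devices
    // each block writes its partial sum to global memory and takes a ticket from
    // blocks_finished; the last block to finish re-reduces all partial sums and leaves
    // the grand total in result[0].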

    template <int BLOCK_SIZE, typename src_type, typename result_type, class Op>
    __global__ void kernel(const PtrStepSz<src_type> src, result_type* result, const Op op, const int twidth, const int theight)
    {
        typedef typename VecTraits<src_type>::elem_type T;
        typedef typename VecTraits<result_type>::elem_type R;
        const int cn = VecTraits<src_type>::cn;

        __shared__ R smem[BLOCK_SIZE * cn];

        const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
        const int bid = blockIdx.y * gridDim.x + blockIdx.x;

        result_type sum = VecTraits<result_type>::all(0);

        for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
        {
            const src_type* ptr = src.ptr(y);

            for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
            {
                const src_type srcVal = ptr[x];

                sum = sum + op(saturate_cast<result_type>(srcVal));
            }
        }

        device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));

        GlobalReduce<BLOCK_SIZE, R, cn>::run(sum, result, tid, bid, smem);
    }
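
    // Each block covers a tile of (blockDim.x * twidth) x (blockDim.y * theight) pixels;
    // every thread strides through that tile accumulating a private partial sum, which
    // device::reduce then folds across the block in shared memory before the grid-wide
    // step handled by GlobalReduce.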

    const int threads_x = 32;
    const int threads_y = 8;

    void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
    {
        block = dim3(threads_x, threads_y);

        grid = dim3(divUp(cols, block.x * block.y),
                    divUp(rows, block.y * block.x));

        grid.x = ::min(grid.x, block.x);
        grid.y = ::min(grid.y, block.y);
    }

    void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows)
    {
        dim3 block, grid;
        getLaunchCfg(cols, rows, block, grid);

        bufcols = grid.x * grid.y * sizeof(double) * cn;
        bufrows = 1;
    }

    template <typename T, typename R, int cn, template <typename> class Op>
    void caller(PtrStepSzb src_, void* buf_, double* out)
    {
        typedef typename TypeVec<T, cn>::vec_type src_type;
        typedef typename TypeVec<R, cn>::vec_type result_type;

        PtrStepSz<src_type> src(src_);
        result_type* buf = (result_type*) buf_;

        dim3 block, grid;
        getLaunchCfg(src.cols, src.rows, block, grid);

        const int twidth = divUp(divUp(src.cols, grid.x), block.x);
        const int theight = divUp(divUp(src.rows, grid.y), block.y);

        Op<result_type> op;

        kernel<threads_x * threads_y><<<grid, block>>>(src, buf, op, twidth, theight);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );

        R result[4] = {0, 0, 0, 0};
        cudaSafeCall( cudaMemcpy(&result, buf, sizeof(result_type), cudaMemcpyDeviceToHost) );

        out[0] = result[0];
        out[1] = result[1];
        out[2] = result[2];
        out[3] = result[3];
    }
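
    // Illustrative host-side call sequence (hypothetical code, not part of this file):
    //
    //   int bufcols, bufrows;
    //   sum::getBufSize(src.cols, src.rows, src.channels(), bufcols, bufrows);
    //   ensureSizeIsEnough(bufrows, bufcols, CV_8U, buf);   // GpuMat scratch buffer
    //   buf.setTo(cv::Scalar::all(0));                      // kernels accumulate into it
    //   double out[4] = {0, 0, 0, 0};
    //   sum::run<uchar, 3>(src, buf.data, out);             // per-channel sums in out[0..2]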

    template <typename T> struct SumType;
    template <> struct SumType<uchar> { typedef unsigned int R; };
    template <> struct SumType<schar> { typedef int R; };
    template <> struct SumType<ushort> { typedef unsigned int R; };
    template <> struct SumType<short> { typedef int R; };
    template <> struct SumType<int> { typedef int R; };
    template <> struct SumType<float> { typedef float R; };
    template <> struct SumType<double> { typedef double R; };

    template <typename T, int cn>
    void run(PtrStepSzb src, void* buf, double* out)
    {
        typedef typename SumType<T>::R R;
        caller<T, R, cn, identity>(src, buf, out);
    }

    template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out);
    template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out);
    template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out);
    template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out);

    template void run<schar, 1>(PtrStepSzb src, void* buf, double* out);
    template void run<schar, 2>(PtrStepSzb src, void* buf, double* out);
    template void run<schar, 3>(PtrStepSzb src, void* buf, double* out);
    template void run<schar, 4>(PtrStepSzb src, void* buf, double* out);

    template void run<ushort, 1>(PtrStepSzb src, void* buf, double* out);
    template void run<ushort, 2>(PtrStepSzb src, void* buf, double* out);
    template void run<ushort, 3>(PtrStepSzb src, void* buf, double* out);
    template void run<ushort, 4>(PtrStepSzb src, void* buf, double* out);

    template void run<short, 1>(PtrStepSzb src, void* buf, double* out);
    template void run<short, 2>(PtrStepSzb src, void* buf, double* out);
    template void run<short, 3>(PtrStepSzb src, void* buf, double* out);
    template void run<short, 4>(PtrStepSzb src, void* buf, double* out);

    template void run<int, 1>(PtrStepSzb src, void* buf, double* out);
    template void run<int, 2>(PtrStepSzb src, void* buf, double* out);
    template void run<int, 3>(PtrStepSzb src, void* buf, double* out);
    template void run<int, 4>(PtrStepSzb src, void* buf, double* out);

    template void run<float, 1>(PtrStepSzb src, void* buf, double* out);
    template void run<float, 2>(PtrStepSzb src, void* buf, double* out);
    template void run<float, 3>(PtrStepSzb src, void* buf, double* out);
    template void run<float, 4>(PtrStepSzb src, void* buf, double* out);

    template void run<double, 1>(PtrStepSzb src, void* buf, double* out);
    template void run<double, 2>(PtrStepSzb src, void* buf, double* out);
    template void run<double, 3>(PtrStepSzb src, void* buf, double* out);
    template void run<double, 4>(PtrStepSzb src, void* buf, double* out);

    template <typename T, int cn>
    void runAbs(PtrStepSzb src, void* buf, double* out)
    {
        typedef typename SumType<T>::R R;
        caller<T, R, cn, abs_func>(src, buf, out);
    }

    template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out);

    template void runAbs<schar, 1>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<schar, 2>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<schar, 3>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<schar, 4>(PtrStepSzb src, void* buf, double* out);

    template void runAbs<ushort, 1>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<ushort, 2>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<ushort, 3>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<ushort, 4>(PtrStepSzb src, void* buf, double* out);

    template void runAbs<short, 1>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<short, 2>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<short, 3>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<short, 4>(PtrStepSzb src, void* buf, double* out);

    template void runAbs<int, 1>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out);

    template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out);

    template void runAbs<double, 1>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out);
    template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out);

    template <typename T> struct Sqr : unary_function<T, T>
    {
        __device__ __forceinline__ T operator ()(T x) const
        {
            return x * x;
        }
    };

    template <typename T, int cn>
    void runSqr(PtrStepSzb src, void* buf, double* out)
    {
        caller<T, double, cn, Sqr>(src, buf, out);
    }

    template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out);

    template void runSqr<schar, 1>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<schar, 2>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<schar, 3>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<schar, 4>(PtrStepSzb src, void* buf, double* out);

    template void runSqr<ushort, 1>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<ushort, 2>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<ushort, 3>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<ushort, 4>(PtrStepSzb src, void* buf, double* out);

    template void runSqr<short, 1>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<short, 2>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<short, 3>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<short, 4>(PtrStepSzb src, void* buf, double* out);

    template void runSqr<int, 1>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out);

    template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out);

    template void runSqr<double, 1>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out);
    template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out);
}

/////////////////////////////////////////////////////////////
// minMax

namespace minMax
{
    __device__ unsigned int blocks_finished = 0;

    // To avoid shared memory bank conflicts we convert each value into a value of
    // an appropriate type (32 bits minimum)
    template <typename T> struct MinMaxTypeTraits;
    template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<schar> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
    template <> struct MinMaxTypeTraits<double> { typedef double best_type; };

    template <int BLOCK_SIZE, typename R>
    struct GlobalReduce
    {
        static __device__ void run(R& mymin, R& mymax, R* minval, R* maxval, int tid, int bid, R* sminval, R* smaxval)
        {
        #if __CUDA_ARCH__ >= 200
            if (tid == 0)
            {
                detail::cvAtomicMin(minval, mymin);
                detail::cvAtomicMax(maxval, mymax);
            }
        #else
            __shared__ bool is_last;

            if (tid == 0)
            {
                minval[bid] = mymin;
                maxval[bid] = mymax;

                __threadfence();

                unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
                is_last = (ticket == gridDim.x * gridDim.y - 1);
            }

            __syncthreads();

            if (is_last)
            {
                int idx = ::min(tid, gridDim.x * gridDim.y - 1);

                mymin = minval[idx];
                mymax = maxval[idx];

                const minimum<R> minOp;
                const maximum<R> maxOp;
                device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));

                if (tid == 0)
                {
                    minval[0] = mymin;
                    maxval[0] = mymax;

                    blocks_finished = 0;
                }
            }
        #endif
        }
    };
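
    // Same two-level scheme as sum::GlobalReduce, but the per-block results are combined
    // with atomic min / max instead of atomic add on Fermi and newer devices.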

    template <int BLOCK_SIZE, typename T, typename R, class Mask>
    __global__ void kernel(const PtrStepSz<T> src, const Mask mask, R* minval, R* maxval, const int twidth, const int theight)
    {
        __shared__ R sminval[BLOCK_SIZE];
        __shared__ R smaxval[BLOCK_SIZE];

        const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
        const int bid = blockIdx.y * gridDim.x + blockIdx.x;

        R mymin = numeric_limits<R>::max();
        R mymax = -numeric_limits<R>::max();

        const minimum<R> minOp;
        const maximum<R> maxOp;

        for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
        {
            const T* ptr = src.ptr(y);

            for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
            {
                if (mask(y, x))
                {
                    const R srcVal = ptr[x];

                    mymin = minOp(mymin, srcVal);
                    mymax = maxOp(mymax, srcVal);
                }
            }
        }

        device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));

        GlobalReduce<BLOCK_SIZE, R>::run(mymin, mymax, minval, maxval, tid, bid, sminval, smaxval);
    }
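
    // The Mask template parameter is either SingleMask (honour the user-supplied mask) or
    // WithOutMask (always true), so the same kernel serves both the masked and unmasked
    // paths selected in run() below.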

    const int threads_x = 32;
    const int threads_y = 8;

    void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
    {
        block = dim3(threads_x, threads_y);

        grid = dim3(divUp(cols, block.x * block.y),
                    divUp(rows, block.y * block.x));

        grid.x = ::min(grid.x, block.x);
        grid.y = ::min(grid.y, block.y);
    }

    void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
    {
        dim3 block, grid;
        getLaunchCfg(cols, rows, block, grid);

        bufcols = grid.x * grid.y * sizeof(double);
        bufrows = 2;
    }

    __global__ void setDefaultKernel(int* minval_buf, int* maxval_buf)
    {
        *minval_buf = numeric_limits<int>::max();
        *maxval_buf = numeric_limits<int>::min();
    }
    __global__ void setDefaultKernel(float* minval_buf, float* maxval_buf)
    {
        *minval_buf = numeric_limits<float>::max();
        *maxval_buf = -numeric_limits<float>::max();
    }
    __global__ void setDefaultKernel(double* minval_buf, double* maxval_buf)
    {
        *minval_buf = numeric_limits<double>::max();
        *maxval_buf = -numeric_limits<double>::max();
    }

    template <typename R>
    void setDefault(R* minval_buf, R* maxval_buf)
    {
        setDefaultKernel<<<1, 1>>>(minval_buf, maxval_buf);
    }

    template <typename T>
    void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
    {
        typedef typename MinMaxTypeTraits<T>::best_type R;

        dim3 block, grid;
        getLaunchCfg(src.cols, src.rows, block, grid);

        const int twidth = divUp(divUp(src.cols, grid.x), block.x);
        const int theight = divUp(divUp(src.rows, grid.y), block.y);

        R* minval_buf = (R*) buf.ptr(0);
        R* maxval_buf = (R*) buf.ptr(1);

        setDefault(minval_buf, maxval_buf);

        if (mask.data)
            kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, twidth, theight);
        else
            kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, twidth, theight);

        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );

        R minval_, maxval_;
        cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(R), cudaMemcpyDeviceToHost) );

        *minval = minval_;
        *maxval = maxval_;
    }

    template void run<uchar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<schar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<ushort>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<int   >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
}

/////////////////////////////////////////////////////////////
// minMaxLoc

namespace minMaxLoc
{
    // To avoid shared memory bank conflicts we convert each value into a value of
    // an appropriate type (32 bits minimum)
    template <typename T> struct MinMaxTypeTraits;
    template <> struct MinMaxTypeTraits<unsigned char> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<signed char> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<unsigned short> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
    template <> struct MinMaxTypeTraits<double> { typedef double best_type; };

    template <int BLOCK_SIZE, typename T, class Mask>
    __global__ void kernel_pass_1(const PtrStepSz<T> src, const Mask mask, T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, const int twidth, const int theight)
    {
        typedef typename MinMaxTypeTraits<T>::best_type work_type;

        __shared__ work_type sminval[BLOCK_SIZE];
        __shared__ work_type smaxval[BLOCK_SIZE];
        __shared__ unsigned int sminloc[BLOCK_SIZE];
        __shared__ unsigned int smaxloc[BLOCK_SIZE];

        const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
        const int bid = blockIdx.y * gridDim.x + blockIdx.x;

        work_type mymin = numeric_limits<work_type>::max();
        work_type mymax = -numeric_limits<work_type>::max();
        unsigned int myminloc = 0;
        unsigned int mymaxloc = 0;

        for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
        {
            const T* ptr = src.ptr(y);

            for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
            {
                if (mask(y, x))
                {
                    const work_type srcVal = ptr[x];

                    if (srcVal < mymin)
                    {
                        mymin = srcVal;
                        myminloc = y * src.cols + x;
                    }

                    if (srcVal > mymax)
                    {
                        mymax = srcVal;
                        mymaxloc = y * src.cols + x;
                    }
                }
            }
        }

        reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
                                 smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
                                 tid,
                                 thrust::make_tuple(less<work_type>(), greater<work_type>()));

        if (tid == 0)
        {
            minval[bid] = (T) mymin;
            maxval[bid] = (T) mymax;
            minloc[bid] = myminloc;
            maxloc[bid] = mymaxloc;
        }
    }

    template <int BLOCK_SIZE, typename T>
    __global__ void kernel_pass_2(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int count)
    {
        typedef typename MinMaxTypeTraits<T>::best_type work_type;

        __shared__ work_type sminval[BLOCK_SIZE];
        __shared__ work_type smaxval[BLOCK_SIZE];
        __shared__ unsigned int sminloc[BLOCK_SIZE];
        __shared__ unsigned int smaxloc[BLOCK_SIZE];

        unsigned int idx = ::min(threadIdx.x, count - 1);

        work_type mymin = minval[idx];
        work_type mymax = maxval[idx];
        unsigned int myminloc = minloc[idx];
        unsigned int mymaxloc = maxloc[idx];

        reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
                                 smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
                                 threadIdx.x,
                                 thrust::make_tuple(less<work_type>(), greater<work_type>()));

        if (threadIdx.x == 0)
        {
            minval[0] = (T) mymin;
            maxval[0] = (T) mymax;
            minloc[0] = myminloc;
            maxloc[0] = mymaxloc;
        }
    }
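
    // minMaxLoc is a two-pass reduction: pass 1 produces one (value, flattened index) pair
    // per block for both the minimum and the maximum, reduced with reduceKeyVal so that
    // the locations travel with their values; pass 2 runs in a single block and collapses
    // the per-block candidates into the final answer at index 0 of each buffer.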

    const int threads_x = 32;
    const int threads_y = 8;

    void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
    {
        block = dim3(threads_x, threads_y);

        grid = dim3(divUp(cols, block.x * block.y),
                    divUp(rows, block.y * block.x));

        grid.x = ::min(grid.x, block.x);
        grid.y = ::min(grid.y, block.y);
    }

    void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows)
    {
        dim3 block, grid;
        getLaunchCfg(cols, rows, block, grid);

        // value buffer: one row of minimums and one row of maximums
        b1cols = (int)(grid.x * grid.y * elem_size);
        b1rows = 2;

        // location buffer: one row of min locations and one row of max locations
        b2cols = grid.x * grid.y * sizeof(int);
        b2rows = 2;
    }

    template <typename T>
    void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf)
    {
        dim3 block, grid;
        getLaunchCfg(src.cols, src.rows, block, grid);

        const int twidth = divUp(divUp(src.cols, grid.x), block.x);
        const int theight = divUp(divUp(src.rows, grid.y), block.y);

        T* minval_buf = (T*) valbuf.ptr(0);
        T* maxval_buf = (T*) valbuf.ptr(1);
        unsigned int* minloc_buf = locbuf.ptr(0);
        unsigned int* maxloc_buf = locbuf.ptr(1);

        if (mask.data)
            kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
        else
            kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);

        cudaSafeCall( cudaGetLastError() );

        kernel_pass_2<threads_x * threads_y><<<1, threads_x * threads_y>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );

        T minval_, maxval_;
        cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
        *minval = minval_;
        *maxval = maxval_;

        unsigned int minloc_, maxloc_;
        cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
        minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
        maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
    }

    template void run<unsigned char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<signed char   >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<unsigned short>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<short         >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<int           >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<float         >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<double        >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
}

/////////////////////////////////////////////////////////////
// countNonZero

namespace countNonZero
{
    __device__ unsigned int blocks_finished = 0;

    template <int BLOCK_SIZE, typename T>
    __global__ void kernel(const PtrStepSz<T> src, unsigned int* count, const int twidth, const int theight)
    {
        __shared__ unsigned int scount[BLOCK_SIZE];

        const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        unsigned int mycount = 0;

        for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
        {
            const T* ptr = src.ptr(y);

            for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
            {
                const T srcVal = ptr[x];

                mycount += (srcVal != 0);
            }
        }

        device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());

    #if __CUDA_ARCH__ >= 200
        if (tid == 0)
            ::atomicAdd(count, mycount);
    #else
        __shared__ bool is_last;
        const int bid = blockIdx.y * gridDim.x + blockIdx.x;

        if (tid == 0)
        {
            count[bid] = mycount;

            __threadfence();

            unsigned int ticket = ::atomicInc(&blocks_finished, gridDim.x * gridDim.y);
            is_last = (ticket == gridDim.x * gridDim.y - 1);
        }

        __syncthreads();

        if (is_last)
        {
            mycount = tid < gridDim.x * gridDim.y ? count[tid] : 0;

            device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());

            if (tid == 0)
            {
                count[0] = mycount;

                blocks_finished = 0;
            }
        }
    #endif
    }
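
    // countNonZero follows the same pattern as sum: per-thread counts, a block-level
    // reduction into scount, then either an atomicAdd into count[0] (Fermi and newer)
    // or the last-block ticket scheme on older devices.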

    const int threads_x = 32;
    const int threads_y = 8;

    void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
    {
        block = dim3(threads_x, threads_y);

        grid = dim3(divUp(cols, block.x * block.y),
                    divUp(rows, block.y * block.x));

        grid.x = ::min(grid.x, block.x);
        grid.y = ::min(grid.y, block.y);
    }

    void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
    {
        dim3 block, grid;
        getLaunchCfg(cols, rows, block, grid);

        bufcols = grid.x * grid.y * sizeof(int);
        bufrows = 1;
    }

    template <typename T>
    int run(const PtrStepSzb src, PtrStep<unsigned int> buf)
    {
        dim3 block, grid;
        getLaunchCfg(src.cols, src.rows, block, grid);

        const int twidth = divUp(divUp(src.cols, grid.x), block.x);
        const int theight = divUp(divUp(src.rows, grid.y), block.y);

        unsigned int* count_buf = buf.ptr(0);

        cudaSafeCall( cudaMemset(count_buf, 0, sizeof(unsigned int)) );

        kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, count_buf, twidth, theight);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );

        unsigned int count;
        cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost));

        return count;
    }

    template int run<uchar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
    template int run<schar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
    template int run<ushort>(const PtrStepSzb src, PtrStep<unsigned int> buf);
    template int run<short >(const PtrStepSzb src, PtrStep<unsigned int> buf);
    template int run<int   >(const PtrStepSzb src, PtrStep<unsigned int> buf);
    template int run<float >(const PtrStepSzb src, PtrStep<unsigned int> buf);
    template int run<double>(const PtrStepSzb src, PtrStep<unsigned int> buf);
}

//////////////////////////////////////////////////////////////////////////////
// reduce

namespace reduce
{
    struct Sum
    {
        template <typename T>
        __device__ __forceinline__ T startValue() const
        {
            return VecTraits<T>::all(0);
        }

        template <typename T>
        __device__ __forceinline__ T operator ()(T a, T b) const
        {
            return a + b;
        }

        template <typename T>
        __device__ __forceinline__ T result(T r, double) const
        {
            return r;
        }

        __device__ __forceinline__ Sum() {}
        __device__ __forceinline__ Sum(const Sum&) {}
    };

    struct Avg
    {
        template <typename T>
        __device__ __forceinline__ T startValue() const
        {
            return VecTraits<T>::all(0);
        }

        template <typename T>
        __device__ __forceinline__ T operator ()(T a, T b) const
        {
            return a + b;
        }

        template <typename T>
        __device__ __forceinline__ typename TypeVec<double, VecTraits<T>::cn>::vec_type result(T r, double sz) const
        {
            return r / sz;
        }

        __device__ __forceinline__ Avg() {}
        __device__ __forceinline__ Avg(const Avg&) {}
    };

    struct Min
    {
        template <typename T>
        __device__ __forceinline__ T startValue() const
        {
            return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
        }

        template <typename T>
        __device__ __forceinline__ T operator ()(T a, T b) const
        {
            minimum<T> minOp;
            return minOp(a, b);
        }

        template <typename T>
        __device__ __forceinline__ T result(T r, double) const
        {
            return r;
        }

        __device__ __forceinline__ Min() {}
        __device__ __forceinline__ Min(const Min&) {}
    };

    struct Max
    {
        template <typename T>
        __device__ __forceinline__ T startValue() const
        {
            return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
        }

        template <typename T>
        __device__ __forceinline__ T operator ()(T a, T b) const
        {
            maximum<T> maxOp;
            return maxOp(a, b);
        }

        template <typename T>
        __device__ __forceinline__ T result(T r, double) const
        {
            return r;
        }

        __device__ __forceinline__ Max() {}
        __device__ __forceinline__ Max(const Max&) {}
    };
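
    // Each reduction operator exposes the same small interface: startValue<T>() gives the
    // identity element, operator() folds two partial results, and result() applies the
    // final post-processing step (only Avg uses it, dividing by the number of reduced elements).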

    ///////////////////////////////////////////////////////////

    template <typename T, typename S, typename D, class Op>
    __global__ void rowsKernel(const PtrStepSz<T> src, D* dst, const Op op)
    {
        __shared__ S smem[16 * 16];

        const int x = blockIdx.x * 16 + threadIdx.x;

        S myVal = op.template startValue<S>();

        if (x < src.cols)
        {
            for (int y = threadIdx.y; y < src.rows; y += 16)
            {
                S srcVal = src(y, x);
                myVal = op(myVal, srcVal);
            }
        }

        smem[threadIdx.x * 16 + threadIdx.y] = myVal;

        __syncthreads();

        volatile S* srow = smem + threadIdx.y * 16;

        myVal = srow[threadIdx.x];
        device::reduce<16>(srow, myVal, threadIdx.x, op);

        if (threadIdx.x == 0)
            srow[0] = myVal;

        __syncthreads();

        if (threadIdx.y == 0 && x < src.cols)
            dst[x] = (D) op.result(smem[threadIdx.x * 16], src.rows);
    }
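
    // rowsKernel reduces the matrix to a single row (one output value per column): every
    // thread of a 16x16 block accumulates down one column, the partial results are staged
    // transposed in shared memory, and each 16-element shared-memory row is then reduced
    // so that one thread per column writes the final value.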

    template <typename T, typename S, typename D, class Op>
    void rowsCaller(PtrStepSz<T> src, D* dst, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(src.cols, block.x));

        Op op;
        rowsKernel<T, S, D, Op><<<grid, block, 0, stream>>>(src, dst, op);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, typename S, typename D>
    void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream)
    {
        typedef void (*func_t)(PtrStepSz<T> src, D* dst, cudaStream_t stream);
        static const func_t funcs[] =
        {
            rowsCaller<T, S, D, Sum>,
            rowsCaller<T, S, D, Avg>,
            rowsCaller<T, S, D, Max>,
            rowsCaller<T, S, D, Min>
        };

        funcs[op]((PtrStepSz<T>) src, (D*) dst, stream);
    }
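
    // The op index follows the CV_REDUCE_* constants: funcs[] is ordered Sum, Avg, Max, Min,
    // matching CV_REDUCE_SUM (0), CV_REDUCE_AVG (1), CV_REDUCE_MAX (2) and CV_REDUCE_MIN (3).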

    template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned char, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned char, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<short, int, short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<int, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<int, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<int, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<float, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<double, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    ///////////////////////////////////////////////////////////

    template <int BLOCK_SIZE, typename T, typename S, typename D, int cn, class Op>
    __global__ void colsKernel(const PtrStepSz<typename TypeVec<T, cn>::vec_type> src, typename TypeVec<D, cn>::vec_type* dst, const Op op)
    {
        typedef typename TypeVec<T, cn>::vec_type src_type;
        typedef typename TypeVec<S, cn>::vec_type work_type;
        typedef typename TypeVec<D, cn>::vec_type dst_type;

        __shared__ S smem[BLOCK_SIZE * cn];

        const int y = blockIdx.x;

        const src_type* srcRow = src.ptr(y);

        work_type myVal = op.template startValue<work_type>();

        for (int x = threadIdx.x; x < src.cols; x += BLOCK_SIZE)
            myVal = op(myVal, saturate_cast<work_type>(srcRow[x]));

        device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(myVal), threadIdx.x, detail::Unroll<cn>::op(op));

        if (threadIdx.x == 0)
            dst[y] = saturate_cast<dst_type>(op.result(myVal, src.cols));
    }

    template <typename T, typename S, typename D, int cn, class Op> void colsCaller(PtrStepSzb src, void* dst, cudaStream_t stream)
    {
        const int BLOCK_SIZE = 256;

        const dim3 block(BLOCK_SIZE);
        const dim3 grid(src.rows);

        Op op;
        colsKernel<BLOCK_SIZE, T, S, D, cn, Op><<<grid, block, 0, stream>>>((PtrStepSz<typename TypeVec<T, cn>::vec_type>) src, (typename TypeVec<D, cn>::vec_type*) dst, op);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, typename S, typename D> void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream)
    {
        typedef void (*func_t)(PtrStepSzb src, void* dst, cudaStream_t stream);
        static const func_t funcs[5][4] =
        {
            {0, 0, 0, 0},
            {colsCaller<T, S, D, 1, Sum>, colsCaller<T, S, D, 1, Avg>, colsCaller<T, S, D, 1, Max>, colsCaller<T, S, D, 1, Min>},
            {colsCaller<T, S, D, 2, Sum>, colsCaller<T, S, D, 2, Avg>, colsCaller<T, S, D, 2, Max>, colsCaller<T, S, D, 2, Min>},
            {colsCaller<T, S, D, 3, Sum>, colsCaller<T, S, D, 3, Avg>, colsCaller<T, S, D, 3, Max>, colsCaller<T, S, D, 3, Min>},
            {colsCaller<T, S, D, 4, Sum>, colsCaller<T, S, D, 4, Avg>, colsCaller<T, S, D, 4, Max>, colsCaller<T, S, D, 4, Min>},
        };

        funcs[cn][op](src, dst, stream);
    }
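
    // funcs is indexed as funcs[cn][op]; row 0 is unused (zero-filled) so the channel
    // count 1..4 can be used directly as the first index.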

    template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned char, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned char, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<short, int, short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<int, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<int, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<int, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<float, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<double, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */