modules/gpu/src/cuda/matrix_reductions.cu

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #if !defined CUDA_DISABLER
  44
  45 #include "opencv2/gpu/device/common.hpp"
  46 #include "opencv2/gpu/device/limits.hpp"
  47 #include "opencv2/gpu/device/saturate_cast.hpp"
  48 #include "opencv2/gpu/device/vec_traits.hpp"
  49 #include "opencv2/gpu/device/vec_math.hpp"
  50 #include "opencv2/gpu/device/reduce.hpp"
  51 #include "opencv2/gpu/device/functional.hpp"
  52 #include "opencv2/gpu/device/utility.hpp"
  53 #include "opencv2/gpu/device/type_traits.hpp"
  54
  55 using namespace cv::gpu;
  56 using namespace cv::gpu::device;
  57
  58 namespace detail
  59 {
  60     __device__ __forceinline__ int cvAtomicAdd(int* address, int val)
  61     {
  62         return ::atomicAdd(address, val);
  63     }
  64     __device__ __forceinline__ unsigned int cvAtomicAdd(unsigned int* address, unsigned int val)
  65     {
  66         return ::atomicAdd(address, val);
  67     }
  68     __device__ __forceinline__ float cvAtomicAdd(float* address, float val)
  69     {
  70     #if __CUDA_ARCH__ >= 200
  71         return ::atomicAdd(address, val);
  72     #else
  73         int* address_as_i = (int*) address;
  74         int old = *address_as_i, assumed;
  75         do {
  76             assumed = old;
  77             old = ::atomicCAS(address_as_i, assumed,
  78                 __float_as_int(val + __int_as_float(assumed)));
  79         } while (assumed != old);
  80         return __int_as_float(old);
  81     #endif
  82     }
  83     __device__ __forceinline__ double cvAtomicAdd(double* address, double val)
  84     {
  85     #if __CUDA_ARCH__ >= 130
  86         unsigned long long int* address_as_ull = (unsigned long long int*) address;
  87         unsigned long long int old = *address_as_ull, assumed;
  88         do {
  89             assumed = old;
  90             old = ::atomicCAS(address_as_ull, assumed,
  91                 __double_as_longlong(val + __longlong_as_double(assumed)));
  92         } while (assumed != old);
  93         return __longlong_as_double(old);
  94     #else
  95         (void) address;
  96         (void) val;
  97         return 0.0;
  98     #endif
  99     }
 100
 101     __device__ __forceinline__ int cvAtomicMin(int* address, int val)
 102     {
 103         return ::atomicMin(address, val);
 104     }
 105     __device__ __forceinline__ float cvAtomicMin(float* address, float val)
 106     {
 107     #if __CUDA_ARCH__ >= 120
 108         int* address_as_i = (int*) address;
 109         int old = *address_as_i, assumed;
 110         do {
 111             assumed = old;
 112             old = ::atomicCAS(address_as_i, assumed,
 113                 __float_as_int(::fminf(val, __int_as_float(assumed))));
 114         } while (assumed != old);
 115         return __int_as_float(old);
 116     #else
 117         (void) address;
 118         (void) val;
 119         return 0.0f;
 120     #endif
 121     }
 122     __device__ __forceinline__ double cvAtomicMin(double* address, double val)
 123     {
 124     #if __CUDA_ARCH__ >= 130
 125         unsigned long long int* address_as_ull = (unsigned long long int*) address;
 126         unsigned long long int old = *address_as_ull, assumed;
 127         do {
 128             assumed = old;
 129             old = ::atomicCAS(address_as_ull, assumed,
 130                 __double_as_longlong(::fmin(val, __longlong_as_double(assumed))));
 131         } while (assumed != old);
 132         return __longlong_as_double(old);
 133     #else
 134         (void) address;
 135         (void) val;
 136         return 0.0;
 137     #endif
 138     }
 139
 140     __device__ __forceinline__ int cvAtomicMax(int* address, int val)
 141     {
 142         return ::atomicMax(address, val);
 143     }
 144     __device__ __forceinline__ float cvAtomicMax(float* address, float val)
 145     {
 146     #if __CUDA_ARCH__ >= 120
 147         int* address_as_i = (int*) address;
 148         int old = *address_as_i, assumed;
 149         do {
 150             assumed = old;
 151             old = ::atomicCAS(address_as_i, assumed,
 152                 __float_as_int(::fmaxf(val, __int_as_float(assumed))));
 153         } while (assumed != old);
 154         return __int_as_float(old);
 155     #else
 156         (void) address;
 157         (void) val;
 158         return 0.0f;
 159     #endif
 160     }
 161     __device__ __forceinline__ double cvAtomicMax(double* address, double val)
 162     {
 163     #if __CUDA_ARCH__ >= 130
 164         unsigned long long int* address_as_ull = (unsigned long long int*) address;
 165         unsigned long long int old = *address_as_ull, assumed;
 166         do {
 167             assumed = old;
 168             old = ::atomicCAS(address_as_ull, assumed,
 169                 __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
 170         } while (assumed != old);
 171         return __longlong_as_double(old);
 172     #else
 173         (void) address;
 174         (void) val;
 175         return 0.0;
 176     #endif
 177     }
 178 }
 179
 180 namespace detail
 181 {
 182     template <int cn> struct Unroll;
 183     template <> struct Unroll<1>
 184     {
 185         template <int BLOCK_SIZE, typename R>
 186         static __device__ __forceinline__ volatile R* smem_tuple(R* smem)
 187         {
 188             return smem;
 189         }
 190
 191         template <typename R>
 192         static __device__ __forceinline__ R& tie(R& val)
 193         {
 194             return val;
 195         }
 196
 197         template <class Op>
 198         static __device__ __forceinline__ const Op& op(const Op& op)
 199         {
 200             return op;
 201         }
 202     };
 203     template <> struct Unroll<2>
 204     {
 205         template <int BLOCK_SIZE, typename R>
 206         static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*> smem_tuple(R* smem)
 207         {
 208             return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE);
 209         }
 210
 211         template <typename R>
 212         static __device__ __forceinline__ thrust::tuple<typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&> tie(R& val)
 213         {
 214             return thrust::tie(val.x, val.y);
 215         }
 216
 217         template <class Op>
 218         static __device__ __forceinline__ const thrust::tuple<Op, Op> op(const Op& op)
 219         {
 220             return thrust::make_tuple(op, op);
 221         }
 222     };
 223     template <> struct Unroll<3>
 224     {
 225         template <int BLOCK_SIZE, typename R>
 226         static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
 227         {
 228             return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
 229         }
 230
 231         template <typename R>
 232         static __device__ __forceinline__ thrust::tuple<typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&> tie(R& val)
 233         {
 234             return thrust::tie(val.x, val.y, val.z);
 235         }
 236
 237         template <class Op>
 238         static __device__ __forceinline__ const thrust::tuple<Op, Op, Op> op(const Op& op)
 239         {
 240             return thrust::make_tuple(op, op, op);
 241         }
 242     };
 243     template <> struct Unroll<4>
 244     {
 245         template <int BLOCK_SIZE, typename R>
 246         static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
 247         {
 248             return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
 249         }
 250
 251         template <typename R>
 252         static __device__ __forceinline__ thrust::tuple<typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&> tie(R& val)
 253         {
 254             return thrust::tie(val.x, val.y, val.z, val.w);
 255         }
 256
 257         template <class Op>
 258         static __device__ __forceinline__ const thrust::tuple<Op, Op, Op, Op> op(const Op& op)
 259         {
 260             return thrust::make_tuple(op, op, op, op);
 261         }
 262     };
 263 }
 264
 265 /////////////////////////////////////////////////////////////
 266 // sum
 267
 268 namespace sum
 269 {
 270     __device__ unsigned int blocks_finished = 0;
 271
 272     template <typename R, int cn> struct AtomicAdd;
 273     template <typename R> struct AtomicAdd<R, 1>
 274     {
 275         static __device__ void run(R* ptr, R val)
 276         {
 277             detail::cvAtomicAdd(ptr, val);
 278         }
 279     };
 280     template <typename R> struct AtomicAdd<R, 2>
 281     {
 282         typedef typename TypeVec<R, 2>::vec_type val_type;
 283
 284         static __device__ void run(R* ptr, val_type val)
 285         {
 286             detail::cvAtomicAdd(ptr, val.x);
 287             detail::cvAtomicAdd(ptr + 1, val.y);
 288         }
 289     };
 290     template <typename R> struct AtomicAdd<R, 3>
 291     {
 292         typedef typename TypeVec<R, 3>::vec_type val_type;
 293
 294         static __device__ void run(R* ptr, val_type val)
 295         {
 296             detail::cvAtomicAdd(ptr, val.x);
 297             detail::cvAtomicAdd(ptr + 1, val.y);
 298             detail::cvAtomicAdd(ptr + 2, val.z);
 299         }
 300     };
 301     template <typename R> struct AtomicAdd<R, 4>
 302     {
 303         typedef typename TypeVec<R, 4>::vec_type val_type;
 304
 305         static __device__ void run(R* ptr, val_type val)
 306         {
 307             detail::cvAtomicAdd(ptr, val.x);
 308             detail::cvAtomicAdd(ptr + 1, val.y);
 309             detail::cvAtomicAdd(ptr + 2, val.z);
 310             detail::cvAtomicAdd(ptr + 3, val.w);
 311         }
 312     };
 313
 314     template <int BLOCK_SIZE, typename R, int cn>
 315     struct GlobalReduce
 316     {
 317         typedef typename TypeVec<R, cn>::vec_type result_type;
 318
 319         static __device__ void run(result_type& sum, result_type* result, int tid, int bid, R* smem)
 320         {
 321         #if __CUDA_ARCH__ >= 200
 322             if (tid == 0)
 323                 AtomicAdd<R, cn>::run((R*) result, sum);
 324         #else
 325             __shared__ bool is_last;
 326
 327             if (tid == 0)
 328             {
 329                 result[bid] = sum;
 330
 331                 __threadfence();
 332
 333                 unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
 334                 is_last = (ticket == gridDim.x * gridDim.y - 1);
 335             }
 336
 337             __syncthreads();
 338
 339             if (is_last)
 340             {
 341                 sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<result_type>::all(0);
 342
 343                 device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
 344
 345                 if (tid == 0)
 346                 {
 347                     result[0] = sum;
 348                     blocks_finished = 0;
 349                 }
 350             }
 351         #endif
 352         }
 353     };
 354
 355     template <int BLOCK_SIZE, typename src_type, typename result_type, class Op>
 356     __global__ void kernel(const PtrStepSz<src_type> src, result_type* result, const Op op, const int twidth, const int theight)
 357     {
 358         typedef typename VecTraits<src_type>::elem_type T;
 359         typedef typename VecTraits<result_type>::elem_type R;
 360         const int cn = VecTraits<src_type>::cn;
 361
 362         __shared__ R smem[BLOCK_SIZE * cn];
 363
 364         const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
 365         const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
 366
 367         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
 368         const int bid = blockIdx.y * gridDim.x + blockIdx.x;
 369
 370         result_type sum = VecTraits<result_type>::all(0);
 371
 372         for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
 373         {
 374             const src_type* ptr = src.ptr(y);
 375
 376             for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
 377             {
 378                 const src_type srcVal = ptr[x];
 379
 380                 sum = sum + op(saturate_cast<result_type>(srcVal));
 381             }
 382         }
 383
 384         device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
 385
 386         GlobalReduce<BLOCK_SIZE, R, cn>::run(sum, result, tid, bid, smem);
 387     }
 388
 389     const int threads_x = 32;
 390     const int threads_y = 8;
 391
 392     void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
 393     {
 394         block = dim3(threads_x, threads_y);
 395
 396         grid = dim3(divUp(cols, block.x * block.y),
 397                     divUp(rows, block.y * block.x));
 398
 399         grid.x = ::min(grid.x, block.x);
 400         grid.y = ::min(grid.y, block.y);
 401     }
 402
 403     void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows)
 404     {
 405         dim3 block, grid;
 406         getLaunchCfg(cols, rows, block, grid);
 407
 408         bufcols = grid.x * grid.y * sizeof(double) * cn;
 409         bufrows = 1;
 410     }
 411
 412     template <typename T, typename R, int cn, template <typename> class Op>
 413     void caller(PtrStepSzb src_, void* buf_, double* out)
 414     {
 415         typedef typename TypeVec<T, cn>::vec_type src_type;
 416         typedef typename TypeVec<R, cn>::vec_type result_type;
 417
 418         PtrStepSz<src_type> src(src_);
 419         result_type* buf = (result_type*) buf_;
 420
 421         dim3 block, grid;
 422         getLaunchCfg(src.cols, src.rows, block, grid);
 423
 424         const int twidth = divUp(divUp(src.cols, grid.x), block.x);
 425         const int theight = divUp(divUp(src.rows, grid.y), block.y);
 426
 427         Op<result_type> op;
 428
 429         kernel<threads_x * threads_y><<<grid, block>>>(src, buf, op, twidth, theight);
 430         cudaSafeCall( cudaGetLastError() );
 431
 432         cudaSafeCall( cudaDeviceSynchronize() );
 433
 434         R result[4] = {0, 0, 0, 0};
 435         cudaSafeCall( cudaMemcpy(&result, buf, sizeof(result_type), cudaMemcpyDeviceToHost) );
 436
 437         out[0] = result[0];
 438         out[1] = result[1];
 439         out[2] = result[2];
 440         out[3] = result[3];
 441     }
 442
 443     template <typename T> struct SumType;
 444     template <> struct SumType<uchar> { typedef unsigned int R; };
 445     template <> struct SumType<schar> { typedef int R; };
 446     template <> struct SumType<ushort> { typedef unsigned int R; };
 447     template <> struct SumType<short> { typedef int R; };
 448     template <> struct SumType<int> { typedef int R; };
 449     template <> struct SumType<float> { typedef float R; };
 450     template <> struct SumType<double> { typedef double R; };
 451
 452     template <typename T, int cn>
 453     void run(PtrStepSzb src, void* buf, double* out)
 454     {
 455         typedef typename SumType<T>::R R;
 456         caller<T, R, cn, identity>(src, buf, out);
 457     }
 458
 459     template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out);
 460     template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out);
 461     template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out);
 462     template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out);
 463
 464     template void run<schar, 1>(PtrStepSzb src, void* buf, double* out);
 465     template void run<schar, 2>(PtrStepSzb src, void* buf, double* out);
 466     template void run<schar, 3>(PtrStepSzb src, void* buf, double* out);
 467     template void run<schar, 4>(PtrStepSzb src, void* buf, double* out);
 468
 469     template void run<ushort, 1>(PtrStepSzb src, void* buf, double* out);
 470     template void run<ushort, 2>(PtrStepSzb src, void* buf, double* out);
 471     template void run<ushort, 3>(PtrStepSzb src, void* buf, double* out);
 472     template void run<ushort, 4>(PtrStepSzb src, void* buf, double* out);
 473
 474     template void run<short, 1>(PtrStepSzb src, void* buf, double* out);
 475     template void run<short, 2>(PtrStepSzb src, void* buf, double* out);
 476     template void run<short, 3>(PtrStepSzb src, void* buf, double* out);
 477     template void run<short, 4>(PtrStepSzb src, void* buf, double* out);
 478
 479     template void run<int, 1>(PtrStepSzb src, void* buf, double* out);
 480     template void run<int, 2>(PtrStepSzb src, void* buf, double* out);
 481     template void run<int, 3>(PtrStepSzb src, void* buf, double* out);
 482     template void run<int, 4>(PtrStepSzb src, void* buf, double* out);
 483
 484     template void run<float, 1>(PtrStepSzb src, void* buf, double* out);
 485     template void run<float, 2>(PtrStepSzb src, void* buf, double* out);
 486     template void run<float, 3>(PtrStepSzb src, void* buf, double* out);
 487     template void run<float, 4>(PtrStepSzb src, void* buf, double* out);
 488
 489     template void run<double, 1>(PtrStepSzb src, void* buf, double* out);
 490     template void run<double, 2>(PtrStepSzb src, void* buf, double* out);
 491     template void run<double, 3>(PtrStepSzb src, void* buf, double* out);
 492     template void run<double, 4>(PtrStepSzb src, void* buf, double* out);
 493
 494     template <typename T, int cn>
 495     void runAbs(PtrStepSzb src, void* buf, double* out)
 496     {
 497         typedef typename SumType<T>::R R;
 498         caller<T, R, cn, abs_func>(src, buf, out);
 499     }
 500
 501     template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out);
 502     template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out);
 503     template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out);
 504     template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out);
 505
 506     template void runAbs<schar, 1>(PtrStepSzb src, void* buf, double* out);
 507     template void runAbs<schar, 2>(PtrStepSzb src, void* buf, double* out);
 508     template void runAbs<schar, 3>(PtrStepSzb src, void* buf, double* out);
 509     template void runAbs<schar, 4>(PtrStepSzb src, void* buf, double* out);
 510
 511     template void runAbs<ushort, 1>(PtrStepSzb src, void* buf, double* out);
 512     template void runAbs<ushort, 2>(PtrStepSzb src, void* buf, double* out);
 513     template void runAbs<ushort, 3>(PtrStepSzb src, void* buf, double* out);
 514     template void runAbs<ushort, 4>(PtrStepSzb src, void* buf, double* out);
 515
 516     template void runAbs<short, 1>(PtrStepSzb src, void* buf, double* out);
 517     template void runAbs<short, 2>(PtrStepSzb src, void* buf, double* out);
 518     template void runAbs<short, 3>(PtrStepSzb src, void* buf, double* out);
 519     template void runAbs<short, 4>(PtrStepSzb src, void* buf, double* out);
 520
 521     template void runAbs<int, 1>(PtrStepSzb src, void* buf, double* out);
 522     template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out);
 523     template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out);
 524     template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out);
 525
 526     template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out);
 527     template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out);
 528     template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out);
 529     template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out);
 530
 531     template void runAbs<double, 1>(PtrStepSzb src, void* buf, double* out);
 532     template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out);
 533     template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out);
 534     template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out);
 535
 536     template <typename T> struct Sqr : unary_function<T, T>
 537     {
 538         __device__ __forceinline__ T operator ()(T x) const
 539         {
 540             return x * x;
 541         }
 542     };
 543
 544     template <typename T, int cn>
 545     void runSqr(PtrStepSzb src, void* buf, double* out)
 546     {
 547         caller<T, double, cn, Sqr>(src, buf, out);
 548     }
 549
 550     template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out);
 551     template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out);
 552     template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out);
 553     template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out);
 554
 555     template void runSqr<schar, 1>(PtrStepSzb src, void* buf, double* out);
 556     template void runSqr<schar, 2>(PtrStepSzb src, void* buf, double* out);
 557     template void runSqr<schar, 3>(PtrStepSzb src, void* buf, double* out);
 558     template void runSqr<schar, 4>(PtrStepSzb src, void* buf, double* out);
 559
 560     template void runSqr<ushort, 1>(PtrStepSzb src, void* buf, double* out);
 561     template void runSqr<ushort, 2>(PtrStepSzb src, void* buf, double* out);
 562     template void runSqr<ushort, 3>(PtrStepSzb src, void* buf, double* out);
 563     template void runSqr<ushort, 4>(PtrStepSzb src, void* buf, double* out);
 564
 565     template void runSqr<short, 1>(PtrStepSzb src, void* buf, double* out);
 566     template void runSqr<short, 2>(PtrStepSzb src, void* buf, double* out);
 567     template void runSqr<short, 3>(PtrStepSzb src, void* buf, double* out);
 568     template void runSqr<short, 4>(PtrStepSzb src, void* buf, double* out);
 569
 570     template void runSqr<int, 1>(PtrStepSzb src, void* buf, double* out);
 571     template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out);
 572     template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out);
 573     template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out);
 574
 575     template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out);
 576     template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out);
 577     template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out);
 578     template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out);
 579
 580     template void runSqr<double, 1>(PtrStepSzb src, void* buf, double* out);
 581     template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out);
 582     template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out);
 583     template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out);
 584 }
 585
 586 /////////////////////////////////////////////////////////////
 587 // minMax
 588
 589 namespace minMax
 590 {
 591     __device__ unsigned int blocks_finished = 0;
 592
 593     // To avoid shared bank conflicts we convert each value into value of
 594     // appropriate type (32 bits minimum)
 595     template <typename T> struct MinMaxTypeTraits;
 596     template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
 597     template <> struct MinMaxTypeTraits<schar> { typedef int best_type; };
 598     template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
 599     template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
 600     template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
 601     template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
 602     template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
 603
 604     template <int BLOCK_SIZE, typename R>
 605     struct GlobalReduce
 606     {
 607         static __device__ void run(R& mymin, R& mymax, R* minval, R* maxval, int tid, int bid, R* sminval, R* smaxval)
 608         {
 609         #if __CUDA_ARCH__ >= 200
 610             if (tid == 0)
 611             {
 612                 detail::cvAtomicMin(minval, mymin);
 613                 detail::cvAtomicMax(maxval, mymax);
 614             }
 615         #else
 616             __shared__ bool is_last;
 617
 618             if (tid == 0)
 619             {
 620                 minval[bid] = mymin;
 621                 maxval[bid] = mymax;
 622
 623                 __threadfence();
 624
 625                 unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
 626                 is_last = (ticket == gridDim.x * gridDim.y - 1);
 627             }
 628
 629             __syncthreads();
 630
 631             if (is_last)
 632             {
 633                 int idx = ::min(tid, gridDim.x * gridDim.y - 1);
 634
 635                 mymin = minval[idx];
 636                 mymax = maxval[idx];
 637
 638                 const minimum<R> minOp;
 639                 const maximum<R> maxOp;
 640                 device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
 641
 642                 if (tid == 0)
 643                 {
 644                     minval[0] = mymin;
 645                     maxval[0] = mymax;
 646
 647                     blocks_finished = 0;
 648                 }
 649             }
 650         #endif
 651         }
 652     };
 653
 654     template <int BLOCK_SIZE, typename T, typename R, class Mask>
 655     __global__ void kernel(const PtrStepSz<T> src, const Mask mask, R* minval, R* maxval, const int twidth, const int theight)
 656     {
 657         __shared__ R sminval[BLOCK_SIZE];
 658         __shared__ R smaxval[BLOCK_SIZE];
 659
 660         const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
 661         const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
 662
 663         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
 664         const int bid = blockIdx.y * gridDim.x + blockIdx.x;
 665
 666         R mymin = numeric_limits<R>::max();
 667         R mymax = -numeric_limits<R>::max();
 668
 669         const minimum<R> minOp;
 670         const maximum<R> maxOp;
 671
 672         for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
 673         {
 674             const T* ptr = src.ptr(y);
 675
 676             for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
 677             {
 678                 if (mask(y, x))
 679                 {
 680                     const R srcVal = ptr[x];
 681
 682                     mymin = minOp(mymin, srcVal);
 683                     mymax = maxOp(mymax, srcVal);
 684                 }
 685             }
 686         }
 687
 688         device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
 689
 690         GlobalReduce<BLOCK_SIZE, R>::run(mymin, mymax, minval, maxval, tid, bid, sminval, smaxval);
 691     }
 692
 693     const int threads_x = 32;
 694     const int threads_y = 8;
 695
 696     void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
 697     {
 698         block = dim3(threads_x, threads_y);
 699
 700         grid = dim3(divUp(cols, block.x * block.y),
 701                     divUp(rows, block.y * block.x));
 702
 703         grid.x = ::min(grid.x, block.x);
 704         grid.y = ::min(grid.y, block.y);
 705     }
 706
 707     void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
 708     {
 709         dim3 block, grid;
 710         getLaunchCfg(cols, rows, block, grid);
 711
 712         bufcols = grid.x * grid.y * sizeof(double);
 713         bufrows = 2;
 714     }
 715
 716     __global__ void setDefaultKernel(int* minval_buf, int* maxval_buf)
 717     {
 718         *minval_buf = numeric_limits<int>::max();
 719         *maxval_buf = numeric_limits<int>::min();
 720     }
 721     __global__ void setDefaultKernel(float* minval_buf, float* maxval_buf)
 722     {
 723         *minval_buf = numeric_limits<float>::max();
 724         *maxval_buf = -numeric_limits<float>::max();
 725     }
 726     __global__ void setDefaultKernel(double* minval_buf, double* maxval_buf)
 727     {
 728         *minval_buf = numeric_limits<double>::max();
 729         *maxval_buf = -numeric_limits<double>::max();
 730     }
 731
 732     template <typename R>
 733     void setDefault(R* minval_buf, R* maxval_buf)
 734     {
 735         setDefaultKernel<<<1, 1>>>(minval_buf, maxval_buf);
 736     }
 737
 738     template <typename T>
 739     void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
 740     {
 741         typedef typename MinMaxTypeTraits<T>::best_type R;
 742
 743         dim3 block, grid;
 744         getLaunchCfg(src.cols, src.rows, block, grid);
 745
 746         const int twidth = divUp(divUp(src.cols, grid.x), block.x);
 747         const int theight = divUp(divUp(src.rows, grid.y), block.y);
 748
 749         R* minval_buf = (R*) buf.ptr(0);
 750         R* maxval_buf = (R*) buf.ptr(1);
 751
 752         setDefault(minval_buf, maxval_buf);
 753
 754         if (mask.data)
 755             kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, twidth, theight);
 756         else
 757             kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, twidth, theight);
 758
 759         cudaSafeCall( cudaGetLastError() );
 760
 761         cudaSafeCall( cudaDeviceSynchronize() );
 762
 763         R minval_, maxval_;
 764         cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
 765         cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
 766         *minval = minval_;
 767         *maxval = maxval_;
 768     }
 769
 770     template void run<uchar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
 771     template void run<schar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
 772     template void run<ushort>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
 773     template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
 774     template void run<int   >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
 775     template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
 776     template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
 777 }
 778
 779 /////////////////////////////////////////////////////////////
 780 // minMaxLoc
 781
 782 namespace minMaxLoc
 783 {
 784     // To avoid shared bank conflicts we convert each value into value of
 785     // appropriate type (32 bits minimum)
 786     template <typename T> struct MinMaxTypeTraits;
 787     template <> struct MinMaxTypeTraits<unsigned char> { typedef int best_type; };
 788     template <> struct MinMaxTypeTraits<signed char> { typedef int best_type; };
 789     template <> struct MinMaxTypeTraits<unsigned short> { typedef int best_type; };
 790     template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
 791     template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
 792     template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
 793     template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
 794
 795     template <int BLOCK_SIZE, typename T, class Mask>
 796     __global__ void kernel_pass_1(const PtrStepSz<T> src, const Mask mask, T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, const int twidth, const int theight)
 797     {
 798         typedef typename MinMaxTypeTraits<T>::best_type work_type;
 799
 800         __shared__ work_type sminval[BLOCK_SIZE];
 801         __shared__ work_type smaxval[BLOCK_SIZE];
 802         __shared__ unsigned int sminloc[BLOCK_SIZE];
 803         __shared__ unsigned int smaxloc[BLOCK_SIZE];
 804
 805         const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
 806         const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
 807
 808         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
 809         const int bid = blockIdx.y * gridDim.x + blockIdx.x;
 810
 811         work_type mymin = numeric_limits<work_type>::max();
 812         work_type mymax = -numeric_limits<work_type>::max();
 813         unsigned int myminloc = 0;
 814         unsigned int mymaxloc = 0;
 815
 816         for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
 817         {
 818             const T* ptr = src.ptr(y);
 819
 820             for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
 821             {
 822                 if (mask(y, x))
 823                 {
 824                     const work_type srcVal = ptr[x];
 825
 826                     if (srcVal < mymin)
 827                     {
 828                         mymin = srcVal;
 829                         myminloc = y * src.cols + x;
 830                     }
 831
 832                     if (srcVal > mymax)
 833                     {
 834                         mymax = srcVal;
 835                         mymaxloc = y * src.cols + x;
 836                     }
 837                 }
 838             }
 839         }
 840
 841         reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
 842                                  smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
 843                                  tid,
 844                                  thrust::make_tuple(less<work_type>(), greater<work_type>()));
 845
 846         if (tid == 0)
 847         {
 848             minval[bid] = (T) mymin;
 849             maxval[bid] = (T) mymax;
 850             minloc[bid] = myminloc;
 851             maxloc[bid] = mymaxloc;
 852         }
 853     }
 854     template <int BLOCK_SIZE, typename T>
 855     __global__ void kernel_pass_2(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int count)
 856     {
 857         typedef typename MinMaxTypeTraits<T>::best_type work_type;
 858
 859         __shared__ work_type sminval[BLOCK_SIZE];
 860         __shared__ work_type smaxval[BLOCK_SIZE];
 861         __shared__ unsigned int sminloc[BLOCK_SIZE];
 862         __shared__ unsigned int smaxloc[BLOCK_SIZE];
 863
 864         unsigned int idx = ::min(threadIdx.x, count - 1);
 865
 866         work_type mymin = minval[idx];
 867         work_type mymax = maxval[idx];
 868         unsigned int myminloc = minloc[idx];
 869         unsigned int mymaxloc = maxloc[idx];
 870
 871         reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
 872                                  smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
 873                                  threadIdx.x,
 874                                  thrust::make_tuple(less<work_type>(), greater<work_type>()));
 875
 876         if (threadIdx.x == 0)
 877         {
 878             minval[0] = (T) mymin;
 879             maxval[0] = (T) mymax;
 880             minloc[0] = myminloc;
 881             maxloc[0] = mymaxloc;
 882         }
 883     }
 884
 885     const int threads_x = 32;
 886     const int threads_y = 8;
 887
 888     void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
 889     {
 890         block = dim3(threads_x, threads_y);
 891
 892         grid = dim3(divUp(cols, block.x * block.y),
 893                     divUp(rows, block.y * block.x));
 894
 895         grid.x = ::min(grid.x, block.x);
 896         grid.y = ::min(grid.y, block.y);
 897     }
 898
 899     void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows)
 900     {
 901         dim3 block, grid;
 902         getLaunchCfg(cols, rows, block, grid);
 903
 904         // For values
 905         b1cols = (int)(grid.x * grid.y * elem_size);
 906         b1rows = 2;
 907
 908         // For locations
 909         b2cols = grid.x * grid.y * sizeof(int);
 910         b2rows = 2;
 911     }
 912
 913     template <typename T>
 914     void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf)
 915     {
 916         dim3 block, grid;
 917         getLaunchCfg(src.cols, src.rows, block, grid);
 918
 919         const int twidth = divUp(divUp(src.cols, grid.x), block.x);
 920         const int theight = divUp(divUp(src.rows, grid.y), block.y);
 921
 922         T* minval_buf = (T*) valbuf.ptr(0);
 923         T* maxval_buf = (T*) valbuf.ptr(1);
 924         unsigned int* minloc_buf = locbuf.ptr(0);
 925         unsigned int* maxloc_buf = locbuf.ptr(1);
 926
 927         if (mask.data)
 928             kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
 929         else
 930             kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
 931
 932         cudaSafeCall( cudaGetLastError() );
 933
 934         kernel_pass_2<threads_x * threads_y><<<1, threads_x * threads_y>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
 935         cudaSafeCall( cudaGetLastError() );
 936
 937         cudaSafeCall( cudaDeviceSynchronize() );
 938
 939         T minval_, maxval_;
 940         cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
 941         cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
 942         *minval = minval_;
 943         *maxval = maxval_;
 944
 945         unsigned int minloc_, maxloc_;
 946         cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
 947         cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
 948         minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
 949         maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
 950     }
 951
 952     template void run<unsigned char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
 953     template void run<signed char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
 954     template void run<unsigned short>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
 955     template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
 956     template void run<int   >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
 957     template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
 958     template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
 959 }
 960
 961 /////////////////////////////////////////////////////////////
 962 // countNonZero
 963
 964 namespace countNonZero
 965 {
 966     __device__ unsigned int blocks_finished = 0;
 967
 968     template <int BLOCK_SIZE, typename T>
 969     __global__ void kernel(const PtrStepSz<T> src, unsigned int* count, const int twidth, const int theight)
 970     {
 971         __shared__ unsigned int scount[BLOCK_SIZE];
 972
 973         const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
 974         const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
 975
 976         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
 977
 978         unsigned int mycount = 0;
 979
 980         for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
 981         {
 982             const T* ptr = src.ptr(y);
 983
 984             for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
 985             {
 986                 const T srcVal = ptr[x];
 987
 988                 mycount += (srcVal != 0);
 989             }
 990         }
 991
 992         device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
 993
 994     #if __CUDA_ARCH__ >= 200
 995         if (tid == 0)
 996             ::atomicAdd(count, mycount);
 997     #else
 998         __shared__ bool is_last;
 999         const int bid = blockIdx.y * gridDim.x + blockIdx.x;
1000
1001         if (tid == 0)
1002         {
1003             count[bid] = mycount;
1004
1005             __threadfence();
1006
1007             unsigned int ticket = ::atomicInc(&blocks_finished, gridDim.x * gridDim.y);
1008             is_last = (ticket == gridDim.x * gridDim.y - 1);
1009         }
1010
1011         __syncthreads();
1012
1013         if (is_last)
1014         {
1015             mycount = tid < gridDim.x * gridDim.y ? count[tid] : 0;
1016
1017             device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
1018
1019             if (tid == 0)
1020             {
1021                 count[0] = mycount;
1022
1023                 blocks_finished = 0;
1024             }
1025         }
1026     #endif
1027     }
1028
1029     const int threads_x = 32;
1030     const int threads_y = 8;
1031
1032     void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
1033     {
1034         block = dim3(threads_x, threads_y);
1035
1036         grid = dim3(divUp(cols, block.x * block.y),
1037                     divUp(rows, block.y * block.x));
1038
1039         grid.x = ::min(grid.x, block.x);
1040         grid.y = ::min(grid.y, block.y);
1041     }
1042
1043     void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
1044     {
1045         dim3 block, grid;
1046         getLaunchCfg(cols, rows, block, grid);
1047
1048         bufcols = grid.x * grid.y * sizeof(int);
1049         bufrows = 1;
1050     }
1051
1052     template <typename T>
1053     int run(const PtrStepSzb src, PtrStep<unsigned int> buf)
1054     {
1055         dim3 block, grid;
1056         getLaunchCfg(src.cols, src.rows, block, grid);
1057
1058         const int twidth = divUp(divUp(src.cols, grid.x), block.x);
1059         const int theight = divUp(divUp(src.rows, grid.y), block.y);
1060
1061         unsigned int* count_buf = buf.ptr(0);
1062
1063         cudaSafeCall( cudaMemset(count_buf, 0, sizeof(unsigned int)) );
1064
1065         kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, count_buf, twidth, theight);
1066         cudaSafeCall( cudaGetLastError() );
1067
1068         cudaSafeCall( cudaDeviceSynchronize() );
1069
1070         unsigned int count;
1071         cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost));
1072
1073         return count;
1074     }
1075
1076     template int run<uchar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
1077     template int run<schar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
1078     template int run<ushort>(const PtrStepSzb src, PtrStep<unsigned int> buf);
1079     template int run<short >(const PtrStepSzb src, PtrStep<unsigned int> buf);
1080     template int run<int   >(const PtrStepSzb src, PtrStep<unsigned int> buf);
1081     template int run<float >(const PtrStepSzb src, PtrStep<unsigned int> buf);
1082     template int run<double>(const PtrStepSzb src, PtrStep<unsigned int> buf);
1083 }
1084
1085 //////////////////////////////////////////////////////////////////////////////
1086 // reduce
1087
1088 namespace reduce
1089 {
1090     struct Sum
1091     {
1092         template <typename T>
1093         __device__ __forceinline__ T startValue() const
1094         {
1095             return VecTraits<T>::all(0);
1096         }
1097
1098         template <typename T>
1099         __device__ __forceinline__ T operator ()(T a, T b) const
1100         {
1101             return a + b;
1102         }
1103
1104         template <typename T>
1105         __device__ __forceinline__ T result(T r, double) const
1106         {
1107             return r;
1108         }
1109
1110         __device__ __forceinline__ Sum() {}
1111         __device__ __forceinline__ Sum(const Sum&) {}
1112     };
1113
1114     struct Avg
1115     {
1116         template <typename T>
1117         __device__ __forceinline__ T startValue() const
1118         {
1119             return VecTraits<T>::all(0);
1120         }
1121
1122         template <typename T>
1123         __device__ __forceinline__ T operator ()(T a, T b) const
1124         {
1125             return a + b;
1126         }
1127
1128         template <typename T>
1129         __device__ __forceinline__ typename TypeVec<double, VecTraits<T>::cn>::vec_type result(T r, double sz) const
1130         {
1131             return r / sz;
1132         }
1133
1134         __device__ __forceinline__ Avg() {}
1135         __device__ __forceinline__ Avg(const Avg&) {}
1136     };
1137
1138     struct Min
1139     {
1140         template <typename T>
1141         __device__ __forceinline__ T startValue() const
1142         {
1143             return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
1144         }
1145
1146         template <typename T>
1147         __device__ __forceinline__ T operator ()(T a, T b) const
1148         {
1149             minimum<T> minOp;
1150             return minOp(a, b);
1151         }
1152
1153         template <typename T>
1154         __device__ __forceinline__ T result(T r, double) const
1155         {
1156             return r;
1157         }
1158
1159         __device__ __forceinline__ Min() {}
1160         __device__ __forceinline__ Min(const Min&) {}
1161     };
1162
1163     struct Max
1164     {
1165         template <typename T>
1166         __device__ __forceinline__ T startValue() const
1167         {
1168             return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
1169         }
1170
1171         template <typename T>
1172         __device__ __forceinline__ T operator ()(T a, T b) const
1173         {
1174             maximum<T> maxOp;
1175             return maxOp(a, b);
1176         }
1177
1178         template <typename T>
1179         __device__ __forceinline__ T result(T r, double) const
1180         {
1181             return r;
1182         }
1183
1184         __device__ __forceinline__ Max() {}
1185         __device__ __forceinline__ Max(const Max&) {}
1186     };
1187
1188     ///////////////////////////////////////////////////////////
1189
1190     template <typename T, typename S, typename D, class Op>
1191     __global__ void rowsKernel(const PtrStepSz<T> src, D* dst, const Op op)
1192     {
1193         __shared__ S smem[16 * 16];
1194
1195         const int x = blockIdx.x * 16 + threadIdx.x;
1196
1197         S myVal = op.template startValue<S>();
1198
1199         if (x < src.cols)
1200         {
1201             for (int y = threadIdx.y; y < src.rows; y += 16)
1202             {
1203                 S srcVal = src(y, x);
1204                 myVal = op(myVal, srcVal);
1205             }
1206         }
1207
1208         smem[threadIdx.x * 16 + threadIdx.y] = myVal;
1209
1210         __syncthreads();
1211
1212         volatile S* srow = smem + threadIdx.y * 16;
1213
1214         myVal = srow[threadIdx.x];
1215         device::reduce<16>(srow, myVal, threadIdx.x, op);
1216
1217         if (threadIdx.x == 0)
1218             srow[0] = myVal;
1219
1220         __syncthreads();
1221
1222         if (threadIdx.y == 0 && x < src.cols)
1223             dst[x] = (D) op.result(smem[threadIdx.x * 16], src.rows);
1224     }
1225
1226     template <typename T, typename S, typename D, class Op>
1227     void rowsCaller(PtrStepSz<T> src, D* dst, cudaStream_t stream)
1228     {
1229         const dim3 block(16, 16);
1230         const dim3 grid(divUp(src.cols, block.x));
1231
1232         Op op;
1233         rowsKernel<T, S, D, Op><<<grid, block, 0, stream>>>(src, dst, op);
1234         cudaSafeCall( cudaGetLastError() );
1235
1236         if (stream == 0)
1237             cudaSafeCall( cudaDeviceSynchronize() );
1238     }
1239
1240     template <typename T, typename S, typename D>
1241     void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream)
1242     {
1243         typedef void (*func_t)(PtrStepSz<T> src, D* dst, cudaStream_t stream);
1244         static const func_t funcs[] =
1245         {
1246             rowsCaller<T, S, D, Sum>,
1247             rowsCaller<T, S, D, Avg>,
1248             rowsCaller<T, S, D, Max>,
1249             rowsCaller<T, S, D, Min>
1250         };
1251
1252         funcs[op]((PtrStepSz<T>) src, (D*) dst, stream);
1253     }
1254
1255     template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1256     template void rows<unsigned char, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1257     template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1258     template void rows<unsigned char, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1259
1260     template void rows<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1261     template void rows<unsigned short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1262     template void rows<unsigned short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1263     template void rows<unsigned short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1264
1265     template void rows<short, int, short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1266     template void rows<short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1267     template void rows<short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1268     template void rows<short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1269
1270     template void rows<int, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1271     template void rows<int, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1272     template void rows<int, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1273
1274     template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1275     template void rows<float, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1276
1277     template void rows<double, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
1278
1279     ///////////////////////////////////////////////////////////
1280
1281     template <int BLOCK_SIZE, typename T, typename S, typename D, int cn, class Op>
1282     __global__ void colsKernel(const PtrStepSz<typename TypeVec<T, cn>::vec_type> src, typename TypeVec<D, cn>::vec_type* dst, const Op op)
1283     {
1284         typedef typename TypeVec<T, cn>::vec_type src_type;
1285         typedef typename TypeVec<S, cn>::vec_type work_type;
1286         typedef typename TypeVec<D, cn>::vec_type dst_type;
1287
1288         __shared__ S smem[BLOCK_SIZE * cn];
1289
1290         const int y = blockIdx.x;
1291
1292         const src_type* srcRow = src.ptr(y);
1293
1294         work_type myVal = op.template startValue<work_type>();
1295
1296         for (int x = threadIdx.x; x < src.cols; x += BLOCK_SIZE)
1297             myVal = op(myVal, saturate_cast<work_type>(srcRow[x]));
1298
1299         device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(myVal), threadIdx.x, detail::Unroll<cn>::op(op));
1300
1301         if (threadIdx.x == 0)
1302             dst[y] = saturate_cast<dst_type>(op.result(myVal, src.cols));
1303     }
1304
1305     template <typename T, typename S, typename D, int cn, class Op> void colsCaller(PtrStepSzb src, void* dst, cudaStream_t stream)
1306     {
1307         const int BLOCK_SIZE = 256;
1308
1309         const dim3 block(BLOCK_SIZE);
1310         const dim3 grid(src.rows);
1311
1312         Op op;
1313         colsKernel<BLOCK_SIZE, T, S, D, cn, Op><<<grid, block, 0, stream>>>((PtrStepSz<typename TypeVec<T, cn>::vec_type>) src, (typename TypeVec<D, cn>::vec_type*) dst, op);
1314         cudaSafeCall( cudaGetLastError() );
1315
1316         if (stream == 0)
1317             cudaSafeCall( cudaDeviceSynchronize() );
1318
1319     }
1320
1321     template <typename T, typename S, typename D> void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream)
1322     {
1323         typedef void (*func_t)(PtrStepSzb src, void* dst, cudaStream_t stream);
1324         static const func_t funcs[5][4] =
1325         {
1326             {0,0,0,0},
1327             {colsCaller<T, S, D, 1, Sum>, colsCaller<T, S, D, 1, Avg>, colsCaller<T, S, D, 1, Max>, colsCaller<T, S, D, 1, Min>},
1328             {colsCaller<T, S, D, 2, Sum>, colsCaller<T, S, D, 2, Avg>, colsCaller<T, S, D, 2, Max>, colsCaller<T, S, D, 2, Min>},
1329             {colsCaller<T, S, D, 3, Sum>, colsCaller<T, S, D, 3, Avg>, colsCaller<T, S, D, 3, Max>, colsCaller<T, S, D, 3, Min>},
1330             {colsCaller<T, S, D, 4, Sum>, colsCaller<T, S, D, 4, Avg>, colsCaller<T, S, D, 4, Max>, colsCaller<T, S, D, 4, Min>},
1331         };
1332
1333         funcs[cn][op](src, dst, stream);
1334     }
1335
1336     template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1337     template void cols<unsigned char, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1338     template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1339     template void cols<unsigned char, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1340
1341     template void cols<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1342     template void cols<unsigned short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1343     template void cols<unsigned short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1344     template void cols<unsigned short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1345
1346     template void cols<short, int, short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1347     template void cols<short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1348     template void cols<short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1349     template void cols<short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1350
1351     template void cols<int, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1352     template void cols<int, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1353     template void cols<int, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1354
1355     template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1356     template void cols<float, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1357
1358     template void cols<double, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
1359 }
1360
1361 #endif /* CUDA_DISABLER */