modules/dynamicuda/src/cuda/matrix_operations.cu

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #include "opencv2/gpu/device/saturate_cast.hpp"
  44 #include "opencv2/gpu/device/transform.hpp"
  45 #include "opencv2/gpu/device/functional.hpp"
  46 #include "opencv2/gpu/device/type_traits.hpp"
  47
  48 namespace cv { namespace gpu { namespace device
  49 {
  50     void writeScalar(const uchar*);
  51     void writeScalar(const schar*);
  52     void writeScalar(const ushort*);
  53     void writeScalar(const short int*);
  54     void writeScalar(const int*);
  55     void writeScalar(const float*);
  56     void writeScalar(const double*);
  57     void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
  58     void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
  59 }}}
  60
  61 namespace cv { namespace gpu { namespace device
  62 {
  63     template <typename T> struct shift_and_sizeof;
  64     template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
  65     template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
  66     template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };
  67     template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
  68     template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
  69     template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
  70     template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };
  71
  72     ///////////////////////////////////////////////////////////////////////////
  73     ////////////////////////////////// CopyTo /////////////////////////////////
  74     ///////////////////////////////////////////////////////////////////////////
  75
  76     template <typename T> void copyToWithMask(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream)
  77     {
  78         if (colorMask)
  79             cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, identity<T>(), SingleMask(mask), stream);
  80         else
  81             cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, identity<T>(), SingleMaskChannels(mask, cn), stream);
  82     }
  83
  84     void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream)
  85     {
  86         typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
  87
  88         static func_t tab[] =
  89         {
  90             0,
  91             copyToWithMask<unsigned char>,
  92             copyToWithMask<unsigned short>,
  93             0,
  94             copyToWithMask<int>,
  95             0,
  96             0,
  97             0,
  98             copyToWithMask<double>
  99         };
 100
 101         tab[elemSize1](src, dst, cn, mask, colorMask, stream);
 102     }
 103
 104     ///////////////////////////////////////////////////////////////////////////
 105     ////////////////////////////////// SetTo //////////////////////////////////
 106     ///////////////////////////////////////////////////////////////////////////
 107
 108     __constant__ uchar scalar_8u[4];
 109     __constant__ schar scalar_8s[4];
 110     __constant__ ushort scalar_16u[4];
 111     __constant__ short scalar_16s[4];
 112     __constant__ int scalar_32s[4];
 113     __constant__ float scalar_32f[4];
 114     __constant__ double scalar_64f[4];
 115
 116     template <typename T> __device__ __forceinline__ T readScalar(int i);
 117     template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}
 118     template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}
 119     template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}
 120     template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}
 121     template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}
 122     template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}
 123     template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];}
 124
 125     void writeScalar(const uchar* vals)
 126     {
 127         cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );
 128     }
 129     void writeScalar(const schar* vals)
 130     {
 131         cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );
 132     }
 133     void writeScalar(const ushort* vals)
 134     {
 135         cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );
 136     }
 137     void writeScalar(const short* vals)
 138     {
 139         cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );
 140     }
 141     void writeScalar(const int* vals)
 142     {
 143         cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );
 144     }
 145     void writeScalar(const float* vals)
 146     {
 147         cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );
 148     }
 149     void writeScalar(const double* vals)
 150     {
 151         cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );
 152     }
 153
 154     template<typename T>
 155     __global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
 156     {
 157         size_t x = blockIdx.x * blockDim.x + threadIdx.x;
 158         size_t y = blockIdx.y * blockDim.y + threadIdx.y;
 159
 160         if ((x < cols * channels ) && (y < rows))
 161         {
 162             size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
 163             mat[idx] = readScalar<T>(x % channels);
 164         }
 165     }
 166
 167     template<typename T>
 168     __global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
 169     {
 170         size_t x = blockIdx.x * blockDim.x + threadIdx.x;
 171         size_t y = blockIdx.y * blockDim.y + threadIdx.y;
 172
 173         if ((x < cols * channels ) && (y < rows))
 174             if (mask[y * step_mask + x / channels] != 0)
 175             {
 176                 size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
 177                 mat[idx] = readScalar<T>(x % channels);
 178             }
 179     }
 180     template <typename T>
 181     void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream)
 182     {
 183         writeScalar(scalar);
 184
 185         dim3 threadsPerBlock(32, 8, 1);
 186         dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
 187
 188         set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
 189         cudaSafeCall( cudaGetLastError() );
 190
 191         if (stream == 0)
 192             cudaSafeCall ( cudaDeviceSynchronize() );
 193     }
 194
 195     template void set_to_gpu<uchar >(PtrStepSzb mat, const uchar*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
 196     template void set_to_gpu<schar >(PtrStepSzb mat, const schar*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
 197     template void set_to_gpu<ushort>(PtrStepSzb mat, const ushort* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
 198     template void set_to_gpu<short >(PtrStepSzb mat, const short*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
 199     template void set_to_gpu<int   >(PtrStepSzb mat, const int*    scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
 200     template void set_to_gpu<float >(PtrStepSzb mat, const float*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
 201     template void set_to_gpu<double>(PtrStepSzb mat, const double* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
 202
 203     template <typename T>
 204     void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream)
 205     {
 206         writeScalar(scalar);
 207
 208         dim3 threadsPerBlock(32, 8, 1);
 209         dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
 210
 211         set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
 212         cudaSafeCall( cudaGetLastError() );
 213
 214         if (stream == 0)
 215             cudaSafeCall ( cudaDeviceSynchronize() );
 216     }
 217
 218     template void set_to_gpu<uchar >(PtrStepSzb mat, const uchar*  scalar, int channels, cudaStream_t stream);
 219     template void set_to_gpu<schar >(PtrStepSzb mat, const schar*  scalar, int channels, cudaStream_t stream);
 220     template void set_to_gpu<ushort>(PtrStepSzb mat, const ushort* scalar, int channels, cudaStream_t stream);
 221     template void set_to_gpu<short >(PtrStepSzb mat, const short*  scalar, int channels, cudaStream_t stream);
 222     template void set_to_gpu<int   >(PtrStepSzb mat, const int*    scalar, int channels, cudaStream_t stream);
 223     template void set_to_gpu<float >(PtrStepSzb mat, const float*  scalar, int channels, cudaStream_t stream);
 224     template void set_to_gpu<double>(PtrStepSzb mat, const double* scalar, int channels, cudaStream_t stream);
 225
 226     ///////////////////////////////////////////////////////////////////////////
 227     //////////////////////////////// ConvertTo ////////////////////////////////
 228     ///////////////////////////////////////////////////////////////////////////
 229
 230     template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
 231     {
 232         Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {}
 233
 234         __device__ __forceinline__ D operator()(typename TypeTraits<T>::ParameterType src) const
 235         {
 236             return saturate_cast<D>(alpha * src + beta);
 237         }
 238
 239         S alpha, beta;
 240     };
 241
 242     namespace detail
 243     {
 244         template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
 245         {
 246         };
 247         template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>
 248         {
 249             enum { smart_shift = 8 };
 250         };
 251         template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>
 252         {
 253             enum { smart_shift = 4 };
 254         };
 255         template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>
 256         {
 257             enum { smart_block_dim_y = 8 };
 258             enum { smart_shift = 4 };
 259         };
 260
 261         template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>
 262         {
 263             enum { smart_shift = 4 };
 264         };
 265         template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>
 266         {
 267             enum { smart_shift = 2 };
 268         };
 269
 270         template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>
 271         {
 272             enum { smart_block_dim_y = 8 };
 273             enum { smart_shift = 4 };
 274         };
 275         template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>
 276         {
 277             enum { smart_block_dim_y = 8 };
 278             enum { smart_shift = 2 };
 279         };
 280
 281         template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
 282         {
 283         };
 284     }
 285
 286     template <typename T, typename D, typename S> struct TransformFunctorTraits< Convertor<T, D, S> > : detail::ConvertTraits< Convertor<T, D, S> >
 287     {
 288     };
 289
 290     template<typename T, typename D, typename S>
 291     void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
 292     {
 293         cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
 294         cudaSafeCall( cudaSetDoubleForDevice(&beta) );
 295         Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
 296         cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
 297     }
 298
 299 #if defined  __clang__
 300 # pragma clang diagnostic push
 301 # pragma clang diagnostic ignored "-Wmissing-declarations"
 302 #endif
 303
 304     void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream)
 305     {
 306         typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);
 307
 308         static const caller_t tab[7][7] =
 309         {
 310             {
 311                 cvt_<uchar, uchar, float>,
 312                 cvt_<uchar, schar, float>,
 313                 cvt_<uchar, ushort, float>,
 314                 cvt_<uchar, short, float>,
 315                 cvt_<uchar, int, float>,
 316                 cvt_<uchar, float, float>,
 317                 cvt_<uchar, double, double>
 318             },
 319             {
 320                 cvt_<schar, uchar, float>,
 321                 cvt_<schar, schar, float>,
 322                 cvt_<schar, ushort, float>,
 323                 cvt_<schar, short, float>,
 324                 cvt_<schar, int, float>,
 325                 cvt_<schar, float, float>,
 326                 cvt_<schar, double, double>
 327             },
 328             {
 329                 cvt_<ushort, uchar, float>,
 330                 cvt_<ushort, schar, float>,
 331                 cvt_<ushort, ushort, float>,
 332                 cvt_<ushort, short, float>,
 333                 cvt_<ushort, int, float>,
 334                 cvt_<ushort, float, float>,
 335                 cvt_<ushort, double, double>
 336             },
 337             {
 338                 cvt_<short, uchar, float>,
 339                 cvt_<short, schar, float>,
 340                 cvt_<short, ushort, float>,
 341                 cvt_<short, short, float>,
 342                 cvt_<short, int, float>,
 343                 cvt_<short, float, float>,
 344                 cvt_<short, double, double>
 345             },
 346             {
 347                 cvt_<int, uchar, float>,
 348                 cvt_<int, schar, float>,
 349                 cvt_<int, ushort, float>,
 350                 cvt_<int, short, float>,
 351                 cvt_<int, int, double>,
 352                 cvt_<int, float, double>,
 353                 cvt_<int, double, double>
 354             },
 355             {
 356                 cvt_<float, uchar, float>,
 357                 cvt_<float, schar, float>,
 358                 cvt_<float, ushort, float>,
 359                 cvt_<float, short, float>,
 360                 cvt_<float, int, float>,
 361                 cvt_<float, float, float>,
 362                 cvt_<float, double, double>
 363             },
 364             {
 365                 cvt_<double, uchar, double>,
 366                 cvt_<double, schar, double>,
 367                 cvt_<double, ushort, double>,
 368                 cvt_<double, short, double>,
 369                 cvt_<double, int, double>,
 370                 cvt_<double, float, double>,
 371                 cvt_<double, double, double>
 372             }
 373         };
 374
 375         caller_t func = tab[sdepth][ddepth];
 376         func(src, dst, alpha, beta, stream);
 377     }
 378
 379 #if defined __clang__
 380 # pragma clang diagnostic pop
 381 #endif
 382 }}} // namespace cv { namespace gpu { namespace device