modules/gpu/src/cuda/split_merge.cu

   1 /*M///////////////////////////////////////////////////////////////////////////////////////\r
   2 //\r
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
   4 //\r
   5 //  By downloading, copying, installing or using the software you agree to this license.\r
   6 //  If you do not agree to this license, do not download, install,\r
   7 //  copy or use the software.\r
   8 //\r
   9 //\r
  10 //                           License Agreement\r
  11 //                For Open Source Computer Vision Library\r
  12 //\r
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.\r
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.\r
  15 // Third party copyrights are property of their respective owners.\r
  16 //\r
  17 // Redistribution and use in source and binary forms, with or without modification,\r
  18 // are permitted provided that the following conditions are met:\r
  19 //\r
  20 //   * Redistribution's of source code must retain the above copyright notice,\r
  21 //     this list of conditions and the following disclaimer.\r
  22 //\r
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,\r
  24 //     this list of conditions and the following disclaimer in the documentation\r
  25 //     and/or other GpuMaterials provided with the distribution.\r
  26 //\r
  27 //   * The name of the copyright holders may not be used to endorse or promote products\r
  28 //     derived from this software without specific prior written permission.\r
  29 //\r
  30 // This software is provided by the copyright holders and contributors "as is" and\r
  31 // any express or bpied warranties, including, but not limited to, the bpied\r
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.\r
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,\r
  34 // indirect, incidental, special, exemplary, or consequential damages\r
  35 // (including, but not limited to, procurement of substitute goods or services;\r
  36 // loss of use, data, or profits; or business interruption) however caused\r
  37 // and on any theory of liability, whether in contract, strict liability,\r
  38 // or tort (including negligence or otherwise) arising in any way out of\r
  39 // the use of this software, even if advised of the possibility of such damage.\r
  40 //\r
  41 //M*/\r
  42 \r
  43 #include "internal_shared.hpp"\r
  44 \r
  45 namespace cv { namespace gpu { namespace device \r
  46 {\r
  47     namespace split_merge \r
  48     {\r
  49         template <typename T, size_t elem_size = sizeof(T)>\r
  50         struct TypeTraits \r
  51         {\r
  52             typedef T type;\r
  53             typedef T type2;\r
  54             typedef T type3;\r
  55             typedef T type4;\r
  56         };\r
  57 \r
  58         template <typename T>\r
  59         struct TypeTraits<T, 1>\r
  60         {\r
  61             typedef char type;\r
  62             typedef char2 type2;\r
  63             typedef char3 type3;\r
  64             typedef char4 type4;\r
  65         };\r
  66 \r
  67         template <typename T>\r
  68         struct TypeTraits<T, 2>\r
  69         {\r
  70             typedef short type;\r
  71             typedef short2 type2;\r
  72             typedef short3 type3;\r
  73             typedef short4 type4;\r
  74         };\r
  75 \r
  76         template <typename T>\r
  77         struct TypeTraits<T, 4> \r
  78         {\r
  79             typedef int type;\r
  80             typedef int2 type2;\r
  81             typedef int3 type3;\r
  82             typedef int4 type4;\r
  83         };\r
  84 \r
  85         template <typename T>\r
  86         struct TypeTraits<T, 8> \r
  87         {\r
  88             typedef double type;\r
  89             typedef double2 type2;\r
  90             //typedef double3 type3;\r
  91             //typedef double4 type3;\r
  92         };\r
  93 \r
  94         typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);\r
  95         typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);\r
  96 \r
  97         //------------------------------------------------------------\r
  98         // Merge    \r
  99 \r
 100         template <typename T>\r
 101         __global__ void mergeC2_(const uchar* src0, size_t src0_step, \r
 102                                  const uchar* src1, size_t src1_step, \r
 103                                  int rows, int cols, uchar* dst, size_t dst_step)\r
 104         {\r
 105             typedef typename TypeTraits<T>::type2 dst_type;\r
 106 \r
 107             const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
 108             const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
 109 \r
 110             const T* src0_y = (const T*)(src0 + y * src0_step);\r
 111             const T* src1_y = (const T*)(src1 + y * src1_step);\r
 112             dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
 113 \r
 114             if (x < cols && y < rows) \r
 115             {                        \r
 116                 dst_type dst_elem;\r
 117                 dst_elem.x = src0_y[x];\r
 118                 dst_elem.y = src1_y[x];\r
 119                 dst_y[x] = dst_elem;\r
 120             }\r
 121         }\r
 122 \r
 123 \r
 124         template <typename T>\r
 125         __global__ void mergeC3_(const uchar* src0, size_t src0_step, \r
 126                                  const uchar* src1, size_t src1_step, \r
 127                                  const uchar* src2, size_t src2_step, \r
 128                                  int rows, int cols, uchar* dst, size_t dst_step)\r
 129         {\r
 130             typedef typename TypeTraits<T>::type3 dst_type;\r
 131 \r
 132             const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
 133             const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
 134 \r
 135             const T* src0_y = (const T*)(src0 + y * src0_step);\r
 136             const T* src1_y = (const T*)(src1 + y * src1_step);\r
 137             const T* src2_y = (const T*)(src2 + y * src2_step);\r
 138             dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
 139 \r
 140             if (x < cols && y < rows) \r
 141             {                        \r
 142                 dst_type dst_elem;\r
 143                 dst_elem.x = src0_y[x];\r
 144                 dst_elem.y = src1_y[x];\r
 145                 dst_elem.z = src2_y[x];\r
 146                 dst_y[x] = dst_elem;\r
 147             }\r
 148         }\r
 149 \r
 150 \r
 151         template <>\r
 152         __global__ void mergeC3_<double>(const uchar* src0, size_t src0_step, \r
 153                                  const uchar* src1, size_t src1_step, \r
 154                                  const uchar* src2, size_t src2_step, \r
 155                                  int rows, int cols, uchar* dst, size_t dst_step)\r
 156         {\r
 157             const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
 158             const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
 159 \r
 160             const double* src0_y = (const double*)(src0 + y * src0_step);\r
 161             const double* src1_y = (const double*)(src1 + y * src1_step);\r
 162             const double* src2_y = (const double*)(src2 + y * src2_step);\r
 163             double* dst_y = (double*)(dst + y * dst_step);\r
 164 \r
 165             if (x < cols && y < rows) \r
 166             {                        \r
 167                 dst_y[3 * x] = src0_y[x];\r
 168                 dst_y[3 * x + 1] = src1_y[x];\r
 169                 dst_y[3 * x + 2] = src2_y[x];\r
 170             }\r
 171         }\r
 172 \r
 173 \r
 174         template <typename T>\r
 175         __global__ void mergeC4_(const uchar* src0, size_t src0_step, \r
 176                                  const uchar* src1, size_t src1_step, \r
 177                                  const uchar* src2, size_t src2_step, \r
 178                                  const uchar* src3, size_t src3_step, \r
 179                                  int rows, int cols, uchar* dst, size_t dst_step)\r
 180         {\r
 181             typedef typename TypeTraits<T>::type4 dst_type;\r
 182 \r
 183             const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
 184             const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
 185 \r
 186             const T* src0_y = (const T*)(src0 + y * src0_step);\r
 187             const T* src1_y = (const T*)(src1 + y * src1_step);\r
 188             const T* src2_y = (const T*)(src2 + y * src2_step);\r
 189             const T* src3_y = (const T*)(src3 + y * src3_step);\r
 190             dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
 191 \r
 192             if (x < cols && y < rows) \r
 193             {                        \r
 194                 dst_type dst_elem;\r
 195                 dst_elem.x = src0_y[x];\r
 196                 dst_elem.y = src1_y[x];\r
 197                 dst_elem.z = src2_y[x];\r
 198                 dst_elem.w = src3_y[x];\r
 199                 dst_y[x] = dst_elem;\r
 200             }\r
 201         }\r
 202 \r
 203 \r
 204         template <>\r
 205         __global__ void mergeC4_<double>(const uchar* src0, size_t src0_step, \r
 206                                  const uchar* src1, size_t src1_step, \r
 207                                  const uchar* src2, size_t src2_step, \r
 208                                  const uchar* src3, size_t src3_step, \r
 209                                  int rows, int cols, uchar* dst, size_t dst_step)\r
 210         {\r
 211             const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
 212             const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
 213 \r
 214             const double* src0_y = (const double*)(src0 + y * src0_step);\r
 215             const double* src1_y = (const double*)(src1 + y * src1_step);\r
 216             const double* src2_y = (const double*)(src2 + y * src2_step);\r
 217             const double* src3_y = (const double*)(src3 + y * src3_step);\r
 218             double2* dst_y = (double2*)(dst + y * dst_step);\r
 219 \r
 220             if (x < cols && y < rows) \r
 221             {                        \r
 222                 dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);\r
 223                 dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);\r
 224             }\r
 225         }\r
 226 \r
 227 \r
 228         template <typename T>\r
 229         static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
 230         {\r
 231             dim3 blockDim(32, 8);\r
 232             dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
 233             mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(\r
 234                     src[0].data, src[0].step,\r
 235                     src[1].data, src[1].step,\r
 236                     dst.rows, dst.cols, dst.data, dst.step);\r
 237             cudaSafeCall( cudaGetLastError() );\r
 238 \r
 239             if (stream == 0)\r
 240                 cudaSafeCall(cudaDeviceSynchronize());\r
 241         }\r
 242 \r
 243 \r
 244         template <typename T>\r
 245         static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
 246         {\r
 247             dim3 blockDim(32, 8);\r
 248             dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
 249             mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(\r
 250                     src[0].data, src[0].step,\r
 251                     src[1].data, src[1].step,\r
 252                     src[2].data, src[2].step,\r
 253                     dst.rows, dst.cols, dst.data, dst.step);\r
 254             cudaSafeCall( cudaGetLastError() );\r
 255 \r
 256             if (stream == 0)\r
 257                 cudaSafeCall(cudaDeviceSynchronize());\r
 258         }\r
 259 \r
 260 \r
 261         template <typename T>\r
 262         static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
 263         {\r
 264             dim3 blockDim(32, 8);\r
 265             dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
 266             mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(\r
 267                     src[0].data, src[0].step,\r
 268                     src[1].data, src[1].step,\r
 269                     src[2].data, src[2].step,\r
 270                     src[3].data, src[3].step,\r
 271                     dst.rows, dst.cols, dst.data, dst.step);\r
 272             cudaSafeCall( cudaGetLastError() );\r
 273 \r
 274             if (stream == 0)\r
 275                 cudaSafeCall(cudaDeviceSynchronize());\r
 276         }\r
 277 \r
 278 \r
 279         void merge_caller(const DevMem2Db* src, DevMem2Db& dst,\r
 280                                      int total_channels, size_t elem_size,\r
 281                                      const cudaStream_t& stream)\r
 282         {\r
 283             static MergeFunction merge_func_tbl[] =\r
 284             {\r
 285                 mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,\r
 286                 mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,\r
 287                 mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,\r
 288             };\r
 289 \r
 290             size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);\r
 291             MergeFunction merge_func = merge_func_tbl[merge_func_id];\r
 292 \r
 293             if (merge_func == 0)\r
 294                 cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__, "merge_caller");\r
 295 \r
 296             merge_func(src, dst, stream);\r
 297         }\r
 298 \r
 299 \r
 300 \r
 301         //------------------------------------------------------------\r
 302         // Split\r
 303 \r
 304 \r
 305         template <typename T>\r
 306         __global__ void splitC2_(const uchar* src, size_t src_step, \r
 307                                 int rows, int cols,\r
 308                                 uchar* dst0, size_t dst0_step,\r
 309                                 uchar* dst1, size_t dst1_step)\r
 310         {\r
 311             typedef typename TypeTraits<T>::type2 src_type;\r
 312 \r
 313             const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
 314             const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
 315 \r
 316             const src_type* src_y = (const src_type*)(src + y * src_step);\r
 317             T* dst0_y = (T*)(dst0 + y * dst0_step);\r
 318             T* dst1_y = (T*)(dst1 + y * dst1_step);\r
 319 \r
 320             if (x < cols && y < rows) \r
 321             {\r
 322                 src_type src_elem = src_y[x];\r
 323                 dst0_y[x] = src_elem.x;\r
 324                 dst1_y[x] = src_elem.y;\r
 325             }\r
 326         }\r
 327 \r
 328 \r
 329         template <typename T>\r
 330         __global__ void splitC3_(const uchar* src, size_t src_step, \r
 331                                 int rows, int cols,\r
 332                                 uchar* dst0, size_t dst0_step,\r
 333                                 uchar* dst1, size_t dst1_step,\r
 334                                 uchar* dst2, size_t dst2_step)\r
 335         {\r
 336             typedef typename TypeTraits<T>::type3 src_type;\r
 337 \r
 338             const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
 339             const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
 340 \r
 341             const src_type* src_y = (const src_type*)(src + y * src_step);\r
 342             T* dst0_y = (T*)(dst0 + y * dst0_step);\r
 343             T* dst1_y = (T*)(dst1 + y * dst1_step);\r
 344             T* dst2_y = (T*)(dst2 + y * dst2_step);\r
 345 \r
 346             if (x < cols && y < rows) \r
 347             {\r
 348                 src_type src_elem = src_y[x];\r
 349                 dst0_y[x] = src_elem.x;\r
 350                 dst1_y[x] = src_elem.y;\r
 351                 dst2_y[x] = src_elem.z;\r
 352             }\r
 353         }\r
 354 \r
 355 \r
 356         template <>\r
 357         __global__ void splitC3_<double>(\r
 358                 const uchar* src, size_t src_step, int rows, int cols,\r
 359                 uchar* dst0, size_t dst0_step,\r
 360                 uchar* dst1, size_t dst1_step,\r
 361                 uchar* dst2, size_t dst2_step)\r
 362         {\r
 363             const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
 364             const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
 365 \r
 366             const double* src_y = (const double*)(src + y * src_step);\r
 367             double* dst0_y = (double*)(dst0 + y * dst0_step);\r
 368             double* dst1_y = (double*)(dst1 + y * dst1_step);\r
 369             double* dst2_y = (double*)(dst2 + y * dst2_step);\r
 370 \r
 371             if (x < cols && y < rows) \r
 372             {\r
 373                 dst0_y[x] = src_y[3 * x];\r
 374                 dst1_y[x] = src_y[3 * x + 1];\r
 375                 dst2_y[x] = src_y[3 * x + 2];\r
 376             }\r
 377         }\r
 378 \r
 379 \r
 380         template <typename T>\r
 381         __global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,\r
 382                                 uchar* dst0, size_t dst0_step,\r
 383                                 uchar* dst1, size_t dst1_step,\r
 384                                 uchar* dst2, size_t dst2_step,\r
 385                                 uchar* dst3, size_t dst3_step)\r
 386         {\r
 387             typedef typename TypeTraits<T>::type4 src_type;\r
 388 \r
 389             const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
 390             const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
 391 \r
 392             const src_type* src_y = (const src_type*)(src + y * src_step);\r
 393             T* dst0_y = (T*)(dst0 + y * dst0_step);\r
 394             T* dst1_y = (T*)(dst1 + y * dst1_step);\r
 395             T* dst2_y = (T*)(dst2 + y * dst2_step);\r
 396             T* dst3_y = (T*)(dst3 + y * dst3_step);\r
 397 \r
 398             if (x < cols && y < rows) \r
 399             {\r
 400                 src_type src_elem = src_y[x];\r
 401                 dst0_y[x] = src_elem.x;\r
 402                 dst1_y[x] = src_elem.y;\r
 403                 dst2_y[x] = src_elem.z;\r
 404                 dst3_y[x] = src_elem.w;\r
 405             }\r
 406         }\r
 407 \r
 408 \r
 409         template <>\r
 410         __global__ void splitC4_<double>(\r
 411                 const uchar* src, size_t src_step, int rows, int cols,\r
 412                 uchar* dst0, size_t dst0_step,\r
 413                 uchar* dst1, size_t dst1_step,\r
 414                 uchar* dst2, size_t dst2_step,\r
 415                 uchar* dst3, size_t dst3_step)\r
 416         {\r
 417             const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
 418             const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
 419 \r
 420             const double2* src_y = (const double2*)(src + y * src_step);\r
 421             double* dst0_y = (double*)(dst0 + y * dst0_step);\r
 422             double* dst1_y = (double*)(dst1 + y * dst1_step);\r
 423             double* dst2_y = (double*)(dst2 + y * dst2_step);\r
 424             double* dst3_y = (double*)(dst3 + y * dst3_step);\r
 425 \r
 426             if (x < cols && y < rows) \r
 427             {\r
 428                 double2 src_elem1 = src_y[2 * x];\r
 429                 double2 src_elem2 = src_y[2 * x + 1];\r
 430                 dst0_y[x] = src_elem1.x;\r
 431                 dst1_y[x] = src_elem1.y;\r
 432                 dst2_y[x] = src_elem2.x;\r
 433                 dst3_y[x] = src_elem2.y;\r
 434             }\r
 435         }\r
 436 \r
 437         template <typename T>\r
 438         static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
 439         {\r
 440             dim3 blockDim(32, 8);\r
 441             dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
 442             splitC2_<T><<<gridDim, blockDim, 0, stream>>>(\r
 443                     src.data, src.step, src.rows, src.cols,\r
 444                     dst[0].data, dst[0].step,\r
 445                     dst[1].data, dst[1].step);\r
 446             cudaSafeCall( cudaGetLastError() );\r
 447 \r
 448             if (stream == 0)\r
 449                 cudaSafeCall(cudaDeviceSynchronize());\r
 450         }\r
 451 \r
 452 \r
 453         template <typename T>\r
 454         static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
 455         {\r
 456             dim3 blockDim(32, 8);\r
 457             dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
 458             splitC3_<T><<<gridDim, blockDim, 0, stream>>>(\r
 459                     src.data, src.step, src.rows, src.cols,\r
 460                     dst[0].data, dst[0].step,\r
 461                     dst[1].data, dst[1].step,\r
 462                     dst[2].data, dst[2].step);\r
 463             cudaSafeCall( cudaGetLastError() );\r
 464 \r
 465             if (stream == 0)\r
 466                 cudaSafeCall(cudaDeviceSynchronize());\r
 467         }\r
 468 \r
 469 \r
 470         template <typename T>\r
 471         static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
 472         {\r
 473             dim3 blockDim(32, 8);\r
 474             dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
 475             splitC4_<T><<<gridDim, blockDim, 0, stream>>>(\r
 476                      src.data, src.step, src.rows, src.cols,\r
 477                      dst[0].data, dst[0].step,\r
 478                      dst[1].data, dst[1].step,\r
 479                      dst[2].data, dst[2].step,\r
 480                      dst[3].data, dst[3].step);\r
 481             cudaSafeCall( cudaGetLastError() );\r
 482 \r
 483             if (stream == 0)\r
 484                 cudaSafeCall(cudaDeviceSynchronize());\r
 485         }\r
 486 \r
 487 \r
 488         void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)\r
 489         {\r
 490             static SplitFunction split_func_tbl[] =\r
 491             {\r
 492                 splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,\r
 493                 splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,\r
 494                 splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,\r
 495             };\r
 496 \r
 497             size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);\r
 498             SplitFunction split_func = split_func_tbl[split_func_id];\r
 499 \r
 500             if (split_func == 0)\r
 501                 cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__, "split_caller");\r
 502 \r
 503             split_func(src, dst, stream);\r
 504         }\r
 505     } // namespace split_merge\r
 506 }}} // namespace cv { namespace gpu { namespace device\r