modules/gpu/src/cuda/split_merge.cu

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other GpuMaterials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or bpied warranties, including, but not limited to, the bpied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #if !defined CUDA_DISABLER
  44
  45 #include "internal_shared.hpp"
  46
  47 namespace cv { namespace gpu { namespace device
  48 {
  49     namespace split_merge
  50     {
  51         template <typename T, size_t elem_size = sizeof(T)>
  52         struct TypeTraits
  53         {
  54             typedef T type;
  55             typedef T type2;
  56             typedef T type3;
  57             typedef T type4;
  58         };
  59
  60         template <typename T>
  61         struct TypeTraits<T, 1>
  62         {
  63             typedef char type;
  64             typedef char2 type2;
  65             typedef char3 type3;
  66             typedef char4 type4;
  67         };
  68
  69         template <typename T>
  70         struct TypeTraits<T, 2>
  71         {
  72             typedef short type;
  73             typedef short2 type2;
  74             typedef short3 type3;
  75             typedef short4 type4;
  76         };
  77
  78         template <typename T>
  79         struct TypeTraits<T, 4>
  80         {
  81             typedef int type;
  82             typedef int2 type2;
  83             typedef int3 type3;
  84             typedef int4 type4;
  85         };
  86
  87         template <typename T>
  88         struct TypeTraits<T, 8>
  89         {
  90             typedef double type;
  91             typedef double2 type2;
  92             //typedef double3 type3;
  93             //typedef double4 type3;
  94         };
  95
  96         typedef void (*MergeFunction)(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream);
  97         typedef void (*SplitFunction)(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream);
  98
  99         //------------------------------------------------------------
 100         // Merge
 101
 102         template <typename T>
 103         __global__ void mergeC2_(const uchar* src0, size_t src0_step,
 104                                  const uchar* src1, size_t src1_step,
 105                                  int rows, int cols, uchar* dst, size_t dst_step)
 106         {
 107             typedef typename TypeTraits<T>::type2 dst_type;
 108
 109             const int x = blockIdx.x * blockDim.x + threadIdx.x;
 110             const int y = blockIdx.y * blockDim.y + threadIdx.y;
 111
 112             const T* src0_y = (const T*)(src0 + y * src0_step);
 113             const T* src1_y = (const T*)(src1 + y * src1_step);
 114             dst_type* dst_y = (dst_type*)(dst + y * dst_step);
 115
 116             if (x < cols && y < rows)
 117             {
 118                 dst_type dst_elem;
 119                 dst_elem.x = src0_y[x];
 120                 dst_elem.y = src1_y[x];
 121                 dst_y[x] = dst_elem;
 122             }
 123         }
 124
 125
 126         template <typename T>
 127         __global__ void mergeC3_(const uchar* src0, size_t src0_step,
 128                                  const uchar* src1, size_t src1_step,
 129                                  const uchar* src2, size_t src2_step,
 130                                  int rows, int cols, uchar* dst, size_t dst_step)
 131         {
 132             typedef typename TypeTraits<T>::type3 dst_type;
 133
 134             const int x = blockIdx.x * blockDim.x + threadIdx.x;
 135             const int y = blockIdx.y * blockDim.y + threadIdx.y;
 136
 137             const T* src0_y = (const T*)(src0 + y * src0_step);
 138             const T* src1_y = (const T*)(src1 + y * src1_step);
 139             const T* src2_y = (const T*)(src2 + y * src2_step);
 140             dst_type* dst_y = (dst_type*)(dst + y * dst_step);
 141
 142             if (x < cols && y < rows)
 143             {
 144                 dst_type dst_elem;
 145                 dst_elem.x = src0_y[x];
 146                 dst_elem.y = src1_y[x];
 147                 dst_elem.z = src2_y[x];
 148                 dst_y[x] = dst_elem;
 149             }
 150         }
 151
 152
 153         template <>
 154         __global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
 155                                  const uchar* src1, size_t src1_step,
 156                                  const uchar* src2, size_t src2_step,
 157                                  int rows, int cols, uchar* dst, size_t dst_step)
 158         {
 159             const int x = blockIdx.x * blockDim.x + threadIdx.x;
 160             const int y = blockIdx.y * blockDim.y + threadIdx.y;
 161
 162             const double* src0_y = (const double*)(src0 + y * src0_step);
 163             const double* src1_y = (const double*)(src1 + y * src1_step);
 164             const double* src2_y = (const double*)(src2 + y * src2_step);
 165             double* dst_y = (double*)(dst + y * dst_step);
 166
 167             if (x < cols && y < rows)
 168             {
 169                 dst_y[3 * x] = src0_y[x];
 170                 dst_y[3 * x + 1] = src1_y[x];
 171                 dst_y[3 * x + 2] = src2_y[x];
 172             }
 173         }
 174
 175
 176         template <typename T>
 177         __global__ void mergeC4_(const uchar* src0, size_t src0_step,
 178                                  const uchar* src1, size_t src1_step,
 179                                  const uchar* src2, size_t src2_step,
 180                                  const uchar* src3, size_t src3_step,
 181                                  int rows, int cols, uchar* dst, size_t dst_step)
 182         {
 183             typedef typename TypeTraits<T>::type4 dst_type;
 184
 185             const int x = blockIdx.x * blockDim.x + threadIdx.x;
 186             const int y = blockIdx.y * blockDim.y + threadIdx.y;
 187
 188             const T* src0_y = (const T*)(src0 + y * src0_step);
 189             const T* src1_y = (const T*)(src1 + y * src1_step);
 190             const T* src2_y = (const T*)(src2 + y * src2_step);
 191             const T* src3_y = (const T*)(src3 + y * src3_step);
 192             dst_type* dst_y = (dst_type*)(dst + y * dst_step);
 193
 194             if (x < cols && y < rows)
 195             {
 196                 dst_type dst_elem;
 197                 dst_elem.x = src0_y[x];
 198                 dst_elem.y = src1_y[x];
 199                 dst_elem.z = src2_y[x];
 200                 dst_elem.w = src3_y[x];
 201                 dst_y[x] = dst_elem;
 202             }
 203         }
 204
 205
 206         template <>
 207         __global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
 208                                  const uchar* src1, size_t src1_step,
 209                                  const uchar* src2, size_t src2_step,
 210                                  const uchar* src3, size_t src3_step,
 211                                  int rows, int cols, uchar* dst, size_t dst_step)
 212         {
 213             const int x = blockIdx.x * blockDim.x + threadIdx.x;
 214             const int y = blockIdx.y * blockDim.y + threadIdx.y;
 215
 216             const double* src0_y = (const double*)(src0 + y * src0_step);
 217             const double* src1_y = (const double*)(src1 + y * src1_step);
 218             const double* src2_y = (const double*)(src2 + y * src2_step);
 219             const double* src3_y = (const double*)(src3 + y * src3_step);
 220             double2* dst_y = (double2*)(dst + y * dst_step);
 221
 222             if (x < cols && y < rows)
 223             {
 224                 dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
 225                 dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
 226             }
 227         }
 228
 229
 230         template <typename T>
 231         static void mergeC2_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
 232         {
 233             dim3 block(32, 8);
 234             dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
 235             mergeC2_<T><<<grid, block, 0, stream>>>(
 236                     src[0].data, src[0].step,
 237                     src[1].data, src[1].step,
 238                     dst.rows, dst.cols, dst.data, dst.step);
 239             cudaSafeCall( cudaGetLastError() );
 240
 241             if (stream == 0)
 242                 cudaSafeCall(cudaDeviceSynchronize());
 243         }
 244
 245
 246         template <typename T>
 247         static void mergeC3_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
 248         {
 249             dim3 block(32, 8);
 250             dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
 251             mergeC3_<T><<<grid, block, 0, stream>>>(
 252                     src[0].data, src[0].step,
 253                     src[1].data, src[1].step,
 254                     src[2].data, src[2].step,
 255                     dst.rows, dst.cols, dst.data, dst.step);
 256             cudaSafeCall( cudaGetLastError() );
 257
 258             if (stream == 0)
 259                 cudaSafeCall(cudaDeviceSynchronize());
 260         }
 261
 262
 263         template <typename T>
 264         static void mergeC4_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
 265         {
 266             dim3 block(32, 8);
 267             dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
 268             mergeC4_<T><<<grid, block, 0, stream>>>(
 269                     src[0].data, src[0].step,
 270                     src[1].data, src[1].step,
 271                     src[2].data, src[2].step,
 272                     src[3].data, src[3].step,
 273                     dst.rows, dst.cols, dst.data, dst.step);
 274             cudaSafeCall( cudaGetLastError() );
 275
 276             if (stream == 0)
 277                 cudaSafeCall(cudaDeviceSynchronize());
 278         }
 279
 280
 281         void merge_caller(const PtrStepSzb* src, PtrStepSzb& dst,
 282                                      int total_channels, size_t elem_size,
 283                                      const cudaStream_t& stream)
 284         {
 285             static MergeFunction merge_func_tbl[] =
 286             {
 287                 mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
 288                 mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
 289                 mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
 290             };
 291
 292             size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
 293             MergeFunction merge_func = merge_func_tbl[merge_func_id];
 294
 295             if (merge_func == 0)
 296                 cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__, "merge_caller");
 297
 298             merge_func(src, dst, stream);
 299         }
 300
 301
 302
 303         //------------------------------------------------------------
 304         // Split
 305
 306
 307         template <typename T>
 308         __global__ void splitC2_(const uchar* src, size_t src_step,
 309                                 int rows, int cols,
 310                                 uchar* dst0, size_t dst0_step,
 311                                 uchar* dst1, size_t dst1_step)
 312         {
 313             typedef typename TypeTraits<T>::type2 src_type;
 314
 315             const int x = blockIdx.x * blockDim.x + threadIdx.x;
 316             const int y = blockIdx.y * blockDim.y + threadIdx.y;
 317
 318             const src_type* src_y = (const src_type*)(src + y * src_step);
 319             T* dst0_y = (T*)(dst0 + y * dst0_step);
 320             T* dst1_y = (T*)(dst1 + y * dst1_step);
 321
 322             if (x < cols && y < rows)
 323             {
 324                 src_type src_elem = src_y[x];
 325                 dst0_y[x] = src_elem.x;
 326                 dst1_y[x] = src_elem.y;
 327             }
 328         }
 329
 330
 331         template <typename T>
 332         __global__ void splitC3_(const uchar* src, size_t src_step,
 333                                 int rows, int cols,
 334                                 uchar* dst0, size_t dst0_step,
 335                                 uchar* dst1, size_t dst1_step,
 336                                 uchar* dst2, size_t dst2_step)
 337         {
 338             typedef typename TypeTraits<T>::type3 src_type;
 339
 340             const int x = blockIdx.x * blockDim.x + threadIdx.x;
 341             const int y = blockIdx.y * blockDim.y + threadIdx.y;
 342
 343             const src_type* src_y = (const src_type*)(src + y * src_step);
 344             T* dst0_y = (T*)(dst0 + y * dst0_step);
 345             T* dst1_y = (T*)(dst1 + y * dst1_step);
 346             T* dst2_y = (T*)(dst2 + y * dst2_step);
 347
 348             if (x < cols && y < rows)
 349             {
 350                 src_type src_elem = src_y[x];
 351                 dst0_y[x] = src_elem.x;
 352                 dst1_y[x] = src_elem.y;
 353                 dst2_y[x] = src_elem.z;
 354             }
 355         }
 356
 357
 358         template <>
 359         __global__ void splitC3_<double>(
 360                 const uchar* src, size_t src_step, int rows, int cols,
 361                 uchar* dst0, size_t dst0_step,
 362                 uchar* dst1, size_t dst1_step,
 363                 uchar* dst2, size_t dst2_step)
 364         {
 365             const int x = blockIdx.x * blockDim.x + threadIdx.x;
 366             const int y = blockIdx.y * blockDim.y + threadIdx.y;
 367
 368             const double* src_y = (const double*)(src + y * src_step);
 369             double* dst0_y = (double*)(dst0 + y * dst0_step);
 370             double* dst1_y = (double*)(dst1 + y * dst1_step);
 371             double* dst2_y = (double*)(dst2 + y * dst2_step);
 372
 373             if (x < cols && y < rows)
 374             {
 375                 dst0_y[x] = src_y[3 * x];
 376                 dst1_y[x] = src_y[3 * x + 1];
 377                 dst2_y[x] = src_y[3 * x + 2];
 378             }
 379         }
 380
 381
 382         template <typename T>
 383         __global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
 384                                 uchar* dst0, size_t dst0_step,
 385                                 uchar* dst1, size_t dst1_step,
 386                                 uchar* dst2, size_t dst2_step,
 387                                 uchar* dst3, size_t dst3_step)
 388         {
 389             typedef typename TypeTraits<T>::type4 src_type;
 390
 391             const int x = blockIdx.x * blockDim.x + threadIdx.x;
 392             const int y = blockIdx.y * blockDim.y + threadIdx.y;
 393
 394             const src_type* src_y = (const src_type*)(src + y * src_step);
 395             T* dst0_y = (T*)(dst0 + y * dst0_step);
 396             T* dst1_y = (T*)(dst1 + y * dst1_step);
 397             T* dst2_y = (T*)(dst2 + y * dst2_step);
 398             T* dst3_y = (T*)(dst3 + y * dst3_step);
 399
 400             if (x < cols && y < rows)
 401             {
 402                 src_type src_elem = src_y[x];
 403                 dst0_y[x] = src_elem.x;
 404                 dst1_y[x] = src_elem.y;
 405                 dst2_y[x] = src_elem.z;
 406                 dst3_y[x] = src_elem.w;
 407             }
 408         }
 409
 410
 411         template <>
 412         __global__ void splitC4_<double>(
 413                 const uchar* src, size_t src_step, int rows, int cols,
 414                 uchar* dst0, size_t dst0_step,
 415                 uchar* dst1, size_t dst1_step,
 416                 uchar* dst2, size_t dst2_step,
 417                 uchar* dst3, size_t dst3_step)
 418         {
 419             const int x = blockIdx.x * blockDim.x + threadIdx.x;
 420             const int y = blockIdx.y * blockDim.y + threadIdx.y;
 421
 422             const double2* src_y = (const double2*)(src + y * src_step);
 423             double* dst0_y = (double*)(dst0 + y * dst0_step);
 424             double* dst1_y = (double*)(dst1 + y * dst1_step);
 425             double* dst2_y = (double*)(dst2 + y * dst2_step);
 426             double* dst3_y = (double*)(dst3 + y * dst3_step);
 427
 428             if (x < cols && y < rows)
 429             {
 430                 double2 src_elem1 = src_y[2 * x];
 431                 double2 src_elem2 = src_y[2 * x + 1];
 432                 dst0_y[x] = src_elem1.x;
 433                 dst1_y[x] = src_elem1.y;
 434                 dst2_y[x] = src_elem2.x;
 435                 dst3_y[x] = src_elem2.y;
 436             }
 437         }
 438
 439         template <typename T>
 440         static void splitC2_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
 441         {
 442             dim3 block(32, 8);
 443             dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
 444             splitC2_<T><<<grid, block, 0, stream>>>(
 445                     src.data, src.step, src.rows, src.cols,
 446                     dst[0].data, dst[0].step,
 447                     dst[1].data, dst[1].step);
 448             cudaSafeCall( cudaGetLastError() );
 449
 450             if (stream == 0)
 451                 cudaSafeCall(cudaDeviceSynchronize());
 452         }
 453
 454
 455         template <typename T>
 456         static void splitC3_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
 457         {
 458             dim3 block(32, 8);
 459             dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
 460             splitC3_<T><<<grid, block, 0, stream>>>(
 461                     src.data, src.step, src.rows, src.cols,
 462                     dst[0].data, dst[0].step,
 463                     dst[1].data, dst[1].step,
 464                     dst[2].data, dst[2].step);
 465             cudaSafeCall( cudaGetLastError() );
 466
 467             if (stream == 0)
 468                 cudaSafeCall(cudaDeviceSynchronize());
 469         }
 470
 471
 472         template <typename T>
 473         static void splitC4_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
 474         {
 475             dim3 block(32, 8);
 476             dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
 477             splitC4_<T><<<grid, block, 0, stream>>>(
 478                      src.data, src.step, src.rows, src.cols,
 479                      dst[0].data, dst[0].step,
 480                      dst[1].data, dst[1].step,
 481                      dst[2].data, dst[2].step,
 482                      dst[3].data, dst[3].step);
 483             cudaSafeCall( cudaGetLastError() );
 484
 485             if (stream == 0)
 486                 cudaSafeCall(cudaDeviceSynchronize());
 487         }
 488
 489
 490         void split_caller(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
 491         {
 492             static SplitFunction split_func_tbl[] =
 493             {
 494                 splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
 495                 splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
 496                 splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
 497             };
 498
 499             size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
 500             SplitFunction split_func = split_func_tbl[split_func_id];
 501
 502             if (split_func == 0)
 503                 cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__, "split_caller");
 504
 505             split_func(src, dst, stream);
 506         }
 507     } // namespace split_merge
 508 }}} // namespace cv { namespace gpu { namespace device
 509
 510
 511 #endif /* CUDA_DISABLER */