/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER

#include "internal_shared.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace split_merge
    {
        // Maps a channel type T (by its size) onto the matching CUDA vector types.
        template <typename T, size_t elem_size = sizeof(T)>
        struct TypeTraits { typedef T type; typedef T type2; typedef T type3; typedef T type4; };

        template <typename T> struct TypeTraits<T, 1> { typedef char  type; typedef char2  type2; typedef char3  type3; typedef char4  type4; };
        template <typename T> struct TypeTraits<T, 2> { typedef short type; typedef short2 type2; typedef short3 type3; typedef short4 type4; };
        template <typename T> struct TypeTraits<T, 4> { typedef int   type; typedef int2   type2; typedef int3   type3; typedef int4   type4; };
        template <typename T> struct TypeTraits<T, 8>
        {
            typedef double  type;
            typedef double2 type2;
            // double3/double4 are omitted here; 3- and 4-channel double images are
            // handled by the explicit kernel specializations below.
        };
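
        // Host-side launcher signatures stored in the dispatch tables used by
        // merge_caller() and split_caller() below.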
        typedef void (*MergeFunction)(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream);
        typedef void (*SplitFunction)(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream);

        //------------------------------------------------------------
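        // Merge: interleave 2, 3 or 4 single-channel planes into one multi-channel
        // image. One thread handles one output pixel.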

        template <typename T>
        __global__ void mergeC2_(const uchar* src0, size_t src0_step,
                                 const uchar* src1, size_t src1_step,
                                 int rows, int cols, uchar* dst, size_t dst_step)
        {
            typedef typename TypeTraits<T>::type2 dst_type;

            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            const T* src0_y = (const T*)(src0 + y * src0_step);
            const T* src1_y = (const T*)(src1 + y * src1_step);
            dst_type* dst_y = (dst_type*)(dst + y * dst_step);

            if (x < cols && y < rows)
            {
                dst_type dst_elem;
                dst_elem.x = src0_y[x];
                dst_elem.y = src1_y[x];
                dst_y[x] = dst_elem;
            }
        }

        template <typename T>
        __global__ void mergeC3_(const uchar* src0, size_t src0_step,
                                 const uchar* src1, size_t src1_step,
                                 const uchar* src2, size_t src2_step,
                                 int rows, int cols, uchar* dst, size_t dst_step)
        {
            typedef typename TypeTraits<T>::type3 dst_type;

            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            const T* src0_y = (const T*)(src0 + y * src0_step);
            const T* src1_y = (const T*)(src1 + y * src1_step);
            const T* src2_y = (const T*)(src2 + y * src2_step);
            dst_type* dst_y = (dst_type*)(dst + y * dst_step);

            if (x < cols && y < rows)
            {
                dst_type dst_elem;
                dst_elem.x = src0_y[x];
                dst_elem.y = src1_y[x];
                dst_elem.z = src2_y[x];
                dst_y[x] = dst_elem;
            }
        }
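
        // For CV_64F there is no 3-lane vector type in TypeTraits, so the double
        // specialization writes the three channels as consecutive scalar doubles.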
        template <>
        __global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
                                         const uchar* src1, size_t src1_step,
                                         const uchar* src2, size_t src2_step,
                                         int rows, int cols, uchar* dst, size_t dst_step)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            const double* src0_y = (const double*)(src0 + y * src0_step);
            const double* src1_y = (const double*)(src1 + y * src1_step);
            const double* src2_y = (const double*)(src2 + y * src2_step);
            double* dst_y = (double*)(dst + y * dst_step);

            if (x < cols && y < rows)
            {
                dst_y[3 * x]     = src0_y[x];
                dst_y[3 * x + 1] = src1_y[x];
                dst_y[3 * x + 2] = src2_y[x];
            }
        }

        template <typename T>
        __global__ void mergeC4_(const uchar* src0, size_t src0_step,
                                 const uchar* src1, size_t src1_step,
                                 const uchar* src2, size_t src2_step,
                                 const uchar* src3, size_t src3_step,
                                 int rows, int cols, uchar* dst, size_t dst_step)
        {
            typedef typename TypeTraits<T>::type4 dst_type;

            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            const T* src0_y = (const T*)(src0 + y * src0_step);
            const T* src1_y = (const T*)(src1 + y * src1_step);
            const T* src2_y = (const T*)(src2 + y * src2_step);
            const T* src3_y = (const T*)(src3 + y * src3_step);
            dst_type* dst_y = (dst_type*)(dst + y * dst_step);

            if (x < cols && y < rows)
            {
                dst_type dst_elem;
                dst_elem.x = src0_y[x];
                dst_elem.y = src1_y[x];
                dst_elem.z = src2_y[x];
                dst_elem.w = src3_y[x];
                dst_y[x] = dst_elem;
            }
        }

        template <>
        __global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
                                         const uchar* src1, size_t src1_step,
                                         const uchar* src2, size_t src2_step,
                                         const uchar* src3, size_t src3_step,
                                         int rows, int cols, uchar* dst, size_t dst_step)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            const double* src0_y = (const double*)(src0 + y * src0_step);
            const double* src1_y = (const double*)(src1 + y * src1_step);
            const double* src2_y = (const double*)(src2 + y * src2_step);
            const double* src3_y = (const double*)(src3 + y * src3_step);
            double2* dst_y = (double2*)(dst + y * dst_step);

            if (x < cols && y < rows)
            {
                // A 4-channel double pixel is stored as two adjacent double2 values.
                dst_y[2 * x]     = make_double2(src0_y[x], src1_y[x]);
                dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
            }
        }
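
        // Host-side launchers: compute the launch configuration, run the kernel on
        // the given stream, and synchronize only when the default (blocking) stream
        // is used.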
        template <typename T>
        static void mergeC2_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
            mergeC2_<T><<<grid, block, 0, stream>>>(
                    src[0].data, src[0].step,
                    src[1].data, src[1].step,
                    dst.rows, dst.cols, dst.data, dst.step);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall(cudaDeviceSynchronize());
        }

        template <typename T>
        static void mergeC3_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
            mergeC3_<T><<<grid, block, 0, stream>>>(
                    src[0].data, src[0].step,
                    src[1].data, src[1].step,
                    src[2].data, src[2].step,
                    dst.rows, dst.cols, dst.data, dst.step);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall(cudaDeviceSynchronize());
        }

        template <typename T>
        static void mergeC4_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
            mergeC4_<T><<<grid, block, 0, stream>>>(
                    src[0].data, src[0].step,
                    src[1].data, src[1].step,
                    src[2].data, src[2].step,
                    src[3].data, src[3].step,
                    dst.rows, dst.cols, dst.data, dst.step);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall(cudaDeviceSynchronize());
        }
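
        // Dispatch table: one row per channel count (2, 3, 4), indexed within the row
        // by (elem_size >> 1), which maps element sizes 1/2/4/8 to columns 0/1/2/4
        // (column 3 is an unused null entry). E.g. 3 channels of 32-bit data gives
        // (3 - 2) * 5 + (4 >> 1) = 7, selecting mergeC3_<int>.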
        void merge_caller(const PtrStepSzb* src, PtrStepSzb& dst,
                          int total_channels, size_t elem_size,
                          const cudaStream_t& stream)
        {
            static MergeFunction merge_func_tbl[] =
            {
                mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
                mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
                mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
            };

            size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
            MergeFunction merge_func = merge_func_tbl[merge_func_id];

            if (merge_func == 0)
                cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__, "merge_caller");

            merge_func(src, dst, stream);
        }

        //------------------------------------------------------------
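        // Split: the inverse of merge, scattering each channel of a multi-channel
        // image into its own single-channel plane.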

        template <typename T>
        __global__ void splitC2_(const uchar* src, size_t src_step,
                                 int rows, int cols,
                                 uchar* dst0, size_t dst0_step,
                                 uchar* dst1, size_t dst1_step)
        {
            typedef typename TypeTraits<T>::type2 src_type;

            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            const src_type* src_y = (const src_type*)(src + y * src_step);
            T* dst0_y = (T*)(dst0 + y * dst0_step);
            T* dst1_y = (T*)(dst1 + y * dst1_step);

            if (x < cols && y < rows)
            {
                src_type src_elem = src_y[x];
                dst0_y[x] = src_elem.x;
                dst1_y[x] = src_elem.y;
            }
        }

        template <typename T>
        __global__ void splitC3_(const uchar* src, size_t src_step,
                                 int rows, int cols,
                                 uchar* dst0, size_t dst0_step,
                                 uchar* dst1, size_t dst1_step,
                                 uchar* dst2, size_t dst2_step)
        {
            typedef typename TypeTraits<T>::type3 src_type;

            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            const src_type* src_y = (const src_type*)(src + y * src_step);
            T* dst0_y = (T*)(dst0 + y * dst0_step);
            T* dst1_y = (T*)(dst1 + y * dst1_step);
            T* dst2_y = (T*)(dst2 + y * dst2_step);

            if (x < cols && y < rows)
            {
                src_type src_elem = src_y[x];
                dst0_y[x] = src_elem.x;
                dst1_y[x] = src_elem.y;
                dst2_y[x] = src_elem.z;
            }
        }
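
        // As with merge, 3- and 4-channel double images are handled by explicit
        // specializations that read the packed doubles lane by lane.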
        template <>
        __global__ void splitC3_<double>(
                const uchar* src, size_t src_step, int rows, int cols,
                uchar* dst0, size_t dst0_step,
                uchar* dst1, size_t dst1_step,
                uchar* dst2, size_t dst2_step)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            const double* src_y = (const double*)(src + y * src_step);
            double* dst0_y = (double*)(dst0 + y * dst0_step);
            double* dst1_y = (double*)(dst1 + y * dst1_step);
            double* dst2_y = (double*)(dst2 + y * dst2_step);

            if (x < cols && y < rows)
            {
                dst0_y[x] = src_y[3 * x];
                dst1_y[x] = src_y[3 * x + 1];
                dst2_y[x] = src_y[3 * x + 2];
            }
        }

        template <typename T>
        __global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
                                 uchar* dst0, size_t dst0_step,
                                 uchar* dst1, size_t dst1_step,
                                 uchar* dst2, size_t dst2_step,
                                 uchar* dst3, size_t dst3_step)
        {
            typedef typename TypeTraits<T>::type4 src_type;

            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            const src_type* src_y = (const src_type*)(src + y * src_step);
            T* dst0_y = (T*)(dst0 + y * dst0_step);
            T* dst1_y = (T*)(dst1 + y * dst1_step);
            T* dst2_y = (T*)(dst2 + y * dst2_step);
            T* dst3_y = (T*)(dst3 + y * dst3_step);

            if (x < cols && y < rows)
            {
                src_type src_elem = src_y[x];
                dst0_y[x] = src_elem.x;
                dst1_y[x] = src_elem.y;
                dst2_y[x] = src_elem.z;
                dst3_y[x] = src_elem.w;
            }
        }

        template <>
        __global__ void splitC4_<double>(
                const uchar* src, size_t src_step, int rows, int cols,
                uchar* dst0, size_t dst0_step,
                uchar* dst1, size_t dst1_step,
                uchar* dst2, size_t dst2_step,
                uchar* dst3, size_t dst3_step)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            const double2* src_y = (const double2*)(src + y * src_step);
            double* dst0_y = (double*)(dst0 + y * dst0_step);
            double* dst1_y = (double*)(dst1 + y * dst1_step);
            double* dst2_y = (double*)(dst2 + y * dst2_step);
            double* dst3_y = (double*)(dst3 + y * dst3_step);

            if (x < cols && y < rows)
            {
                double2 src_elem1 = src_y[2 * x];
                double2 src_elem2 = src_y[2 * x + 1];
                dst0_y[x] = src_elem1.x;
                dst1_y[x] = src_elem1.y;
                dst2_y[x] = src_elem2.x;
                dst3_y[x] = src_elem2.y;
            }
        }
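
        // Host-side launchers for the split kernels; same launch configuration and
        // synchronization policy as the merge launchers above.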
        template <typename T>
        static void splitC2_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
            splitC2_<T><<<grid, block, 0, stream>>>(
                    src.data, src.step, src.rows, src.cols,
                    dst[0].data, dst[0].step,
                    dst[1].data, dst[1].step);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall(cudaDeviceSynchronize());
        }

        template <typename T>
        static void splitC3_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
            splitC3_<T><<<grid, block, 0, stream>>>(
                    src.data, src.step, src.rows, src.cols,
                    dst[0].data, dst[0].step,
                    dst[1].data, dst[1].step,
                    dst[2].data, dst[2].step);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall(cudaDeviceSynchronize());
        }

        template <typename T>
        static void splitC4_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
            splitC4_<T><<<grid, block, 0, stream>>>(
                    src.data, src.step, src.rows, src.cols,
                    dst[0].data, dst[0].step,
                    dst[1].data, dst[1].step,
                    dst[2].data, dst[2].step,
                    dst[3].data, dst[3].step);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall(cudaDeviceSynchronize());
        }
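
        // split_caller mirrors merge_caller: the same 3x5 table layout, indexed by
        // channel count and per-channel element size.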
        void split_caller(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
        {
            static SplitFunction split_func_tbl[] =
            {
                splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
                splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
                splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
            };

            size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
            SplitFunction split_func = split_func_tbl[split_func_id];

            if (split_func == 0)
                cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__, "split_caller");

            split_func(src, dst, stream);
        }
    } // namespace split_merge
}}} // namespace cv { namespace gpu { namespace device

#endif /* CUDA_DISABLER */