--- /dev/null
+#include "opencv2/gpu/devmem2d.hpp"\r
+#include "cuda_shared.hpp"\r
+\r
+namespace cv { namespace gpu { namespace split_merge {\r
+\r
+ template <typename T, size_t elem_size = sizeof(T)>\r
+ struct TypeTraits \r
+ {\r
+ typedef T type;\r
+ typedef T type2;\r
+ typedef T type3;\r
+ typedef T type4;\r
+ };\r
+\r
+ template <typename T>\r
+ struct TypeTraits<T, 1>\r
+ {\r
+ typedef char type;\r
+ typedef char2 type2;\r
+ typedef char3 type3;\r
+ typedef char4 type4;\r
+ };\r
+\r
+ template <typename T>\r
+ struct TypeTraits<T, 2>\r
+ {\r
+ typedef short type;\r
+ typedef short2 type2;\r
+ typedef short3 type3;\r
+ typedef short4 type4;\r
+ };\r
+\r
+ template <typename T>\r
+ struct TypeTraits<T, 4> \r
+ {\r
+ typedef int type;\r
+ typedef int2 type2;\r
+ typedef int3 type3;\r
+ typedef int4 type4;\r
+ };\r
+\r
+ template <typename T>\r
+ struct TypeTraits<T, 8> \r
+ {\r
+ typedef double type;\r
+ typedef double2 type2;\r
+ //typedef double3 type3;\r
+ //typedef double4 type3;\r
+ };\r
+\r
+ typedef void (*MergeFunction)(const DevMem2D* src, DevMem2D& dst, const cudaStream_t& stream);\r
+ typedef void (*SplitFunction)(const DevMem2D& src, DevMem2D* dst, const cudaStream_t& stream);\r
+\r
+ //------------------------------------------------------------\r
+ // Merge\r
+\r
+ template <typename T> \r
+ static void mergeC2_(const DevMem2D* src, DevMem2D& dst, const cudaStream_t& stream)\r
+ {\r
+ dim3 blockDim(32, 8);\r
+ dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
+ mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ src[0].ptr, src[0].step, \r
+ src[1].ptr, src[1].step,\r
+ dst.rows, dst.cols, dst.ptr, dst.step);\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaThreadSynchronize());\r
+ }\r
+\r
+\r
+ template <typename T> \r
+ static void mergeC3_(const DevMem2D* src, DevMem2D& dst, const cudaStream_t& stream)\r
+ {\r
+ dim3 blockDim(32, 8);\r
+ dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
+ mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ src[0].ptr, src[0].step, \r
+ src[1].ptr, src[1].step,\r
+ src[2].ptr, src[2].step,\r
+ dst.rows, dst.cols, dst.ptr, dst.step);\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaThreadSynchronize());\r
+ }\r
+\r
+\r
+ template <typename T> \r
+ static void mergeC4_(const DevMem2D* src, DevMem2D& dst, const cudaStream_t& stream)\r
+ {\r
+ dim3 blockDim(32, 8);\r
+ dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
+ mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ src[0].ptr, src[0].step, \r
+ src[1].ptr, src[1].step,\r
+ src[2].ptr, src[2].step,\r
+ src[3].ptr, src[3].step,\r
+ dst.rows, dst.cols, dst.ptr, dst.step);\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaThreadSynchronize());\r
+ }\r
+\r
+\r
+ extern "C" void merge_caller(const DevMem2D* src, DevMem2D& dst, \r
+ int total_channels, int elem_size, \r
+ const cudaStream_t& stream) \r
+ {\r
+ static MergeFunction merge_func_tbl[] = \r
+ {\r
+ mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,\r
+ mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,\r
+ mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,\r
+ };\r
+\r
+ int merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);\r
+ MergeFunction merge_func = merge_func_tbl[merge_func_id];\r
+\r
+ if (merge_func == 0)\r
+ cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);\r
+\r
+ merge_func(src, dst, stream);\r
+ }\r
+\r
+\r
+ template <typename T>\r
+ __global__ void mergeC2_(const uchar* src0, size_t src0_step, \r
+ const uchar* src1, size_t src1_step, \r
+ int rows, int cols, uchar* dst, size_t dst_step)\r
+ {\r
+ typedef typename TypeTraits<T>::type2 dst_type;\r
+\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const T* src0_y = (const T*)(src0 + y * src0_step);\r
+ const T* src1_y = (const T*)(src1 + y * src1_step);\r
+ dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
+\r
+ if (x < cols && y < rows) \r
+ { \r
+ dst_type dst_elem;\r
+ dst_elem.x = src0_y[x];\r
+ dst_elem.y = src1_y[x];\r
+ dst_y[x] = dst_elem;\r
+ }\r
+ }\r
+\r
+\r
+ template <typename T>\r
+ __global__ void mergeC3_(const uchar* src0, size_t src0_step, \r
+ const uchar* src1, size_t src1_step, \r
+ const uchar* src2, size_t src2_step, \r
+ int rows, int cols, uchar* dst, size_t dst_step)\r
+ {\r
+ typedef typename TypeTraits<T>::type3 dst_type;\r
+\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const T* src0_y = (const T*)(src0 + y * src0_step);\r
+ const T* src1_y = (const T*)(src1 + y * src1_step);\r
+ const T* src2_y = (const T*)(src2 + y * src2_step);\r
+ dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
+\r
+ if (x < cols && y < rows) \r
+ { \r
+ dst_type dst_elem;\r
+ dst_elem.x = src0_y[x];\r
+ dst_elem.y = src1_y[x];\r
+ dst_elem.z = src2_y[x];\r
+ dst_y[x] = dst_elem;\r
+ }\r
+ }\r
+\r
+\r
+ template <>\r
+ __global__ void mergeC3_<double>(const uchar* src0, size_t src0_step, \r
+ const uchar* src1, size_t src1_step, \r
+ const uchar* src2, size_t src2_step, \r
+ int rows, int cols, uchar* dst, size_t dst_step)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const double* src0_y = (const double*)(src0 + y * src0_step);\r
+ const double* src1_y = (const double*)(src1 + y * src1_step);\r
+ const double* src2_y = (const double*)(src2 + y * src2_step);\r
+ double* dst_y = (double*)(dst + y * dst_step);\r
+\r
+ if (x < cols && y < rows) \r
+ { \r
+ dst_y[3 * x] = src0_y[x];\r
+ dst_y[3 * x + 1] = src1_y[x];\r
+ dst_y[3 * x + 2] = src2_y[x];\r
+ }\r
+ }\r
+\r
+\r
+ template <typename T>\r
+ __global__ void mergeC4_(const uchar* src0, size_t src0_step, \r
+ const uchar* src1, size_t src1_step, \r
+ const uchar* src2, size_t src2_step, \r
+ const uchar* src3, size_t src3_step, \r
+ int rows, int cols, uchar* dst, size_t dst_step)\r
+ {\r
+ typedef typename TypeTraits<T>::type4 dst_type;\r
+\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const T* src0_y = (const T*)(src0 + y * src0_step);\r
+ const T* src1_y = (const T*)(src1 + y * src1_step);\r
+ const T* src2_y = (const T*)(src2 + y * src2_step);\r
+ const T* src3_y = (const T*)(src3 + y * src3_step);\r
+ dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
+\r
+ if (x < cols && y < rows) \r
+ { \r
+ dst_type dst_elem;\r
+ dst_elem.x = src0_y[x];\r
+ dst_elem.y = src1_y[x];\r
+ dst_elem.z = src2_y[x];\r
+ dst_elem.w = src3_y[x];\r
+ dst_y[x] = dst_elem;\r
+ }\r
+ }\r
+\r
+\r
+ template <>\r
+ __global__ void mergeC4_<double>(const uchar* src0, size_t src0_step, \r
+ const uchar* src1, size_t src1_step, \r
+ const uchar* src2, size_t src2_step, \r
+ const uchar* src3, size_t src3_step, \r
+ int rows, int cols, uchar* dst, size_t dst_step)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const double* src0_y = (const double*)(src0 + y * src0_step);\r
+ const double* src1_y = (const double*)(src1 + y * src1_step);\r
+ const double* src2_y = (const double*)(src2 + y * src2_step);\r
+ const double* src3_y = (const double*)(src3 + y * src3_step);\r
+ double2* dst_y = (double2*)(dst + y * dst_step);\r
+\r
+ if (x < cols && y < rows) \r
+ { \r
+ dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);\r
+ dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);\r
+ }\r
+ }\r
+\r
+ //------------------------------------------------------------\r
+ // Split\r
+\r
+\r
+ template <typename T> \r
+ static void splitC2_(const DevMem2D& src, DevMem2D* dst, const cudaStream_t& stream)\r
+ {\r
+ dim3 blockDim(32, 8);\r
+ dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
+ splitC2_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ src.ptr, src.step, src.rows, src.cols,\r
+ dst[0].ptr, dst[0].step, \r
+ dst[1].ptr, dst[1].step);\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaThreadSynchronize());\r
+ }\r
+\r
+\r
+ template <typename T> \r
+ static void splitC3_(const DevMem2D& src, DevMem2D* dst, const cudaStream_t& stream)\r
+ {\r
+ dim3 blockDim(32, 8);\r
+ dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
+ splitC3_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ src.ptr, src.step, src.rows, src.cols,\r
+ dst[0].ptr, dst[0].step, \r
+ dst[1].ptr, dst[1].step,\r
+ dst[2].ptr, dst[2].step); \r
+ if (stream == 0)\r
+ cudaSafeCall(cudaThreadSynchronize());\r
+ }\r
+\r
+\r
+ template <typename T> \r
+ static void splitC4_(const DevMem2D& src, DevMem2D* dst, const cudaStream_t& stream)\r
+ {\r
+ dim3 blockDim(32, 8);\r
+ dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
+ splitC4_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ src.ptr, src.step, src.rows, src.cols,\r
+ dst[0].ptr, dst[0].step, \r
+ dst[1].ptr, dst[1].step,\r
+ dst[2].ptr, dst[2].step,\r
+ dst[3].ptr, dst[3].step);\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaThreadSynchronize());\r
+ }\r
+\r
+\r
+ extern "C" void split_caller(const DevMem2D& src, DevMem2D* dst, \r
+ int num_channels, int elem_size1, \r
+ const cudaStream_t& stream) \r
+ {\r
+ static SplitFunction split_func_tbl[] = \r
+ {\r
+ splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,\r
+ splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,\r
+ splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,\r
+ };\r
+\r
+ int split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);\r
+ SplitFunction split_func = split_func_tbl[split_func_id];\r
+\r
+ if (split_func == 0)\r
+ cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);\r
+\r
+ split_func(src, dst, stream);\r
+ }\r
+\r
+\r
+ template <typename T>\r
+ __global__ void splitC2_(const uchar* src, size_t src_step, \r
+ int rows, int cols,\r
+ uchar* dst0, size_t dst0_step,\r
+ uchar* dst1, size_t dst1_step)\r
+ {\r
+ typedef typename TypeTraits<T>::type2 src_type;\r
+\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const src_type* src_y = (const src_type*)(src + y * src_step);\r
+ T* dst0_y = (T*)(dst0 + y * dst0_step);\r
+ T* dst1_y = (T*)(dst1 + y * dst1_step);\r
+\r
+ if (x < cols && y < rows) \r
+ {\r
+ src_type src_elem = src_y[x];\r
+ dst0_y[x] = src_elem.x;\r
+ dst1_y[x] = src_elem.y;\r
+ }\r
+ }\r
+\r
+\r
+ template <typename T>\r
+ __global__ void splitC3_(const uchar* src, size_t src_step, \r
+ int rows, int cols,\r
+ uchar* dst0, size_t dst0_step,\r
+ uchar* dst1, size_t dst1_step,\r
+ uchar* dst2, size_t dst2_step)\r
+ {\r
+ typedef typename TypeTraits<T>::type3 src_type;\r
+\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const src_type* src_y = (const src_type*)(src + y * src_step);\r
+ T* dst0_y = (T*)(dst0 + y * dst0_step);\r
+ T* dst1_y = (T*)(dst1 + y * dst1_step);\r
+ T* dst2_y = (T*)(dst2 + y * dst2_step);\r
+\r
+ if (x < cols && y < rows) \r
+ {\r
+ src_type src_elem = src_y[x];\r
+ dst0_y[x] = src_elem.x;\r
+ dst1_y[x] = src_elem.y;\r
+ dst2_y[x] = src_elem.z;\r
+ }\r
+ }\r
+\r
+\r
+ template <>\r
+ __global__ void splitC3_<double>(\r
+ const uchar* src, size_t src_step, int rows, int cols,\r
+ uchar* dst0, size_t dst0_step,\r
+ uchar* dst1, size_t dst1_step,\r
+ uchar* dst2, size_t dst2_step)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const double* src_y = (const double*)(src + y * src_step);\r
+ double* dst0_y = (double*)(dst0 + y * dst0_step);\r
+ double* dst1_y = (double*)(dst1 + y * dst1_step);\r
+ double* dst2_y = (double*)(dst2 + y * dst2_step);\r
+\r
+ if (x < cols && y < rows) \r
+ {\r
+ dst0_y[x] = src_y[3 * x];\r
+ dst1_y[x] = src_y[3 * x + 1];\r
+ dst2_y[x] = src_y[3 * x + 2];\r
+ }\r
+ }\r
+\r
+\r
+ template <typename T>\r
+ __global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,\r
+ uchar* dst0, size_t dst0_step,\r
+ uchar* dst1, size_t dst1_step,\r
+ uchar* dst2, size_t dst2_step,\r
+ uchar* dst3, size_t dst3_step)\r
+ {\r
+ typedef typename TypeTraits<T>::type4 src_type;\r
+\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const src_type* src_y = (const src_type*)(src + y * src_step);\r
+ T* dst0_y = (T*)(dst0 + y * dst0_step);\r
+ T* dst1_y = (T*)(dst1 + y * dst1_step);\r
+ T* dst2_y = (T*)(dst2 + y * dst2_step);\r
+ T* dst3_y = (T*)(dst3 + y * dst3_step);\r
+\r
+ if (x < cols && y < rows) \r
+ {\r
+ src_type src_elem = src_y[x];\r
+ dst0_y[x] = src_elem.x;\r
+ dst1_y[x] = src_elem.y;\r
+ dst2_y[x] = src_elem.z;\r
+ dst3_y[x] = src_elem.w;\r
+ }\r
+ }\r
+\r
+\r
+ template <>\r
+ __global__ void splitC4_<double>(\r
+ const uchar* src, size_t src_step, int rows, int cols,\r
+ uchar* dst0, size_t dst0_step,\r
+ uchar* dst1, size_t dst1_step,\r
+ uchar* dst2, size_t dst2_step,\r
+ uchar* dst3, size_t dst3_step)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const double2* src_y = (const double2*)(src + y * src_step);\r
+ double* dst0_y = (double*)(dst0 + y * dst0_step);\r
+ double* dst1_y = (double*)(dst1 + y * dst1_step);\r
+ double* dst2_y = (double*)(dst2 + y * dst2_step);\r
+ double* dst3_y = (double*)(dst3 + y * dst3_step);\r
+\r
+ if (x < cols && y < rows) \r
+ {\r
+ double2 src_elem1 = src_y[2 * x];\r
+ double2 src_elem2 = src_y[2 * x + 1];\r
+ dst0_y[x] = src_elem1.x;\r
+ dst1_y[x] = src_elem1.y;\r
+ dst2_y[x] = src_elem2.x;\r
+ dst3_y[x] = src_elem2.y;\r
+ }\r
+ }\r
+\r
+}}} // namespace cv::gpu::split_merge
\ No newline at end of file
--- /dev/null
+#include "precomp.hpp"\r
+#include <vector>\r
+\r
+using namespace std;\r
+\r
+#if !defined (HAVE_CUDA)\r
+\r
+void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/) { throw_nogpu(); }\r
+void cv::gpu::merge(const vector<GpuMat>& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }\r
+void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }\r
+void cv::gpu::merge(const vector<GpuMat>& /*src*/, GpuMat& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }\r
+void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/) { throw_nogpu(); }\r
+void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/) { throw_nogpu(); }\r
+void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }\r
+void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }\r
+\r
+#else /* !defined (HAVE_CUDA) */\r
+\r
+namespace cv { namespace gpu { namespace split_merge \r
+{ \r
+ extern "C" void merge_caller(const DevMem2D* src, DevMem2D& dst, \r
+ int total_channels, int elem_size, \r
+ const cudaStream_t& stream);\r
+\r
+ extern "C" void split_caller(const DevMem2D& src, DevMem2D* dst, \r
+ int num_channels, int elem_size1, \r
+ const cudaStream_t& stream);\r
+\r
+ void merge(const GpuMat* src, size_t n, GpuMat& dst, const cudaStream_t& stream) \r
+ {\r
+ CV_Assert(src);\r
+ CV_Assert(n > 0);\r
+\r
+ int depth = src[0].depth();\r
+ Size size = src[0].size();\r
+\r
+ bool single_channel_only = true;\r
+ int total_channels = 0;\r
+\r
+ for (size_t i = 0; i < n; ++i)\r
+ {\r
+ CV_Assert(src[i].size() == size);\r
+ CV_Assert(src[i].depth() == depth);\r
+ single_channel_only = single_channel_only && src[i].channels() == 1;\r
+ total_channels += src[i].channels();\r
+ }\r
+\r
+ CV_Assert(single_channel_only);\r
+ CV_Assert(total_channels <= 4);\r
+\r
+ if (total_channels == 1) \r
+ src[0].copyTo(dst);\r
+ else \r
+ {\r
+ dst.create(size, CV_MAKETYPE(depth, total_channels));\r
+\r
+ DevMem2D src_as_devmem[4];\r
+ for(size_t i = 0; i < n; ++i)\r
+ src_as_devmem[i] = src[i];\r
+\r
+ split_merge::merge_caller(src_as_devmem, (DevMem2D)dst, \r
+ total_channels, CV_ELEM_SIZE(depth), \r
+ stream);\r
+ } \r
+ }\r
+\r
+\r
+ void split(const GpuMat& src, GpuMat* dst, const cudaStream_t& stream) \r
+ {\r
+ CV_Assert(dst);\r
+\r
+ int depth = src.depth();\r
+ int num_channels = src.channels();\r
+ Size size = src.size();\r
+\r
+ if (num_channels == 1)\r
+ {\r
+ src.copyTo(dst[0]);\r
+ return;\r
+ }\r
+\r
+ for (int i = 0; i < num_channels; ++i)\r
+ dst[i].create(src.size(), depth);\r
+\r
+ CV_Assert(num_channels <= 4);\r
+\r
+ DevMem2D dst_as_devmem[4];\r
+ for (int i = 0; i < num_channels; ++i)\r
+ dst_as_devmem[i] = dst[i];\r
+\r
+ split_merge::split_caller((DevMem2D)src, dst_as_devmem, \r
+ num_channels, src.elemSize1(), \r
+ stream);\r
+ }\r
+\r
+\r
+}}}\r
+\r
+\r
+void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst) \r
+{ \r
+ split_merge::merge(src, n, dst, 0);\r
+}\r
+\r
+\r
+void cv::gpu::merge(const vector<GpuMat>& src, GpuMat& dst) \r
+{\r
+ split_merge::merge(&src[0], src.size(), dst, 0);\r
+}\r
+\r
+\r
+void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, const Stream& stream) \r
+{ \r
+ split_merge::merge(src, n, dst, StreamAccessor::getStream(stream));\r
+}\r
+\r
+\r
+void cv::gpu::merge(const vector<GpuMat>& src, GpuMat& dst, const Stream& stream) \r
+{\r
+ split_merge::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream));\r
+}\r
+\r
+\r
+void cv::gpu::split(const GpuMat& src, GpuMat* dst) \r
+{\r
+ split_merge::split(src, dst, 0);\r
+}\r
+\r
+\r
+void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst) \r
+{\r
+ dst.resize(src.channels());\r
+ if(src.channels() > 0)\r
+ split_merge::split(src, &dst[0], 0);\r
+}\r
+\r
+\r
+void cv::gpu::split(const GpuMat& src, GpuMat* dst, const Stream& stream) \r
+{\r
+ split_merge::split(src, dst, StreamAccessor::getStream(stream));\r
+}\r
+\r
+\r
+void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst, const Stream& stream) \r
+{\r
+ dst.resize(src.channels());\r
+ if(src.channels() > 0)\r
+ split_merge::split(src, &dst[0], StreamAccessor::getStream(stream));\r
+}\r
+\r
+#endif /* !defined (HAVE_CUDA) */
\ No newline at end of file
--- /dev/null
+#include "gputest.hpp"\r
+#include <opencv2/opencv.hpp>\r
+#include <opencv2/gpu/gpu.hpp>\r
+\r
+#include <iostream>\r
+#include <string>\r
+#include <vector>\r
+\r
+using namespace std;\r
+using namespace cv;\r
+\r
+struct CV_MergeTest : public CvTest\r
+{\r
+ CV_MergeTest() : CvTest("GPU-Merge", "merge") {}\r
+ void can_merge(size_t rows, size_t cols);\r
+ void can_merge_submatrixes(size_t rows, size_t cols);\r
+ void run(int);\r
+} merge_test;\r
+\r
+\r
+void CV_MergeTest::can_merge(size_t rows, size_t cols)\r
+{\r
+ for (size_t num_channels = 1; num_channels <= 4; ++num_channels)\r
+ for (size_t depth = CV_8U; depth <= CV_64F; ++depth)\r
+ {\r
+ vector<Mat> src;\r
+ for (size_t i = 0; i < num_channels; ++i)\r
+ src.push_back(Mat(rows, cols, depth, Scalar::all(static_cast<double>(i))));\r
+ \r
+ Mat dst(rows, cols, CV_MAKETYPE(depth, num_channels)); \r
+\r
+ cv::merge(src, dst); \r
+\r
+ vector<gpu::GpuMat> dev_src;\r
+ for (size_t i = 0; i < num_channels; ++i)\r
+ dev_src.push_back(gpu::GpuMat(src[i]));\r
+\r
+ gpu::GpuMat dev_dst(rows, cols, CV_MAKETYPE(depth, num_channels));\r
+ cv::gpu::merge(dev_src, dev_dst); \r
+\r
+ Mat host_dst = dev_dst;\r
+\r
+ double err = norm(dst, host_dst, NORM_INF);
+
+ if (err > 1e-3)
+ {
+ //ts->printf(CvTS::CONSOLE, "\nNorm: %f\n", err);
+ //ts->printf(CvTS::CONSOLE, "Depth: %d\n", depth);
+ //ts->printf(CvTS::CONSOLE, "Rows: %d\n", rows);
+ //ts->printf(CvTS::CONSOLE, "Cols: %d\n", cols);
+ //ts->printf(CvTS::CONSOLE, "NumChannels: %d\n", num_channels);
+ ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+ return;
+ }\r
+ }\r
+}\r
+\r
+\r
+void CV_MergeTest::can_merge_submatrixes(size_t rows, size_t cols)\r
+{\r
+ for (size_t num_channels = 1; num_channels <= 4; ++num_channels)\r
+ for (size_t depth = CV_8U; depth <= CV_64F; ++depth)\r
+ {\r
+ vector<Mat> src;\r
+ for (size_t i = 0; i < num_channels; ++i) \r
+ {\r
+ Mat m(rows * 2, cols * 2, depth, Scalar::all(static_cast<double>(i)));\r
+ src.push_back(m(Range(rows / 2, rows / 2 + rows), Range(cols / 2, cols / 2 + cols)));\r
+ }\r
+\r
+ Mat dst(rows, cols, CV_MAKETYPE(depth, num_channels)); \r
+\r
+ cv::merge(src, dst); \r
+\r
+ vector<gpu::GpuMat> dev_src;\r
+ for (size_t i = 0; i < num_channels; ++i)\r
+ dev_src.push_back(gpu::GpuMat(src[i]));\r
+\r
+ gpu::GpuMat dev_dst(rows, cols, CV_MAKETYPE(depth, num_channels));\r
+ cv::gpu::merge(dev_src, dev_dst);\r
+\r
+ Mat host_dst = dev_dst;\r
+\r
+ double err = norm(dst, host_dst, NORM_INF);
+
+ if (err > 1e-3)
+ {
+ //ts->printf(CvTS::CONSOLE, "\nNorm: %f\n", err);
+ //ts->printf(CvTS::CONSOLE, "Depth: %d\n", depth);
+ //ts->printf(CvTS::CONSOLE, "Rows: %d\n", rows);
+ //ts->printf(CvTS::CONSOLE, "Cols: %d\n", cols);
+ //ts->printf(CvTS::CONSOLE, "NumChannels: %d\n", num_channels);
+ ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+ return;
+ }\r
+ }\r
+}\r
+\r
+\r
+void CV_MergeTest::run(int) \r
+{\r
+ try\r
+ {\r
+ can_merge(1, 1);\r
+ can_merge(1, 7);\r
+ can_merge(53, 7);\r
+ can_merge_submatrixes(1, 1);\r
+ can_merge_submatrixes(1, 7);\r
+ can_merge_submatrixes(53, 7);\r
+ }\r
+ catch(const cv::Exception& e)
+ {
+ if (!check_and_treat_gpu_exception(e, ts))
+ throw;
+ } \r
+}\r
+\r
+\r
+struct CV_SplitTest : public CvTest\r
+{\r
+ CV_SplitTest() : CvTest("GPU-Split", "split") {}\r
+ void can_split(size_t rows, size_t cols); \r
+ void can_split_submatrix(size_t rows, size_t cols);\r
+ void run(int);\r
+} split_test;\r
+\r
+\r
+void CV_SplitTest::can_split(size_t rows, size_t cols)\r
+{\r
+ for (size_t num_channels = 1; num_channels <= 4; ++num_channels)\r
+ for (size_t depth = CV_8U; depth <= CV_64F; ++depth)\r
+ {\r
+ Mat src(rows, cols, CV_MAKETYPE(depth, num_channels), Scalar(1.0, 2.0, 3.0, 4.0)); \r
+ vector<Mat> dst;\r
+ cv::split(src, dst); \r
+\r
+ gpu::GpuMat dev_src(src);\r
+ vector<gpu::GpuMat> dev_dst;\r
+ cv::gpu::split(dev_src, dev_dst);\r
+\r
+ if (dev_dst.size() != dst.size())\r
+ {\r
+ ts->printf(CvTS::CONSOLE, "Bad output sizes");\r
+ ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);\r
+ }\r
+\r
+ for (size_t i = 0; i < num_channels; ++i)\r
+ {\r
+ Mat host_dst = dev_dst[i];\r
+ double err = norm(dst[i], host_dst, NORM_INF);
+
+ if (err > 1e-3)
+ {
+ //ts->printf(CvTS::CONSOLE, "\nNorm: %f\n", err);
+ //ts->printf(CvTS::CONSOLE, "Depth: %d\n", depth);
+ //ts->printf(CvTS::CONSOLE, "Rows: %d\n", rows);
+ //ts->printf(CvTS::CONSOLE, "Cols: %d\n", cols);
+ //ts->printf(CvTS::CONSOLE, "NumChannels: %d\n", num_channels);
+ ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+ return;
+ }\r
+ }\r
+ }\r
+}\r
+\r
+\r
+\r
+void CV_SplitTest::can_split_submatrix(size_t rows, size_t cols)\r
+{\r
+ for (size_t num_channels = 1; num_channels <= 4; ++num_channels)\r
+ for (size_t depth = CV_8U; depth <= CV_64F; ++depth)\r
+ {\r
+ Mat src_data(rows * 2, cols * 2, CV_MAKETYPE(depth, num_channels), Scalar(1.0, 2.0, 3.0, 4.0)); \r
+ Mat src(src_data(Range(rows / 2, rows / 2 + rows), Range(cols / 2, cols / 2 + cols)));\r
+ vector<Mat> dst;\r
+ cv::split(src, dst); \r
+\r
+ gpu::GpuMat dev_src(src);\r
+ vector<gpu::GpuMat> dev_dst;\r
+ cv::gpu::split(dev_src, dev_dst);\r
+\r
+ if (dev_dst.size() != dst.size())\r
+ {\r
+ ts->printf(CvTS::CONSOLE, "Bad output sizes");\r
+ ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);\r
+ }\r
+\r
+ for (size_t i = 0; i < num_channels; ++i)\r
+ {\r
+ Mat host_dst = dev_dst[i];\r
+ double err = norm(dst[i], host_dst, NORM_INF);
+
+ if (err > 1e-3)
+ {
+ //ts->printf(CvTS::CONSOLE, "\nNorm: %f\n", err);
+ //ts->printf(CvTS::CONSOLE, "Depth: %d\n", depth);
+ //ts->printf(CvTS::CONSOLE, "Rows: %d\n", rows);
+ //ts->printf(CvTS::CONSOLE, "Cols: %d\n", cols);
+ //ts->printf(CvTS::CONSOLE, "NumChannels: %d\n", num_channels);
+ ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+ return;
+ }\r
+ }\r
+ }\r
+}\r
+\r
+\r
+void CV_SplitTest::run(int)\r
+{\r
+ try \r
+ {\r
+ can_split(1, 1);\r
+ can_split(1, 7);\r
+ can_split(7, 53);\r
+ can_split_submatrix(1, 1);\r
+ can_split_submatrix(1, 7);\r
+ can_split_submatrix(7, 53);\r
+ }\r
+ catch(const cv::Exception& e)
+ {
+ if (!check_and_treat_gpu_exception(e, ts))
+ throw;
+ } \r
+}\r
+\r
+\r
+struct CV_SplitMergeTest : public CvTest\r
+{\r
+ CV_SplitMergeTest() : CvTest("GPU-SplitMerge", "split merge") {}\r
+ void can_split_merge(size_t rows, size_t cols); \r
+ void run(int);\r
+} split_merge_test;\r
+\r
+\r
+void CV_SplitMergeTest::can_split_merge(size_t rows, size_t cols) {\r
+ for (size_t num_channels = 1; num_channels <= 4; ++num_channels)\r
+ for (size_t depth = CV_8U; depth <= CV_64F; ++depth)\r
+ {\r
+ Mat orig(rows, cols, CV_MAKETYPE(depth, num_channels), Scalar(1.0, 2.0, 3.0, 4.0)); \r
+ gpu::GpuMat dev_orig(orig);\r
+ vector<gpu::GpuMat> dev_vec;\r
+ cv::gpu::split(dev_orig, dev_vec);\r
+\r
+ gpu::GpuMat dev_final(rows, cols, CV_MAKETYPE(depth, num_channels));\r
+ cv::gpu::merge(dev_vec, dev_final);\r
+\r
+ double err = cv::norm((Mat)dev_orig, (Mat)dev_final, NORM_INF);
+ if (err > 1e-3)
+ {
+ //ts->printf(CvTS::CONSOLE, "\nNorm: %f\n", err);
+ //ts->printf(CvTS::CONSOLE, "Depth: %d\n", depth);
+ //ts->printf(CvTS::CONSOLE, "Rows: %d\n", rows);
+ //ts->printf(CvTS::CONSOLE, "Cols: %d\n", cols);
+ //ts->printf(CvTS::CONSOLE, "NumChannels: %d\n", num_channels);
+ ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+ return;
+ }\r
+ }\r
+}\r
+\r
+\r
+void CV_SplitMergeTest::run(int) \r
+{\r
+ try \r
+ {\r
+ can_split_merge(1, 1);\r
+ can_split_merge(1, 7);\r
+ can_split_merge(7, 53);\r
+ }\r
+ catch(const cv::Exception& e)
+ {
+ if (!check_and_treat_gpu_exception(e, ts))
+ throw;
+ } \r
+}\r