added gpu::cvtColor for RGB <-> HSV and RGB <-> HLS
author Vladislav Vinogradov <no@email>
Wed, 13 Oct 2010 09:42:29 +0000 (09:42 +0000)
committer Vladislav Vinogradov <no@email>
Wed, 13 Oct 2010 09:42:29 +0000 (09:42 +0000)
modules/gpu/src/cuda/color.cu
modules/gpu/src/imgproc_gpu.cpp
tests/gpu/src/imgproc_gpu.cpp

index 0384b1d..699e285 100644 (file)
@@ -49,13 +49,17 @@ using namespace cv::gpu;
 #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))\r
 #endif\r
 \r
+#ifndef FLT_EPSILON\r
+#define FLT_EPSILON     1.192092896e-07F\r
+#endif\r
+\r
 namespace imgproc\r
 {\r
     template<typename T, int N> struct TypeVec {};\r
     template<> struct TypeVec<uchar, 3> { typedef uchar3 vec_t; };\r
     template<> struct TypeVec<uchar, 4> { typedef uchar4 vec_t; };\r
-    template<> struct TypeVec<unsigned short, 3> { typedef ushort3 vec_t; };\r
-    template<> struct TypeVec<unsigned short, 4> { typedef ushort4 vec_t; };\r
+    template<> struct TypeVec<ushort, 3> { typedef ushort3 vec_t; };\r
+    template<> struct TypeVec<ushort, 4> { typedef ushort4 vec_t; };\r
     template<> struct TypeVec<float, 3> { typedef float3 vec_t; };\r
     template<> struct TypeVec<float, 4> { typedef float4 vec_t; };\r
 \r
@@ -63,14 +67,14 @@ namespace imgproc
     template<> struct ColorChannel<uchar>\r
     {\r
         typedef float worktype_f;\r
-        static __device__ unsigned char max() { return UCHAR_MAX; }\r
-        static __device__ unsigned char half() { return (unsigned char)(max()/2 + 1); }\r
+        static __device__ uchar max() { return UCHAR_MAX; }\r
+        static __device__ uchar half() { return (uchar)(max()/2 + 1); }\r
     };\r
-    template<> struct ColorChannel<unsigned short>\r
+    template<> struct ColorChannel<ushort>\r
     {\r
         typedef float worktype_f;\r
-        static __device__ unsigned short max() { return USHRT_MAX; }\r
-        static __device__ unsigned short half() { return (unsigned short)(max()/2 + 1); }\r
+        static __device__ ushort max() { return USHRT_MAX; }\r
+        static __device__ ushort half() { return (ushort)(max()/2 + 1); }\r
     };\r
     template<> struct ColorChannel<float>\r
     {\r
@@ -118,9 +122,9 @@ namespace imgproc
             src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));\r
             dst_t dst;\r
 \r
-            dst.x = ((const T*)(&src))[bidx];\r
+            dst.x = (&src.x)[bidx];\r
             dst.y = src.y;\r
-            dst.z = ((const T*)(&src))[bidx ^ 2];\r
+            dst.z = (&src.x)[bidx ^ 2];\r
             setAlpha(dst, getAlpha<T>(src));\r
             \r
             *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;\r
@@ -163,8 +167,8 @@ namespace cv { namespace gpu { namespace improc
         typedef void (*RGB2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);\r
         static const RGB2RGB_caller_t RGB2RGB_callers[2][2] = \r
         {\r
-            {RGB2RGB_caller<unsigned short, 3, 3>, RGB2RGB_caller<unsigned short, 3, 4>}, \r
-            {RGB2RGB_caller<unsigned short, 4, 3>, RGB2RGB_caller<unsigned short, 4, 4>}\r
+            {RGB2RGB_caller<ushort, 3, 3>, RGB2RGB_caller<ushort, 3, 4>}, \r
+            {RGB2RGB_caller<ushort, 4, 3>, RGB2RGB_caller<ushort, 4, 4>}\r
         };\r
 \r
         RGB2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);\r
@@ -192,13 +196,13 @@ namespace imgproc
     {\r
         typedef typename TypeVec<uchar, DSTCN>::vec_t dst_t;\r
 \r
-        static __device__ dst_t cvt(unsigned int src, int bidx)\r
+        static __device__ dst_t cvt(uint src, int bidx)\r
         {\r
             dst_t dst;\r
             \r
-            ((uchar*)(&dst))[bidx] = (uchar)(src << 3);\r
+            (&dst.x)[bidx] = (uchar)(src << 3);\r
             dst.y = (uchar)((src >> 2) & ~7);\r
-            ((uchar*)(&dst))[bidx ^ 2] = (uchar)((src >> 7) & ~7);\r
+            (&dst.x)[bidx ^ 2] = (uchar)((src >> 7) & ~7);\r
             setAlpha(dst, (uchar)(src & 0x8000 ? 255 : 0));\r
 \r
             return dst;\r
@@ -208,13 +212,13 @@ namespace imgproc
     {\r
         typedef typename TypeVec<uchar, DSTCN>::vec_t dst_t;\r
 \r
-        static __device__ dst_t cvt(unsigned int src, int bidx)\r
+        static __device__ dst_t cvt(uint src, int bidx)\r
         {\r
             dst_t dst;\r
             \r
-            ((uchar*)(&dst))[bidx] = (uchar)(src << 3);\r
+            (&dst.x)[bidx] = (uchar)(src << 3);\r
             dst.y = (uchar)((src >> 3) & ~3);\r
-            ((uchar*)(&dst))[bidx ^ 2] = (uchar)((src >> 8) & ~7);\r
+            (&dst.x)[bidx ^ 2] = (uchar)((src >> 8) & ~7);\r
             setAlpha(dst, (uchar)(255));\r
 \r
             return dst;\r
@@ -231,7 +235,7 @@ namespace imgproc
 \r
         if (y < rows && x < cols)\r
         {\r
-            unsigned int src = *(const unsigned short*)(src_ + y * src_step + (x << 1));\r
+            uint src = *(const ushort*)(src_ + y * src_step + (x << 1));\r
             \r
             *(dst_t*)(dst_ + y * dst_step + x * DSTCN) = RGB5x52RGBConverter<GREEN_BITS, DSTCN>::cvt(src, bidx);\r
         }\r
@@ -240,23 +244,23 @@ namespace imgproc
     template <int SRCCN, int GREEN_BITS> struct RGB2RGB5x5Converter {};\r
     template<int SRCCN> struct RGB2RGB5x5Converter<SRCCN, 6> \r
     {\r
-        static __device__ unsigned short cvt(const uchar* src_ptr, int bidx)\r
+        static __device__ ushort cvt(const uchar* src, int bidx)\r
         {\r
-            return (unsigned short)((src_ptr[bidx] >> 3) | ((src_ptr[1] & ~3) << 3) | ((src_ptr[bidx^2] & ~7) << 8));\r
+            return (ushort)((src[bidx] >> 3) | ((src[1] & ~3) << 3) | ((src[bidx^2] & ~7) << 8));\r
         }\r
     };\r
     template<> struct RGB2RGB5x5Converter<3, 5> \r
     {\r
-        static __device__ unsigned short cvt(const uchar* src_ptr, int bidx)\r
+        static __device__ ushort cvt(const uchar* src, int bidx)\r
         {\r
-            return (unsigned short)((src_ptr[bidx] >> 3) | ((src_ptr[1] & ~7) << 2) | ((src_ptr[bidx^2] & ~7) << 7));\r
+            return (ushort)((src[bidx] >> 3) | ((src[1] & ~7) << 2) | ((src[bidx^2] & ~7) << 7));\r
         }\r
     };\r
     template<> struct RGB2RGB5x5Converter<4, 5> \r
     {\r
-        static __device__ unsigned short cvt(const uchar* src_ptr, int bidx)\r
+        static __device__ ushort cvt(const uchar* src, int bidx)\r
         {\r
-            return (unsigned short)((src_ptr[bidx] >> 3) | ((src_ptr[1] & ~7) << 2) | ((src_ptr[bidx^2] & ~7) << 7) | (src_ptr[3] ? 0x8000 : 0));\r
+            return (ushort)((src[bidx] >> 3) | ((src[1] & ~7) << 2) | ((src[bidx^2] & ~7) << 7) | (src[3] ? 0x8000 : 0));\r
         }\r
     };    \r
 \r
@@ -272,7 +276,7 @@ namespace imgproc
         {\r
             src_t src = *(src_t*)(src_ + y * src_step + x * SRCCN);\r
 \r
-            *(unsigned short*)(dst_ + y * dst_step + (x << 1)) = RGB2RGB5x5Converter<SRCCN, GREEN_BITS>::cvt((const uchar*)(&src), bidx);\r
+            *(ushort*)(dst_ + y * dst_step + (x << 1)) = RGB2RGB5x5Converter<SRCCN, GREEN_BITS>::cvt(&src.x, bidx);\r
         }\r
     }\r
 }\r
@@ -363,17 +367,17 @@ namespace imgproc
     template <int GREEN_BITS> struct Gray2RGB5x5Converter {};\r
     template<> struct Gray2RGB5x5Converter<6> \r
     {\r
-        static __device__ unsigned short cvt(unsigned int t)\r
+        static __device__ ushort cvt(uint t)\r
         {\r
-            return (unsigned short)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));\r
+            return (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));\r
         }\r
     };\r
     template<> struct Gray2RGB5x5Converter<5> \r
     {\r
-        static __device__ unsigned short cvt(unsigned int t)\r
+        static __device__ ushort cvt(uint t)\r
         {\r
             t >>= 3;\r
-            return (unsigned short)(t | (t << 5) | (t << 10));\r
+            return (ushort)(t | (t << 5) | (t << 10));\r
         }\r
     };\r
 \r
@@ -385,9 +389,9 @@ namespace imgproc
 \r
         if (y < rows && x < cols)\r
         {\r
-            unsigned int src = src_[y * src_step + x];\r
+            uint src = src_[y * src_step + x];\r
 \r
-            *(unsigned short*)(dst_ + y * dst_step + (x << 1)) = Gray2RGB5x5Converter<GREEN_BITS>::cvt(src);\r
+            *(ushort*)(dst_ + y * dst_step + (x << 1)) = Gray2RGB5x5Converter<GREEN_BITS>::cvt(src);\r
         }\r
     }\r
 }\r
@@ -421,7 +425,7 @@ namespace cv { namespace gpu { namespace improc
     void Gray2RGB_gpu_16u(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream)\r
     {\r
         typedef void (*Gray2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);\r
-        static const Gray2RGB_caller_t Gray2RGB_callers[] = {Gray2RGB_caller<unsigned short, 3>, Gray2RGB_caller<unsigned short, 4>};\r
+        static const Gray2RGB_caller_t Gray2RGB_callers[] = {Gray2RGB_caller<ushort, 3>, Gray2RGB_caller<ushort, 4>};\r
 \r
         Gray2RGB_callers[dstcn - 3](src, dst, stream);\r
     }\r
@@ -483,16 +487,16 @@ namespace imgproc
     template <int GREEN_BITS> struct RGB5x52GrayConverter {};\r
     template<> struct RGB5x52GrayConverter<6> \r
     {\r
-        static __device__ unsigned char cvt(unsigned int t)\r
+        static __device__ uchar cvt(uint t)\r
         {\r
-            return (unsigned char)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 3) & 0xfc) * G2Y + ((t >> 8) & 0xf8) * R2Y, yuv_shift);\r
+            return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 3) & 0xfc) * G2Y + ((t >> 8) & 0xf8) * R2Y, yuv_shift);\r
         }\r
     };\r
     template<> struct RGB5x52GrayConverter<5> \r
     {\r
-        static __device__ unsigned char cvt(unsigned int t)\r
+        static __device__ uchar cvt(uint t)\r
         {\r
-            return (unsigned char)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 2) & 0xf8) * G2Y + ((t >> 7) & 0xf8) * R2Y, yuv_shift);\r
+            return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 2) & 0xf8) * G2Y + ((t >> 7) & 0xf8) * R2Y, yuv_shift);\r
         }\r
     };   \r
 \r
@@ -504,7 +508,7 @@ namespace imgproc
 \r
         if (y < rows && x < cols)\r
         {\r
-            unsigned int src = *(unsigned short*)(src_ + y * src_step + (x << 1));\r
+            uint src = *(ushort*)(src_ + y * src_step + (x << 1));\r
 \r
             dst_[y * dst_step + x] = RGB5x52GrayConverter<GREEN_BITS>::cvt(src);\r
         }\r
@@ -541,7 +545,7 @@ namespace imgproc
         {\r
             src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));\r
 \r
-            *(T*)(dst_ + y * dst_step + x * sizeof(T)) = RGB2GrayConvertor<T>::cvt((const T*)(&src), bidx);\r
+            *(T*)(dst_ + y * dst_step + x * sizeof(T)) = RGB2GrayConvertor<T>::cvt(&src.x, bidx);\r
         }\r
     }   \r
 }\r
@@ -567,7 +571,7 @@ namespace cv { namespace gpu { namespace improc
     void RGB2Gray_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream)\r
     {\r
         typedef void (*RGB2Gray_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);\r
-        RGB2Gray_caller_t RGB2Gray_callers[] = {RGB2Gray_caller<unsigned char, 3>, RGB2Gray_caller<unsigned char, 4>};\r
+        RGB2Gray_caller_t RGB2Gray_callers[] = {RGB2Gray_caller<uchar, 3>, RGB2Gray_caller<uchar, 4>};\r
 \r
         RGB2Gray_callers[srccn - 3](src, dst, bidx, stream);\r
     }\r
@@ -575,7 +579,7 @@ namespace cv { namespace gpu { namespace improc
     void RGB2Gray_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream)\r
     {\r
         typedef void (*RGB2Gray_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);\r
-        RGB2Gray_caller_t RGB2Gray_callers[] = {RGB2Gray_caller<unsigned short, 3>, RGB2Gray_caller<unsigned short, 4>};\r
+        RGB2Gray_caller_t RGB2Gray_callers[] = {RGB2Gray_caller<ushort, 3>, RGB2Gray_caller<ushort, 4>};\r
 \r
         RGB2Gray_callers[srccn - 3](src, dst, bidx, stream);\r
     }\r
@@ -664,7 +668,7 @@ namespace imgproc
             src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));\r
             dst_t dst;\r
 \r
-            RGB2YCrCbConverter<T>::cvt(((const T*)(&src)), dst, bidx);\r
+            RGB2YCrCbConverter<T>::cvt(&src.x, dst, bidx);\r
             \r
             *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;\r
         }\r
@@ -709,7 +713,7 @@ namespace imgproc
             src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));\r
             dst_t dst;\r
 \r
-            YCrCb2RGBConvertor<T>::cvt(src, ((T*)(&dst)), bidx);\r
+            YCrCb2RGBConvertor<T>::cvt(src, &dst.x, bidx);\r
             setAlpha(dst, ColorChannel<T>::max());\r
             \r
             *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;\r
@@ -735,7 +739,7 @@ namespace cv { namespace gpu { namespace improc
             cudaSafeCall( cudaThreadSynchronize() );\r
     }\r
 \r
-    void RGB2YCrCb_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream)\r
+    void RGB2YCrCb_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)\r
     {\r
         typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);\r
         static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] = \r
@@ -749,13 +753,13 @@ namespace cv { namespace gpu { namespace improc
         RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);\r
     }\r
 \r
-    void RGB2YCrCb_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream)\r
+    void RGB2YCrCb_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)\r
     {\r
         typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);\r
         static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] = \r
         {\r
-            {RGB2YCrCb_caller<unsigned short, 3, 3>, RGB2YCrCb_caller<unsigned short, 3, 4>},\r
-            {RGB2YCrCb_caller<unsigned short, 4, 3>, RGB2YCrCb_caller<unsigned short, 4, 4>}\r
+            {RGB2YCrCb_caller<ushort, 3, 3>, RGB2YCrCb_caller<ushort, 3, 4>},\r
+            {RGB2YCrCb_caller<ushort, 4, 3>, RGB2YCrCb_caller<ushort, 4, 4>}\r
         };\r
         \r
         cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );\r
@@ -763,7 +767,7 @@ namespace cv { namespace gpu { namespace improc
         RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);\r
     }\r
 \r
-    void RGB2YCrCb_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const float* coeffs, cudaStream_t stream)\r
+    void RGB2YCrCb_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)\r
     {\r
         typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);\r
         static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] = \r
@@ -793,7 +797,7 @@ namespace cv { namespace gpu { namespace improc
             cudaSafeCall( cudaThreadSynchronize() );\r
     }\r
 \r
-    void YCrCb2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream)\r
+    void YCrCb2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)\r
     {\r
         typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);\r
         static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] = \r
@@ -807,13 +811,13 @@ namespace cv { namespace gpu { namespace improc
         YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);\r
     }\r
 \r
-    void YCrCb2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream)\r
+    void YCrCb2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)\r
     {\r
         typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);\r
         static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] = \r
         {\r
-            {YCrCb2RGB_caller<unsigned short, 3, 3>, YCrCb2RGB_caller<unsigned short, 3, 4>},\r
-            {YCrCb2RGB_caller<unsigned short, 4, 3>, YCrCb2RGB_caller<unsigned short, 4, 4>}\r
+            {YCrCb2RGB_caller<ushort, 3, 3>, YCrCb2RGB_caller<ushort, 3, 4>},\r
+            {YCrCb2RGB_caller<ushort, 4, 3>, YCrCb2RGB_caller<ushort, 4, 4>}\r
         };\r
         \r
         cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );\r
@@ -821,7 +825,7 @@ namespace cv { namespace gpu { namespace improc
         YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);\r
     }\r
 \r
-    void YCrCb2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const float* coeffs, cudaStream_t stream)\r
+    void YCrCb2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)\r
     {\r
         typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);\r
         static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] = \r
@@ -878,7 +882,7 @@ namespace imgproc
             src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));\r
 \r
             dst_t dst;\r
-            RGB2XYZConvertor<T>::cvt((const T*)(&src), dst);\r
+            RGB2XYZConvertor<T>::cvt(&src.x, dst);\r
             \r
             *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;\r
         }\r
@@ -919,7 +923,7 @@ namespace imgproc
             src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));\r
 \r
             dst_t dst;\r
-            XYZ2RGBConvertor<T>::cvt(src, (T*)(&dst));\r
+            XYZ2RGBConvertor<T>::cvt(src, &dst.x);\r
             setAlpha(dst, ColorChannel<T>::max());\r
             \r
             *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;\r
@@ -945,7 +949,7 @@ namespace cv { namespace gpu { namespace improc
             cudaSafeCall( cudaThreadSynchronize() );\r
     }\r
 \r
-    void RGB2XYZ_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream)\r
+    void RGB2XYZ_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)\r
     {\r
         typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);\r
         static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] = \r
@@ -959,13 +963,13 @@ namespace cv { namespace gpu { namespace improc
         RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);\r
     }\r
 \r
-    void RGB2XYZ_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream)\r
+    void RGB2XYZ_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)\r
     {\r
         typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);\r
         static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] = \r
         {\r
-            {RGB2XYZ_caller<unsigned short, 3, 3>, RGB2XYZ_caller<unsigned short, 3, 4>},\r
-            {RGB2XYZ_caller<unsigned short, 4, 3>, RGB2XYZ_caller<unsigned short, 4, 4>}\r
+            {RGB2XYZ_caller<ushort, 3, 3>, RGB2XYZ_caller<ushort, 3, 4>},\r
+            {RGB2XYZ_caller<ushort, 4, 3>, RGB2XYZ_caller<ushort, 4, 4>}\r
         };\r
         \r
         cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) );\r
@@ -973,7 +977,7 @@ namespace cv { namespace gpu { namespace improc
         RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);\r
     }\r
 \r
-    void RGB2XYZ_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const float* coeffs, cudaStream_t stream)\r
+    void RGB2XYZ_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)\r
     {\r
         typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);\r
         static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] = \r
@@ -1003,7 +1007,7 @@ namespace cv { namespace gpu { namespace improc
             cudaSafeCall( cudaThreadSynchronize() );\r
     }\r
 \r
-    void XYZ2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream)\r
+    void XYZ2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)\r
     {\r
         typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);\r
         static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] = \r
@@ -1017,13 +1021,13 @@ namespace cv { namespace gpu { namespace improc
         XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);\r
     }\r
 \r
-    void XYZ2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream)\r
+    void XYZ2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)\r
     {\r
         typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);\r
         static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] = \r
         {\r
-            {XYZ2RGB_caller<unsigned short, 3, 3>, XYZ2RGB_caller<unsigned short, 3, 4>},\r
-            {XYZ2RGB_caller<unsigned short, 4, 3>, XYZ2RGB_caller<unsigned short, 4, 4>}\r
+            {XYZ2RGB_caller<ushort, 3, 3>, XYZ2RGB_caller<ushort, 3, 4>},\r
+            {XYZ2RGB_caller<ushort, 4, 3>, XYZ2RGB_caller<ushort, 4, 4>}\r
         };\r
         \r
         cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) );\r
@@ -1031,7 +1035,7 @@ namespace cv { namespace gpu { namespace improc
         XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);\r
     }\r
 \r
-    void XYZ2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const float* coeffs, cudaStream_t stream)\r
+    void XYZ2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)\r
     {\r
         typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);\r
         static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] = \r
@@ -1048,1587 +1052,593 @@ namespace cv { namespace gpu { namespace improc
 \r
 ////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////\r
 \r
-//struct RGB2HSV_b\r
-//{\r
-//    typedef uchar channel_type;\r
-//\r
-//    RGB2HSV_b(int _srccn, int _blueIdx, int _hrange)\r
-//    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}\r
-//\r
-//    void operator()(const uchar* src, uchar* dst, int n) const\r
-//    {\r
-//        int i, bidx = blueIdx, scn = srccn;\r
-//        const int hsv_shift = 12;\r
-//\r
-//        static const int div_table[] = {\r
-//            0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211,\r
-//            130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632,\r
-//            65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412,\r
-//            43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693,\r
-//            32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782,\r
-//            26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223,\r
-//            21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991,\r
-//            18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579,\r
-//            16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711,\r
-//            14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221,\r
-//            13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006,\r
-//            11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995,\r
-//            10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141,\r
-//            10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410,\r
-//            9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777,\r
-//            8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224,\r
-//            8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737,\r
-//            7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304,\r
-//            7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917,\r
-//            6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569,\r
-//            6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254,\r
-//            6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968,\r
-//            5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708,\r
-//            5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468,\r
-//            5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249,\r
-//            5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046,\r
-//            5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858,\r
-//            4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684,\r
-//            4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522,\r
-//            4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370,\r
-//            4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229,\r
-//            4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096\r
-//        };\r
-//        int hr = hrange, hscale = hr == 180 ? 15 : 21;\r
-//        n *= 3;\r
-//\r
-//        for( i = 0; i < n; i += 3, src += scn )\r
-//        {\r
-//            int b = src[bidx], g = src[1], r = src[bidx^2];\r
-//            int h, s, v = b;\r
-//            int vmin = b, diff;\r
-//            int vr, vg;\r
-//\r
-//            CV_CALC_MAX_8U( v, g );\r
-//            CV_CALC_MAX_8U( v, r );\r
-//            CV_CALC_MIN_8U( vmin, g );\r
-//            CV_CALC_MIN_8U( vmin, r );\r
-//\r
-//            diff = v - vmin;\r
-//            vr = v == r ? -1 : 0;\r
-//            vg = v == g ? -1 : 0;\r
-//\r
-//            s = diff * div_table[v] >> hsv_shift;\r
-//            h = (vr & (g - b)) +\r
-//                (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));\r
-//            h = (h * div_table[diff] * hscale + (1 << (hsv_shift + 6))) >> (7 + hsv_shift);\r
-//            h += h < 0 ? hr : 0;\r
-//\r
-//            dst[i] = (uchar)h;\r
-//            dst[i+1] = (uchar)s;\r
-//            dst[i+2] = (uchar)v;\r
-//        }\r
-//    }\r
-//\r
-//    int srccn, blueIdx, hrange;\r
-//};\r
-//\r
-//\r
-//struct RGB2HSV_f\r
-//{\r
-//    typedef float channel_type;\r
-//\r
-//    RGB2HSV_f(int _srccn, int _blueIdx, float _hrange)\r
-//    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}\r
-//\r
-//    void operator()(const float* src, float* dst, int n) const\r
-//    {\r
-//        int i, bidx = blueIdx, scn = srccn;\r
-//        float hscale = hrange*(1.f/360.f);\r
-//        n *= 3;\r
-//\r
-//        for( i = 0; i < n; i += 3, src += scn )\r
-//        {\r
-//            float b = src[bidx], g = src[1], r = src[bidx^2];\r
-//            float h, s, v;\r
-//\r
-//            float vmin, diff;\r
-//\r
-//            v = vmin = r;\r
-//            if( v < g ) v = g;\r
-//            if( v < b ) v = b;\r
-//            if( vmin > g ) vmin = g;\r
-//            if( vmin > b ) vmin = b;\r
-//\r
-//            diff = v - vmin;\r
-//            s = diff/(float)(fabs(v) + FLT_EPSILON);\r
-//            diff = (float)(60./(diff + FLT_EPSILON));\r
-//            if( v == r )\r
-//                h = (g - b)*diff;\r
-//            else if( v == g )\r
-//                h = (b - r)*diff + 120.f;\r
-//            else\r
-//                h = (r - g)*diff + 240.f;\r
-//\r
-//            if( h < 0 ) h += 360.f;\r
-//\r
-//            dst[i] = h*hscale;\r
-//            dst[i+1] = s;\r
-//            dst[i+2] = v;\r
-//        }\r
-//    }\r
-//\r
-//    int srccn, blueIdx;\r
-//    float hrange;\r
-//};\r
-//\r
-//\r
-//struct HSV2RGB_f\r
-//{\r
-//    typedef float channel_type;\r
-//\r
-//    HSV2RGB_f(int _dstcn, int _blueIdx, float _hrange)\r
-//    : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {}\r
-//\r
-//    void operator()(const float* src, float* dst, int n) const\r
-//    {\r
-//        int i, bidx = blueIdx, dcn = dstcn;\r
-//        float _hscale = hscale;\r
-//        float alpha = ColorChannel<float>::max();\r
-//        n *= 3;\r
-//\r
-//        for( i = 0; i < n; i += 3, dst += dcn )\r
-//        {\r
-//            float h = src[i], s = src[i+1], v = src[i+2];\r
-//            float b, g, r;\r
-//\r
-//            if( s == 0 )\r
-//                b = g = r = v;\r
-//            else\r
-//            {\r
-//                static const int sector_data[][3]=\r
-//                    {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};\r
-//                float tab[4];\r
-//                int sector;\r
-//                h *= _hscale;\r
-//                if( h < 0 )\r
-//                    do h += 6; while( h < 0 );\r
-//                else if( h >= 6 )\r
-//                    do h -= 6; while( h >= 6 );\r
-//                sector = cvFloor(h);\r
-//                h -= sector;\r
-//\r
-//                tab[0] = v;\r
-//                tab[1] = v*(1.f - s);\r
-//                tab[2] = v*(1.f - s*h);\r
-//                tab[3] = v*(1.f - s*(1.f - h));\r
-//\r
-//                b = tab[sector_data[sector][0]];\r
-//                g = tab[sector_data[sector][1]];\r
-//                r = tab[sector_data[sector][2]];\r
-//            }\r
-//\r
-//            dst[bidx] = b;\r
-//            dst[1] = g;\r
-//            dst[bidx^2] = r;\r
-//            if( dcn == 4 )\r
-//                dst[3] = alpha;\r
-//        }\r
-//    }\r
-//\r
-//    int dstcn, blueIdx;\r
-//    float hscale;\r
-//};\r
-//\r
-//\r
-//struct HSV2RGB_b\r
-//{\r
-//    typedef uchar channel_type;\r
-//\r
-//    HSV2RGB_b(int _dstcn, int _blueIdx, int _hrange)\r
-//    : dstcn(_dstcn), cvt(3, _blueIdx, _hrange)\r
-//    {}\r
-//\r
-//    void operator()(const uchar* src, uchar* dst, int n) const\r
-//    {\r
-//        int i, j, dcn = dstcn;\r
-//        uchar alpha = ColorChannel<uchar>::max();\r
-//        float buf[3*BLOCK_SIZE];\r
-//\r
-//        for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )\r
-//        {\r
-//            int dn = std::min(n - i, (int)BLOCK_SIZE);\r
-//\r
-//            for( j = 0; j < dn*3; j += 3 )\r
-//            {\r
-//                buf[j] = src[j];\r
-//                buf[j+1] = src[j+1]*(1.f/255.f);\r
-//                buf[j+2] = src[j+2]*(1.f/255.f);\r
-//            }\r
-//            cvt(buf, buf, dn);\r
-//\r
-//            for( j = 0; j < dn*3; j += 3, dst += dcn )\r
-//            {\r
-//                dst[0] = saturate_cast<uchar>(buf[j]*255.f);\r
-//                dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);\r
-//                dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);\r
-//                if( dcn == 4 )\r
-//                    dst[3] = alpha;\r
-//            }\r
-//        }\r
-//    }\r
-//\r
-//    int dstcn;\r
-//    HSV2RGB_f cvt;\r
-//};\r
-//\r
-//\r
+namespace imgproc\r
+{\r
+    __constant__ int cHsvDivTable[256];\r
+\r
+    template<typename T, int HR> struct RGB2HSVConvertor;\r
+    // RGB -> HSV conversion of one 8-bit pixel using integer fixed-point arithmetic only.\r
+    // HR is the hue range: the hue scale factor below is 15 when HR == 180 and 21 for\r
+    // every other value, and HR is added to negative hues to wrap them.\r
+    // Reads blue/green/red from src[bidx], src[1], src[bidx^2] and stores H, S, V in\r
+    // dst.x, dst.y, dst.z. Requires the __constant__ table cHsvDivTable to have been\r
+    // filled by the host before the kernel runs.\r
+    template<int HR> struct RGB2HSVConvertor<uchar, HR>\r
+    {\r
+        template <typename D>\r
+        static __device__ void cvt(const uchar* src, D& dst, int bidx)\r
+        {\r
+            // Number of fractional bits carried by the cHsvDivTable entries; removed again\r
+            // by the right shifts below.\r
+            const int hsv_shift = 12;\r
+            const int hscale = HR == 180 ? 15 : 21;\r
+\r
+            int b = src[bidx], g = src[1], r = src[bidx^2];\r
+            int h, s, v = b;\r
+            int vmin = b, diff;\r
+            int vr, vg;\r
+\r
+            // v = max(b, g, r), vmin = min(b, g, r).\r
+            v = max(v, g);\r
+            v = max(v, r);\r
+            vmin = min(vmin, g);\r
+            vmin = min(vmin, r);\r
+\r
+            diff = v - vmin;\r
+            // All-ones (-1) masks when the maximum is the red / green channel, 0 otherwise;\r
+            // they select the hue formula below without branching.\r
+            vr = v == r ? -1 : 0;\r
+            vg = v == g ? -1 : 0;\r
+\r
+            // Saturation: diff multiplied by the table entry for v, then shifted down\r
+            // (truncating, no rounding term).\r
+            s = diff * cHsvDivTable[v] >> hsv_shift;\r
+            // Hue numerator: (g - b) if max is red, else (b - r + 2*diff) if max is green,\r
+            // else (r - g + 4*diff).\r
+            h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));\r
+            // Scale by the table entry for diff and by hscale; the added constant is half of\r
+            // the divisor implied by the shift, i.e. round to nearest.\r
+            h = (h * cHsvDivTable[diff] * hscale + (1 << (hsv_shift + 6))) >> (7 + hsv_shift);\r
+            // Wrap negative hues into the positive range.\r
+            h += h < 0 ? HR : 0;\r
+\r
+            dst.x = (uchar)h;\r
+            dst.y = (uchar)s;\r
+            dst.z = (uchar)v;\r
+        }\r
+    };\r
+    // RGB -> HSV conversion of one float pixel.\r
+    // Reads blue/green/red from src[bidx], src[1], src[bidx^2] and stores\r
+    //   dst.x = hue in degrees [0, 360) multiplied by HR / 360,\r
+    //   dst.y = saturation = (max - min) / (|max| + FLT_EPSILON),\r
+    //   dst.z = value = max(b, g, r).\r
+    // FLT_EPSILON in the denominators avoids division by zero for black / grey pixels.\r
+    // All arithmetic is kept in single precision: a double literal here would promote\r
+    // the division to double precision inside a float kernel.\r
+    template<int HR> struct RGB2HSVConvertor<float, HR>\r
+    {\r
+        template <typename D>\r
+        static __device__ void cvt(const float* src, D& dst, int bidx)\r
+        {\r
+            const float hscale = HR * (1.f / 360.f);\r
+\r
+            float b = src[bidx], g = src[1], r = src[bidx^2];\r
+            float h, s, v;\r
+\r
+            float vmin, diff;\r
+\r
+            // v = max(r, g, b), vmin = min(r, g, b).\r
+            v = vmin = r;\r
+            v = fmaxf(v, g);\r
+            v = fmaxf(v, b);\r
+            vmin = fminf(vmin, g);\r
+            vmin = fminf(vmin, b);\r
+\r
+            diff = v - vmin;\r
+            s = diff / (fabsf(v) + FLT_EPSILON);\r
+            // From here on diff holds 60 / (max - min), the per-sector hue scale in degrees.\r
+            diff = 60.f / (diff + FLT_EPSILON);\r
+\r
+            // The channel holding the maximum selects the 120-degree sector offset.\r
+            if (v == r)\r
+                h = (g - b) * diff;\r
+            else if (v == g)\r
+                h = (b - r) * diff + 120.f;\r
+            else\r
+                h = (r - g) * diff + 240.f;\r
+\r
+            // Wrap negative hues into [0, 360).\r
+            if (h < 0.f) h += 360.f;\r
+\r
+            dst.x = h * hscale;\r
+            dst.y = s;\r
+            dst.z = v;\r
+        }\r
+    };\r
+\r
+    // Per-pixel RGB -> HSV kernel. Each thread handles the pixel whose column / row is\r
+    // derived from its 2D block and thread indices; threads that land outside\r
+    // rows x cols exit immediately.\r
+    // src_ / dst_ are byte pointers: a row starts at y * step bytes and a pixel at\r
+    // x * channels * sizeof(T) bytes within the row.\r
+    // The convertors only assign dst.x / dst.y / dst.z, so for DSTCN == 4 the fourth\r
+    // component is stored without being assigned in this kernel.\r
+    template <int SRCCN, int DSTCN, int HR, typename T>\r
+    __global__ void RGB2HSV(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)\r
+    {\r
+        typedef typename TypeVec<T, SRCCN>::vec_t src_t;\r
+        typedef typename TypeVec<T, DSTCN>::vec_t dst_t;\r
+\r
+        const int col = blockDim.x * blockIdx.x + threadIdx.x;\r
+        const int row = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+        // Threads of partially filled edge blocks fall outside the image.\r
+        if (row >= rows || col >= cols)\r
+            return;\r
+\r
+        const uchar* in_ptr = src_ + row * src_step + col * SRCCN * sizeof(T);\r
+        uchar* out_ptr = dst_ + row * dst_step + col * DSTCN * sizeof(T);\r
+\r
+        src_t pixel = *(const src_t*)in_ptr;\r
+\r
+        dst_t hsv;\r
+        RGB2HSVConvertor<T, HR>::cvt(&pixel.x, hsv, bidx);\r
+\r
+        *(dst_t*)out_ptr = hsv;\r
+    }\r
+\r
+    __constant__ int cHsvSectorData[6][3];\r
+\r
+    template<typename T, int HR> struct HSV2RGBConvertor;    \r
+    // HSV -> RGB conversion of one float pixel.\r
+    // src.x is the hue expressed in a range of HR units per full turn, src.y the\r
+    // saturation and src.z the value. The result is written to dst[bidx] (blue),\r
+    // dst[1] (green) and dst[bidx^2] (red).\r
+    // Requires the __constant__ table cHsvSectorData to have been filled by the host.\r
+    template<int HR> struct HSV2RGBConvertor<float, HR>\r
+    {\r
+        template <typename T>\r
+        static __device__ void cvt(const T& src, float* dst, int bidx)\r
+        {\r
+            // Maps the hue onto [0, 6): one unit per 60-degree sector.\r
+            const float hscale = 6.f / HR;\r
+\r
+            float h = src.x, s = src.y, v = src.z;\r
+            float b, g, r;\r
+\r
+            if( s == 0 )\r
+                b = g = r = v;   // zero saturation: grey, hue is irrelevant\r
+            else\r
+            {\r
+                float tab[4];\r
+                int sector;\r
+                h *= hscale;\r
+                // Bring the hue into [0, 6) by whole turns.\r
+                if( h < 0 )\r
+                    do h += 6; while( h < 0 );\r
+                else if( h >= 6 )\r
+                    do h -= 6; while( h >= 6 );\r
+                sector = __float2int_rd(h);\r
+                h -= sector;\r
+\r
+                // A tiny negative hue becomes exactly 6.0f after "h += 6" because of float\r
+                // rounding, which would give sector == 6 and index past the 6-row\r
+                // cHsvSectorData table. Treat any out-of-range sector as hue 0.\r
+                if( (unsigned int)sector >= 6u )\r
+                {\r
+                    sector = 0;\r
+                    h = 0.f;\r
+                }\r
+\r
+                // Candidate channel values; cHsvSectorData picks which one goes to b, g, r\r
+                // for the current sector.\r
+                tab[0] = v;\r
+                tab[1] = v*(1.f - s);\r
+                tab[2] = v*(1.f - s*h);\r
+                tab[3] = v*(1.f - s*(1.f - h));\r
+\r
+                b = tab[cHsvSectorData[sector][0]];\r
+                g = tab[cHsvSectorData[sector][1]];\r
+                r = tab[cHsvSectorData[sector][2]];\r
+            }\r
+\r
+            dst[bidx] = b;\r
+            dst[1] = g;\r
+            dst[bidx^2] = r;\r
+        }\r
+    };\r
+    // HSV -> RGB conversion of one 8-bit pixel, implemented on top of the float\r
+    // convertor: the hue is passed through unchanged, saturation and value are scaled\r
+    // by 1/255, and the float result is scaled by 255 and saturated back to uchar.\r
+    template<int HR> struct HSV2RGBConvertor<uchar, HR>\r
+    {\r
+        template <typename T>\r
+        static __device__ void cvt(const T& src, uchar* dst, int bidx)\r
+        {\r
+            const float to_unit = 1.f/255.f;\r
+\r
+            float3 px;\r
+            px.x = src.x;\r
+            px.y = src.y * to_unit;\r
+            px.z = src.z * to_unit;\r
+\r
+            // Converted in place: the float convertor reads its input into locals\r
+            // before writing any output component.\r
+            HSV2RGBConvertor<float, HR>::cvt(px, &px.x, bidx);\r
+\r
+            dst[0] = saturate_cast<uchar>(px.x * 255.f);\r
+            dst[1] = saturate_cast<uchar>(px.y * 255.f);\r
+            dst[2] = saturate_cast<uchar>(px.z * 255.f);\r
+        }\r
+    };\r
+\r
+    // Per-pixel HSV -> RGB kernel. Each thread handles the pixel whose column / row is\r
+    // derived from its 2D block and thread indices; threads that land outside\r
+    // rows x cols exit immediately.\r
+    // src_ / dst_ are byte pointers: a row starts at y * step bytes and a pixel at\r
+    // x * channels * sizeof(T) bytes within the row.\r
+    // After conversion setAlpha is called on the output with ColorChannel<T>::max().\r
+    template <int SRCCN, int DSTCN, int HR, typename T>\r
+    __global__ void HSV2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)\r
+    {\r
+        typedef typename TypeVec<T, SRCCN>::vec_t src_t;\r
+        typedef typename TypeVec<T, DSTCN>::vec_t dst_t;\r
+\r
+        const int col = blockDim.x * blockIdx.x + threadIdx.x;\r
+        const int row = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+        // Threads of partially filled edge blocks fall outside the image.\r
+        if (row >= rows || col >= cols)\r
+            return;\r
+\r
+        const uchar* in_ptr = src_ + row * src_step + col * SRCCN * sizeof(T);\r
+        uchar* out_ptr = dst_ + row * dst_step + col * DSTCN * sizeof(T);\r
+\r
+        src_t hsv = *(const src_t*)in_ptr;\r
+\r
+        dst_t pixel;\r
+        HSV2RGBConvertor<T, HR>::cvt(hsv, &pixel.x, bidx);\r
+        setAlpha(pixel, ColorChannel<T>::max());\r
+\r
+        *(dst_t*)out_ptr = pixel;\r
+    }\r
+}\r
+\r
+namespace cv { namespace gpu { namespace improc\r
+{\r
+    // Launches the RGB2HSV kernel for element type T with SRCCN source and DSTCN\r
+    // destination channels.\r
+    //   src, dst - source / destination images (ptr, step, rows, cols are used)\r
+    //   bidx     - index of the blue channel in the source pixel\r
+    //   hrange   - 180 selects the HR = 180 instantiation; every other value runs the\r
+    //              HR = 255 instantiation\r
+    //              NOTE(review): confirm callers never expect another range (e.g. 360)\r
+    //   stream   - stream to launch on; for stream 0 the call blocks until completion\r
+    // Uses 32x8 thread blocks; the grid is sized with divUp over the source size\r
+    // (presumably a ceiling division - the helper is defined elsewhere).\r
+    template <typename T, int SRCCN, int DSTCN>\r
+    void RGB2HSV_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)\r
+    {\r
+        dim3 threads(32, 8, 1);\r
+        dim3 grid(1, 1, 1);\r
+\r
+        grid.x = divUp(src.cols, threads.x);\r
+        grid.y = divUp(src.rows, threads.y);\r
+\r
+        if (hrange == 180)\r
+            imgproc::RGB2HSV<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,\r
+                dst.ptr, dst.step, src.rows, src.cols, bidx);\r
+        else\r
+            imgproc::RGB2HSV<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,\r
+                dst.ptr, dst.step, src.rows, src.cols, bidx);\r
+\r
+        // A kernel launch returns no status itself; pick up launch failures (for example\r
+        // an invalid configuration) here so they are not silently lost on user streams.\r
+        cudaSafeCall( cudaGetLastError() );\r
+\r
+        if (stream == 0)\r
+            cudaSafeCall( cudaThreadSynchronize() );\r
+    }\r
+\r
+    // Host entry point for 8-bit RGB -> HSV conversion.\r
+    // Uploads the fixed-point division table to the __constant__ symbol\r
+    // imgproc::cHsvDivTable and then dispatches to the RGB2HSV_caller instantiation\r
+    // selected by srccn and dstcn (each indexed as value - 3, so only 3 and 4 are valid;\r
+    // no range check is performed here).\r
+    // bidx, hrange and stream are forwarded unchanged to the caller.\r
+    void RGB2HSV_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)\r
+    {\r
+        typedef void (*RGB2HSV_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream);\r
+        // Rows: srccn = 3, 4; columns: dstcn = 3, 4.\r
+        static const RGB2HSV_caller_t RGB2HSV_callers[2][2] = \r
+        {\r
+            {RGB2HSV_caller<uchar, 3, 3>, RGB2HSV_caller<uchar, 3, 4>},\r
+            {RGB2HSV_caller<uchar, 4, 3>, RGB2HSV_caller<uchar, 4, 4>}\r
+        };\r
+\r
+        // 256-entry reciprocal table. Entry i appears to be (255 << 12) / i rounded to\r
+        // the nearest integer (e.g. 1044480 for i = 1, 4096 for i = 255); entry 0 is 0.\r
+        static const int div_table[] = \r
+        {\r
+            0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211,\r
+            130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632,\r
+            65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412,\r
+            43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693,\r
+            32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782,\r
+            26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223,\r
+            21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991,\r
+            18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579,\r
+            16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711,\r
+            14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221,\r
+            13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006,\r
+            11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995,\r
+            10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141,\r
+            10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410,\r
+            9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777,\r
+            8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224,\r
+            8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737,\r
+            7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304,\r
+            7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917,\r
+            6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569,\r
+            6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254,\r
+            6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968,\r
+            5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708,\r
+            5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468,\r
+            5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249,\r
+            5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046,\r
+            5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858,\r
+            4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684,\r
+            4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522,\r
+            4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370,\r
+            4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229,\r
+            4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096\r
+        };\r
+        // The table is copied to constant memory on every call, before the kernel launch.\r
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHsvDivTable, div_table, sizeof(div_table)) );\r
+\r
+        RGB2HSV_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);\r
+    }\r
+\r
+    // Host entry point for float RGB -> HSV conversion.\r
+    // Picks the RGB2HSV_caller<float, ...> instantiation matching srccn / dstcn (each\r
+    // indexed as value - 3, so only 3 and 4 are valid; no range check is performed) and\r
+    // forwards the remaining arguments to it unchanged.\r
+    void RGB2HSV_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)\r
+    {\r
+        typedef void (*caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream);\r
+\r
+        // Rows: srccn = 3, 4; columns: dstcn = 3, 4.\r
+        static const caller_t callers[2][2] =\r
+        {\r
+            {RGB2HSV_caller<float, 3, 3>, RGB2HSV_caller<float, 3, 4>},\r
+            {RGB2HSV_caller<float, 4, 3>, RGB2HSV_caller<float, 4, 4>}\r
+        };\r
+\r
+        const caller_t launch = callers[srccn - 3][dstcn - 3];\r
+        launch(src, dst, bidx, hrange, stream);\r
+    }\r
+\r
+    \r
+    // Launches the HSV2RGB kernel for element type T with SRCCN source and DSTCN\r
+    // destination channels.\r
+    //   src, dst - source / destination images (ptr, step, rows, cols are used)\r
+    //   bidx     - index of the blue channel in the destination pixel\r
+    //   hrange   - 180 selects the HR = 180 instantiation; every other value runs the\r
+    //              HR = 255 instantiation\r
+    //              NOTE(review): confirm callers never expect another range (e.g. 360)\r
+    //   stream   - stream to launch on; for stream 0 the call blocks until completion\r
+    // Uses 32x8 thread blocks; the grid is sized with divUp over the source size\r
+    // (presumably a ceiling division - the helper is defined elsewhere).\r
+    template <typename T, int SRCCN, int DSTCN>\r
+    void HSV2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)\r
+    {\r
+        dim3 threads(32, 8, 1);\r
+        dim3 grid(1, 1, 1);\r
+\r
+        grid.x = divUp(src.cols, threads.x);\r
+        grid.y = divUp(src.rows, threads.y);\r
+\r
+        if (hrange == 180)\r
+            imgproc::HSV2RGB<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,\r
+                dst.ptr, dst.step, src.rows, src.cols, bidx);\r
+        else\r
+            imgproc::HSV2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,\r
+                dst.ptr, dst.step, src.rows, src.cols, bidx);\r
+\r
+        // A kernel launch returns no status itself; pick up launch failures (for example\r
+        // an invalid configuration) here so they are not silently lost on user streams.\r
+        cudaSafeCall( cudaGetLastError() );\r
+\r
+        if (stream == 0)\r
+            cudaSafeCall( cudaThreadSynchronize() );\r
+    }\r
+\r
+    // Host entry point for 8-bit HSV -> RGB conversion.\r
+    // Uploads the sector lookup table to the __constant__ symbol\r
+    // imgproc::cHsvSectorData, then dispatches to the HSV2RGB_caller<uchar, ...>\r
+    // instantiation matching srccn / dstcn (each indexed as value - 3, so only 3 and 4\r
+    // are valid; no range check is performed).\r
+    void HSV2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)\r
+    {\r
+        typedef void (*caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream);\r
+\r
+        // Rows: srccn = 3, 4; columns: dstcn = 3, 4.\r
+        static const caller_t callers[2][2] =\r
+        {\r
+            {HSV2RGB_caller<uchar, 3, 3>, HSV2RGB_caller<uchar, 3, 4>},\r
+            {HSV2RGB_caller<uchar, 4, 3>, HSV2RGB_caller<uchar, 4, 4>}\r
+        };\r
+\r
+        // One row per hue sector; the three entries are the indices into the device-side\r
+        // candidate array used for the blue, green and red outputs respectively.\r
+        static const int sectors[][3] =\r
+        {\r
+            {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}\r
+        };\r
+\r
+        // Copied to constant memory on every call, before the kernel launch.\r
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHsvSectorData, sectors, sizeof(sectors)) );\r
+\r
+        const caller_t launch = callers[srccn - 3][dstcn - 3];\r
+        launch(src, dst, bidx, hrange, stream);\r
+    }\r
+\r
+    // Host entry point for float HSV -> RGB conversion.\r
+    // Uploads the sector lookup table to the __constant__ symbol\r
+    // imgproc::cHsvSectorData, then dispatches to the HSV2RGB_caller<float, ...>\r
+    // instantiation matching srccn / dstcn (each indexed as value - 3, so only 3 and 4\r
+    // are valid; no range check is performed).\r
+    void HSV2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)\r
+    {\r
+        typedef void (*caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream);\r
+\r
+        // Rows: srccn = 3, 4; columns: dstcn = 3, 4.\r
+        static const caller_t callers[2][2] =\r
+        {\r
+            {HSV2RGB_caller<float, 3, 3>, HSV2RGB_caller<float, 3, 4>},\r
+            {HSV2RGB_caller<float, 4, 3>, HSV2RGB_caller<float, 4, 4>}\r
+        };\r
+\r
+        // One row per hue sector; the three entries are the indices into the device-side\r
+        // candidate array used for the blue, green and red outputs respectively.\r
+        static const int sectors[][3] =\r
+        {\r
+            {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}\r
+        };\r
+\r
+        // Copied to constant memory on every call, before the kernel launch.\r
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHsvSectorData, sectors, sizeof(sectors)) );\r
+\r
+        const caller_t launch = callers[srccn - 3][dstcn - 3];\r
+        launch(src, dst, bidx, hrange, stream);\r
+    }\r
+}}}\r
+\r
 /////////////////////////////////////// RGB <-> HLS ////////////////////////////////////////\r
-//\r
-//struct RGB2HLS_f\r
-//{\r
-//    typedef float channel_type;\r
-//\r
-//    RGB2HLS_f(int _srccn, int _blueIdx, float _hrange)\r
-//    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}\r
-//\r
-//    void operator()(const float* src, float* dst, int n) const\r
-//    {\r
-//        int i, bidx = blueIdx, scn = srccn;\r
-//        float hscale = hrange*(1.f/360.f);\r
-//        n *= 3;\r
-//\r
-//        for( i = 0; i < n; i += 3, src += scn )\r
-//        {\r
-//            float b = src[bidx], g = src[1], r = src[bidx^2];\r
-//            float h = 0.f, s = 0.f, l;\r
-//            float vmin, vmax, diff;\r
-//\r
-//            vmax = vmin = r;\r
-//            if( vmax < g ) vmax = g;\r
-//            if( vmax < b ) vmax = b;\r
-//            if( vmin > g ) vmin = g;\r
-//            if( vmin > b ) vmin = b;\r
-//\r
-//            diff = vmax - vmin;\r
-//            l = (vmax + vmin)*0.5f;\r
-//\r
-//            if( diff > FLT_EPSILON )\r
-//            {\r
-//                s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);\r
-//                diff = 60.f/diff;\r
-//\r
-//                if( vmax == r )\r
-//                    h = (g - b)*diff;\r
-//                else if( vmax == g )\r
-//                    h = (b - r)*diff + 120.f;\r
-//                else\r
-//                    h = (r - g)*diff + 240.f;\r
-//\r
-//                if( h < 0.f ) h += 360.f;\r
-//            }\r
-//\r
-//            dst[i] = h*hscale;\r
-//            dst[i+1] = l;\r
-//            dst[i+2] = s;\r
-//        }\r
-//    }\r
-//\r
-//    int srccn, blueIdx;\r
-//    float hrange;\r
-//};\r
-//\r
-//\r
-//struct RGB2HLS_b\r
-//{\r
-//    typedef uchar channel_type;\r
-//\r
-//    RGB2HLS_b(int _srccn, int _blueIdx, int _hrange)\r
-//    : srccn(_srccn), cvt(3, _blueIdx, (float)_hrange) {}\r
-//\r
-//    void operator()(const uchar* src, uchar* dst, int n) const\r
-//    {\r
-//        int i, j, scn = srccn;\r
-//        float buf[3*BLOCK_SIZE];\r
-//\r
-//        for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )\r
-//        {\r
-//            int dn = std::min(n - i, (int)BLOCK_SIZE);\r
-//\r
-//            for( j = 0; j < dn*3; j += 3, src += scn )\r
-//            {\r
-//                buf[j] = src[0]*(1.f/255.f);\r
-//                buf[j+1] = src[1]*(1.f/255.f);\r
-//                buf[j+2] = src[2]*(1.f/255.f);\r
-//            }\r
-//            cvt(buf, buf, dn);\r
-//\r
-//            for( j = 0; j < dn*3; j += 3 )\r
-//            {\r
-//                dst[j] = saturate_cast<uchar>(buf[j]);\r
-//                dst[j+1] = saturate_cast<uchar>(buf[j+1]*255.f);\r
-//                dst[j+2] = saturate_cast<uchar>(buf[j+2]*255.f);\r
-//            }\r
-//        }\r
-//    }\r
-//\r
-//    int srccn;\r
-//    RGB2HLS_f cvt;\r
-//};\r
-//\r
-//\r
-//struct HLS2RGB_f\r
-//{\r
-//    typedef float channel_type;\r
-//\r
-//    HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange)\r
-//    : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {}\r
-//\r
-//    void operator()(const float* src, float* dst, int n) const\r
-//    {\r
-//        int i, bidx = blueIdx, dcn = dstcn;\r
-//        float _hscale = hscale;\r
-//        float alpha = ColorChannel<float>::max();\r
-//        n *= 3;\r
-//\r
-//        for( i = 0; i < n; i += 3, dst += dcn )\r
-//        {\r
-//            float h = src[i], l = src[i+1], s = src[i+2];\r
-//            float b, g, r;\r
-//\r
-//            if( s == 0 )\r
-//                b = g = r = l;\r
-//            else\r
-//            {\r
-//                static const int sector_data[][3]=\r
-//                {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};\r
-//                float tab[4];\r
-//                int sector;\r
-//\r
-//                float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;\r
-//                float p1 = 2*l - p2;\r
-//\r
-//                h *= _hscale;\r
-//                if( h < 0 )\r
-//                    do h += 6; while( h < 0 );\r
-//                else if( h >= 6 )\r
-//                    do h -= 6; while( h >= 6 );\r
-//\r
-//                assert( 0 <= h && h < 6 );\r
-//                sector = cvFloor(h);\r
-//                h -= sector;\r
-//\r
-//                tab[0] = p2;\r
-//                tab[1] = p1;\r
-//                tab[2] = p1 + (p2 - p1)*(1-h);\r
-//                tab[3] = p1 + (p2 - p1)*h;\r
-//\r
-//                b = tab[sector_data[sector][0]];\r
-//                g = tab[sector_data[sector][1]];\r
-//                r = tab[sector_data[sector][2]];\r
-//            }\r
-//\r
-//            dst[bidx] = b;\r
-//            dst[1] = g;\r
-//            dst[bidx^2] = r;\r
-//            if( dcn == 4 )\r
-//                dst[3] = alpha;\r
-//        }\r
-//    }\r
-//\r
-//    int dstcn, blueIdx;\r
-//    float hscale;\r
-//};\r
-//\r
-//\r
-//struct HLS2RGB_b\r
-//{\r
-//    typedef uchar channel_type;\r
-//\r
-//    HLS2RGB_b(int _dstcn, int _blueIdx, int _hrange)\r
-//    : dstcn(_dstcn), cvt(3, _blueIdx, _hrange)\r
-//    {}\r
-//\r
-//    void operator()(const uchar* src, uchar* dst, int n) const\r
-//    {\r
-//        int i, j, dcn = dstcn;\r
-//        uchar alpha = ColorChannel<uchar>::max();\r
-//        float buf[3*BLOCK_SIZE];\r
-//\r
-//        for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )\r
-//        {\r
-//            int dn = std::min(n - i, (int)BLOCK_SIZE);\r
-//\r
-//            for( j = 0; j < dn*3; j += 3 )\r
-//            {\r
-//                buf[j] = src[j];\r
-//                buf[j+1] = src[j+1]*(1.f/255.f);\r
-//                buf[j+2] = src[j+2]*(1.f/255.f);\r
-//            }\r
-//            cvt(buf, buf, dn);\r
-//\r
-//            for( j = 0; j < dn*3; j += 3, dst += dcn )\r
-//            {\r
-//                dst[0] = saturate_cast<uchar>(buf[j]*255.f);\r
-//                dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);\r
-//                dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);\r
-//                if( dcn == 4 )\r
-//                    dst[3] = alpha;\r
-//            }\r
-//        }\r
-//    }\r
-//\r
-//    int dstcn;\r
-//    HLS2RGB_f cvt;\r
-//};\r
-//\r
-//\r
-/////////////////////////////////////// RGB <-> L*a*b* /////////////////////////////////////\r
-//\r
-//static const float D65[] = { 0.950456f, 1.f, 1.088754f };\r
-//\r
-//enum { LAB_CBRT_TAB_SIZE = 1024, GAMMA_TAB_SIZE = 1024 };\r
-//static float LabCbrtTab[LAB_CBRT_TAB_SIZE*4];\r
-//static const float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f;\r
-//\r
-//static float sRGBGammaTab[GAMMA_TAB_SIZE*4], sRGBInvGammaTab[GAMMA_TAB_SIZE*4];\r
-//static const float GammaTabScale = (float)GAMMA_TAB_SIZE;\r
-//\r
-//static unsigned short sRGBGammaTab_b[256], linearGammaTab_b[256];\r
-//#undef lab_shift\r
-//#define lab_shift xyz_shift\r
-//#define gamma_shift 3\r
-//#define lab_shift2 (lab_shift + gamma_shift)\r
-//#define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))\r
-//static unsigned short LabCbrtTab_b[LAB_CBRT_TAB_SIZE_B];\r
-//\r
-//static void initLabTabs()\r
-//{\r
-//    static bool initialized = false;\r
-//    if(!initialized)\r
-//    {\r
-//        float f[LAB_CBRT_TAB_SIZE+1], g[GAMMA_TAB_SIZE], ig[GAMMA_TAB_SIZE], scale = 1.f/LabCbrtTabScale;\r
-//        int i;\r
-//        for(i = 0; i <= LAB_CBRT_TAB_SIZE; i++)\r
-//        {\r
-//            float x = i*scale;\r
-//            f[i] = x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x);\r
-//        }\r
-//        splineBuild(f, LAB_CBRT_TAB_SIZE, LabCbrtTab);\r
-//\r
-//        scale = 1.f/GammaTabScale;\r
-//        for(i = 0; i <= GAMMA_TAB_SIZE; i++)\r
-//        {\r
-//            float x = i*scale;\r
-//            g[i] = x <= 0.04045f ? x*(1.f/12.92f) : (float)pow((double)(x + 0.055)*(1./1.055), 2.4);\r
-//            ig[i] = x <= 0.0031308 ? x*12.92f : (float)(1.055*pow((double)x, 1./2.4) - 0.055);\r
-//        }\r
-//        splineBuild(g, GAMMA_TAB_SIZE, sRGBGammaTab);\r
-//        splineBuild(ig, GAMMA_TAB_SIZE, sRGBInvGammaTab);\r
-//\r
-//        for(i = 0; i < 256; i++)\r
-//        {\r
-//            float x = i*(1.f/255.f);\r
-//            sRGBGammaTab_b[i] = saturate_cast<unsigned short>(255.f*(1 << gamma_shift)*(x <= 0.04045f ? x*(1.f/12.92f) : (float)pow((double)(x + 0.055)*(1./1.055), 2.4)));\r
-//            linearGammaTab_b[i] = (unsigned short)(i*(1 << gamma_shift));\r
-//        }\r
-//\r
-//        for(i = 0; i < LAB_CBRT_TAB_SIZE_B; i++)\r
-//        {\r
-//            float x = i*(1.f/(255.f*(1 << gamma_shift)));\r
-//            LabCbrtTab_b[i] = saturate_cast<unsigned short>((1 << lab_shift2)*(x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x)));\r
-//        }\r
-//        initialized = true;\r
-//    }\r
-//}\r
-//\r
-//\r
-//struct RGB2Lab_b\r
-//{\r
-//    typedef uchar channel_type;\r
-//\r
-//    RGB2Lab_b(int _srccn, int blueIdx, const float* _coeffs,\r
-//              const float* _whitept, bool _srgb)\r
-//    : srccn(_srccn), srgb(_srgb)\r
-//    {\r
-//        initLabTabs();\r
-//\r
-//        if(!_coeffs) _coeffs = sRGB2XYZ_D65;\r
-//        if(!_whitept) _whitept = D65;\r
-//        float scale[] =\r
-//        {\r
-//            (1 << lab_shift)/_whitept[0],\r
-//            (float)(1 << lab_shift),\r
-//            (1 << lab_shift)/_whitept[2]\r
-//        };\r
-//\r
-//        for( int i = 0; i < 3; i++ )\r
-//        {\r
-//            coeffs[i*3+(blueIdx^2)] = cvRound(_coeffs[i*3]*scale[i]);\r
-//            coeffs[i*3+1] = cvRound(_coeffs[i*3+1]*scale[i]);\r
-//            coeffs[i*3+blueIdx] = cvRound(_coeffs[i*3+2]*scale[i]);\r
-//            CV_Assert( coeffs[i] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&\r
-//                      coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift) );\r
-//        }\r
-//    }\r
-//\r
-//    void operator()(const uchar* src, uchar* dst, int n) const\r
-//    {\r
-//        const int Lscale = (116*255+50)/100;\r
-//        const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);\r
-//        const unsigned short* tab = srgb ? sRGBGammaTab_b : linearGammaTab_b;\r
-//        int i, scn = srccn;\r
-//        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],\r
-//            C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],\r
-//            C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];\r
-//        n *= 3;\r
-//\r
-//        for( i = 0; i < n; i += 3, src += scn )\r
-//        {\r
-//            int R = tab[src[0]], G = tab[src[1]], B = tab[src[2]];\r
-//            int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];\r
-//            int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)];\r
-//            int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];\r
-//\r
-//            int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );\r
-//            int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );\r
-//            int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );\r
-//\r
-//            dst[i] = saturate_cast<uchar>(L);\r
-//            dst[i+1] = saturate_cast<uchar>(a);\r
-//            dst[i+2] = saturate_cast<uchar>(b);\r
-//        }\r
-//    }\r
-//\r
-//    int srccn;\r
-//    int coeffs[9];\r
-//    bool srgb;\r
-//};\r
-//\r
-//\r
-//struct RGB2Lab_f\r
-//{\r
-//    typedef float channel_type;\r
-//\r
-//    RGB2Lab_f(int _srccn, int blueIdx, const float* _coeffs,\r
-//              const float* _whitept, bool _srgb)\r
-//    : srccn(_srccn), srgb(_srgb)\r
-//    {\r
-//        initLabTabs();\r
-//\r
-//        if(!_coeffs) _coeffs = sRGB2XYZ_D65;\r
-//        if(!_whitept) _whitept = D65;\r
-//        float scale[] = { LabCbrtTabScale/_whitept[0], LabCbrtTabScale, LabCbrtTabScale/_whitept[2] };\r
-//\r
-//        for( int i = 0; i < 3; i++ )\r
-//        {\r
-//            coeffs[i*3+(blueIdx^2)] = _coeffs[i*3]*scale[i];\r
-//            coeffs[i*3+1] = _coeffs[i*3+1]*scale[i];\r
-//            coeffs[i*3+blueIdx] = _coeffs[i*3+2]*scale[i];\r
-//            CV_Assert( coeffs[i*3] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&\r
-//                       coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 1.5f*LabCbrtTabScale );\r
-//        }\r
-//    }\r
-//\r
-//    void operator()(const float* src, float* dst, int n) const\r
-//    {\r
-//        int i, scn = srccn;\r
-//        float gscale = GammaTabScale;\r
-//        const float* gammaTab = srgb ? sRGBGammaTab : 0;\r
-//        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],\r
-//              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],\r
-//              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];\r
-//        n *= 3;\r
-//\r
-//        for( i = 0; i < n; i += 3, src += scn )\r
-//        {\r
-//            float R = src[0], G = src[1], B = src[2];\r
-//            if( gammaTab )\r
-//            {\r
-//                R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);\r
-//                G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);\r
-//                B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);\r
-//            }\r
-//            float fX = splineInterpolate(R*C0 + G*C1 + B*C2, LabCbrtTab, LAB_CBRT_TAB_SIZE);\r
-//            float fY = splineInterpolate(R*C3 + G*C4 + B*C5, LabCbrtTab, LAB_CBRT_TAB_SIZE);\r
-//            float fZ = splineInterpolate(R*C6 + G*C7 + B*C8, LabCbrtTab, LAB_CBRT_TAB_SIZE);\r
-//\r
-//            float L = 116.f*fY - 16.f;\r
-//            float a = 500.f*(fX - fY);\r
-//            float b = 200.f*(fY - fZ);\r
-//\r
-//            dst[i] = L; dst[i+1] = a; dst[i+2] = b;\r
-//        }\r
-//    }\r
-//\r
-//    int srccn;\r
-//    float coeffs[9];\r
-//    bool srgb;\r
-//};\r
-//\r
-//\r
-//struct Lab2RGB_f\r
-//{\r
-//    typedef float channel_type;\r
-//\r
-//    Lab2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,\r
-//               const float* _whitept, bool _srgb )\r
-//    : dstcn(_dstcn), srgb(_srgb)\r
-//    {\r
-//        initLabTabs();\r
-//\r
-//        if(!_coeffs) _coeffs = XYZ2sRGB_D65;\r
-//        if(!_whitept) _whitept = D65;\r
-//\r
-//        for( int i = 0; i < 3; i++ )\r
-//        {\r
-//            coeffs[i+(blueIdx^2)*3] = _coeffs[i]*_whitept[i];\r
-//            coeffs[i+3] = _coeffs[i+3]*_whitept[i];\r
-//            coeffs[i+blueIdx*3] = _coeffs[i+6]*_whitept[i];\r
-//        }\r
-//    }\r
-//\r
-//    void operator()(const float* src, float* dst, int n) const\r
-//    {\r
-//        int i, dcn = dstcn;\r
-//        const float* gammaTab = srgb ? sRGBInvGammaTab : 0;\r
-//        float gscale = GammaTabScale;\r
-//        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],\r
-//              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],\r
-//              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];\r
-//        float alpha = ColorChannel<float>::max();\r
-//        n *= 3;\r
-//\r
-//        for( i = 0; i < n; i += 3, dst += dcn )\r
-//        {\r
-//            float L = src[i], a = src[i+1], b = src[i+2];\r
-//            float Y = (L + 16.f)*(1.f/116.f);\r
-//            float X = (Y + a*0.002f);\r
-//            float Z = (Y - b*0.005f);\r
-//            Y = Y*Y*Y;\r
-//            X = X*X*X;\r
-//            Z = Z*Z*Z;\r
-//\r
-//            float R = X*C0 + Y*C1 + Z*C2;\r
-//            float G = X*C3 + Y*C4 + Z*C5;\r
-//            float B = X*C6 + Y*C7 + Z*C8;\r
-//\r
-//            if( gammaTab )\r
-//            {\r
-//                R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);\r
-//                G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);\r
-//                B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);\r
-//            }\r
-//\r
-//            dst[0] = R; dst[1] = G; dst[2] = B;\r
-//            if( dcn == 4 )\r
-//                dst[3] = alpha;\r
-//        }\r
-//    }\r
-//\r
-//    int dstcn;\r
-//    float coeffs[9];\r
-//    bool srgb;\r
-//};\r
-//\r
-//\r
-//struct Lab2RGB_b\r
-//{\r
-//    typedef uchar channel_type;\r
-//\r
-//    Lab2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,\r
-//               const float* _whitept, bool _srgb )\r
-//    : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb ) {}\r
-//\r
-//    void operator()(const uchar* src, uchar* dst, int n) const\r
-//    {\r
-//        int i, j, dcn = dstcn;\r
-//        uchar alpha = ColorChannel<uchar>::max();\r
-//        float buf[3*BLOCK_SIZE];\r
-//\r
-//        for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )\r
-//        {\r
-//            int dn = std::min(n - i, (int)BLOCK_SIZE);\r
-//\r
-//            for( j = 0; j < dn*3; j += 3 )\r
-//            {\r
-//                buf[j] = src[j]*(100.f/255.f);\r
-//                buf[j+1] = (float)(src[j+1] - 128);\r
-//                buf[j+2] = (float)(src[j+2] - 128);\r
-//            }\r
-//            cvt(buf, buf, dn);\r
-//\r
-//            for( j = 0; j < dn*3; j += 3, dst += dcn )\r
-//            {\r
-//                dst[0] = saturate_cast<uchar>(buf[j]*255.f);\r
-//                dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);\r
-//                dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);\r
-//                if( dcn == 4 )\r
-//                    dst[3] = alpha;\r
-//            }\r
-//        }\r
-//    }\r
-//\r
-//    int dstcn;\r
-//    Lab2RGB_f cvt;\r
-//};\r
-//\r
-//\r
-/////////////////////////////////////// RGB <-> L*u*v* /////////////////////////////////////\r
-//\r
-//struct RGB2Luv_f\r
-//{\r
-//    typedef float channel_type;\r
-//\r
-//    RGB2Luv_f( int _srccn, int blueIdx, const float* _coeffs,\r
-//               const float* whitept, bool _srgb )\r
-//    : srccn(_srccn), srgb(_srgb)\r
-//    {\r
-//        initLabTabs();\r
-//\r
-//        if(!_coeffs) _coeffs = sRGB2XYZ_D65;\r
-//        if(!whitept) whitept = D65;\r
-//\r
-//        for( int i = 0; i < 3; i++ )\r
-//        {\r
-//            coeffs[i*3+(blueIdx^2)] = _coeffs[i*3];\r
-//            coeffs[i*3+1] = _coeffs[i*3+1];\r
-//            coeffs[i*3+blueIdx] = _coeffs[i*3+2];\r
-//            CV_Assert( coeffs[i*3] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&\r
-//                      coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 1.5f );\r
-//        }\r
-//\r
-//        float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);\r
-//        un = 4*whitept[0]*d;\r
-//        vn = 9*whitept[1]*d;\r
-//\r
-//        CV_Assert(whitept[1] == 1.f);\r
-//    }\r
-//\r
-//    void operator()(const float* src, float* dst, int n) const\r
-//    {\r
-//        int i, scn = srccn;\r
-//        float gscale = GammaTabScale;\r
-//        const float* gammaTab = srgb ? sRGBGammaTab : 0;\r
-//        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],\r
-//              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],\r
-//              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];\r
-//        float _un = 13*un, _vn = 13*vn;\r
-//        n *= 3;\r
-//\r
-//        for( i = 0; i < n; i += 3, src += scn )\r
-//        {\r
-//            float R = src[0], G = src[1], B = src[2];\r
-//            if( gammaTab )\r
-//            {\r
-//                R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);\r
-//                G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);\r
-//                B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);\r
-//            }\r
-//\r
-//            float X = R*C0 + G*C1 + B*C2;\r
-//            float Y = R*C3 + G*C4 + B*C5;\r
-//            float Z = R*C6 + G*C7 + B*C8;\r
-//\r
-//            float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);\r
-//            L = 116.f*L - 16.f;\r
-//\r
-//            float d = (4*13) / std::max(X + 15 * Y + 3 * Z, FLT_EPSILON);\r
-//            float u = L*(X*d - _un);\r
-//            float v = L*((9*0.25)*Y*d - _vn);\r
-//\r
-//            dst[i] = L; dst[i+1] = u; dst[i+2] = v;\r
-//        }\r
-//    }\r
-//\r
-//    int srccn;\r
-//    float coeffs[9], un, vn;\r
-//    bool srgb;\r
-//};\r
-//\r
-//\r
-//struct Luv2RGB_f\r
-//{\r
-//    typedef float channel_type;\r
-//\r
-//    Luv2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,\r
-//              const float* whitept, bool _srgb )\r
-//    : dstcn(_dstcn), srgb(_srgb)\r
-//    {\r
-//        initLabTabs();\r
-//\r
-//        if(!_coeffs) _coeffs = XYZ2sRGB_D65;\r
-//        if(!whitept) whitept = D65;\r
-//\r
-//        for( int i = 0; i < 3; i++ )\r
-//        {\r
-//            coeffs[i+(blueIdx^2)*3] = _coeffs[i];\r
-//            coeffs[i+3] = _coeffs[i+3];\r
-//            coeffs[i+blueIdx*3] = _coeffs[i+6];\r
-//        }\r
-//\r
-//        float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);\r
-//        un = 4*whitept[0]*d;\r
-//        vn = 9*whitept[1]*d;\r
-//\r
-//        CV_Assert(whitept[1] == 1.f);\r
-//    }\r
-//\r
-//    void operator()(const float* src, float* dst, int n) const\r
-//    {\r
-//        int i, dcn = dstcn;\r
-//        const float* gammaTab = srgb ? sRGBInvGammaTab : 0;\r
-//        float gscale = GammaTabScale;\r
-//        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],\r
-//              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],\r
-//              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];\r
-//        float alpha = ColorChannel<float>::max();\r
-//        float _un = un, _vn = vn;\r
-//        n *= 3;\r
-//\r
-//        for( i = 0; i < n; i += 3, dst += dcn )\r
-//        {\r
-//            float L = src[i], u = src[i+1], v = src[i+2], d, X, Y, Z;\r
-//            Y = (L + 16.f) * (1.f/116.f);\r
-//            Y = Y*Y*Y;\r
-//            d = (1.f/13.f)/L;\r
-//            u = u*d + _un;\r
-//            v = v*d + _vn;\r
-//            float iv = 1.f/v;\r
-//            X = 2.25f * u * Y * iv ;\r
-//            Z = (12 - 3 * u - 20 * v) * Y * 0.25 * iv;\r
-//\r
-//            float R = X*C0 + Y*C1 + Z*C2;\r
-//            float G = X*C3 + Y*C4 + Z*C5;\r
-//            float B = X*C6 + Y*C7 + Z*C8;\r
-//\r
-//            if( gammaTab )\r
-//            {\r
-//                R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);\r
-//                G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);\r
-//                B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);\r
-//            }\r
-//\r
-//            dst[0] = R; dst[1] = G; dst[2] = B;\r
-//            if( dcn == 4 )\r
-//                dst[3] = alpha;\r
-//        }\r
-//    }\r
-//\r
-//    int dstcn;\r
-//    float coeffs[9], un, vn;\r
-//    bool srgb;\r
-//};\r
-//\r
-//\r
-//struct RGB2Luv_b\r
-//{\r
-//    typedef uchar channel_type;\r
-//\r
-//    RGB2Luv_b( int _srccn, int blueIdx, const float* _coeffs,\r
-//               const float* _whitept, bool _srgb )\r
-//    : srccn(_srccn), cvt(3, blueIdx, _coeffs, _whitept, _srgb) {}\r
-//\r
-//    void operator()(const uchar* src, uchar* dst, int n) const\r
-//    {\r
-//        int i, j, scn = srccn;\r
-//        float buf[3*BLOCK_SIZE];\r
-//\r
-//        for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )\r
-//        {\r
-//            int dn = std::min(n - i, (int)BLOCK_SIZE);\r
-//\r
-//            for( j = 0; j < dn*3; j += 3, src += scn )\r
-//            {\r
-//                buf[j] = src[0]*(1.f/255.f);\r
-//                buf[j+1] = (float)(src[1]*(1.f/255.f));\r
-//                buf[j+2] = (float)(src[2]*(1.f/255.f));\r
-//            }\r
-//            cvt(buf, buf, dn);\r
-//\r
-//            for( j = 0; j < dn*3; j += 3 )\r
-//            {\r
-//                dst[j] = saturate_cast<uchar>(buf[j]*2.55f);\r
-//                dst[j+1] = saturate_cast<uchar>(buf[j+1]*0.72033898305084743f + 96.525423728813564f);\r
-//                dst[j+2] = saturate_cast<uchar>(buf[j+2]*0.99609375f + 139.453125f);\r
-//            }\r
-//        }\r
-//    }\r
-//\r
-//    int srccn;\r
-//    RGB2Luv_f cvt;\r
-//};\r
-//\r
-//\r
-//struct Luv2RGB_b\r
-//{\r
-//    typedef uchar channel_type;\r
-//\r
-//    Luv2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,\r
-//               const float* _whitept, bool _srgb )\r
-//    : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb ) {}\r
-//\r
-//    void operator()(const uchar* src, uchar* dst, int n) const\r
-//    {\r
-//        int i, j, dcn = dstcn;\r
-//        uchar alpha = ColorChannel<uchar>::max();\r
-//        float buf[3*BLOCK_SIZE];\r
-//\r
-//        for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )\r
-//        {\r
-//            int dn = std::min(n - i, (int)BLOCK_SIZE);\r
-//\r
-//            for( j = 0; j < dn*3; j += 3 )\r
-//            {\r
-//                buf[j] = src[j]*(100.f/255.f);\r
-//                buf[j+1] = (float)(src[j+1]*1.388235294117647f - 134.f);\r
-//                buf[j+2] = (float)(src[j+2]*1.003921568627451f - 140.f);\r
-//            }\r
-//            cvt(buf, buf, dn);\r
-//\r
-//            for( j = 0; j < dn*3; j += 3, dst += dcn )\r
-//            {\r
-//                dst[0] = saturate_cast<uchar>(buf[j]*255.f);\r
-//                dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);\r
-//                dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);\r
-//                if( dcn == 4 )\r
-//                    dst[3] = alpha;\r
-//            }\r
-//        }\r
-//    }\r
-//\r
-//    int dstcn;\r
-//    Luv2RGB_f cvt;\r
-//};\r
-//\r
-//\r
-////////////////////////////// Bayer Pattern -> RGB conversion /////////////////////////////\r
-//\r
-//static void Bayer2RGB_8u( const Mat& srcmat, Mat& dstmat, int code )\r
-//{\r
-//    const uchar* bayer0 = srcmat.data;\r
-//    int bayer_step = (int)srcmat.step;\r
-//    uchar* dst0 = dstmat.data;\r
-//    int dst_step = (int)dstmat.step;\r
-//    Size size = srcmat.size();\r
-//    int blue = code == CV_BayerBG2BGR || code == CV_BayerGB2BGR ? -1 : 1;\r
-//    int start_with_green = code == CV_BayerGB2BGR || code == CV_BayerGR2BGR;\r
-//\r
-//    memset( dst0, 0, size.width*3*sizeof(dst0[0]) );\r
-//    memset( dst0 + (size.height - 1)*dst_step, 0, size.width*3*sizeof(dst0[0]) );\r
-//    dst0 += dst_step + 3 + 1;\r
-//    size.height -= 2;\r
-//    size.width -= 2;\r
-//\r
-//    for( ; size.height-- > 0; bayer0 += bayer_step, dst0 += dst_step )\r
-//    {\r
-//        int t0, t1;\r
-//        const uchar* bayer = bayer0;\r
-//        uchar* dst = dst0;\r
-//        const uchar* bayer_end = bayer + size.width;\r
-//\r
-//        dst[-4] = dst[-3] = dst[-2] = dst[size.width*3-1] =\r
-//            dst[size.width*3] = dst[size.width*3+1] = 0;\r
-//\r
-//        if( size.width <= 0 )\r
-//            continue;\r
-//\r
-//        if( start_with_green )\r
-//        {\r
-//            t0 = (bayer[1] + bayer[bayer_step*2+1] + 1) >> 1;\r
-//            t1 = (bayer[bayer_step] + bayer[bayer_step+2] + 1) >> 1;\r
-//            dst[-blue] = (uchar)t0;\r
-//            dst[0] = bayer[bayer_step+1];\r
-//            dst[blue] = (uchar)t1;\r
-//            bayer++;\r
-//            dst += 3;\r
-//        }\r
-//\r
-//        if( blue > 0 )\r
-//        {\r
-//            for( ; bayer <= bayer_end - 2; bayer += 2, dst += 6 )\r
-//            {\r
-//                t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] +\r
-//                      bayer[bayer_step*2+2] + 2) >> 2;\r
-//                t1 = (bayer[1] + bayer[bayer_step] +\r
-//                      bayer[bayer_step+2] + bayer[bayer_step*2+1]+2) >> 2;\r
-//                dst[-1] = (uchar)t0;\r
-//                dst[0] = (uchar)t1;\r
-//                dst[1] = bayer[bayer_step+1];\r
-//\r
-//                t0 = (bayer[2] + bayer[bayer_step*2+2] + 1) >> 1;\r
-//                t1 = (bayer[bayer_step+1] + bayer[bayer_step+3] + 1) >> 1;\r
-//                dst[2] = (uchar)t0;\r
-//                dst[3] = bayer[bayer_step+2];\r
-//                dst[4] = (uchar)t1;\r
-//            }\r
-//        }\r
-//        else\r
-//        {\r
-//            for( ; bayer <= bayer_end - 2; bayer += 2, dst += 6 )\r
-//            {\r
-//                t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] +\r
-//                      bayer[bayer_step*2+2] + 2) >> 2;\r
-//                t1 = (bayer[1] + bayer[bayer_step] +\r
-//                      bayer[bayer_step+2] + bayer[bayer_step*2+1]+2) >> 2;\r
-//                dst[1] = (uchar)t0;\r
-//                dst[0] = (uchar)t1;\r
-//                dst[-1] = bayer[bayer_step+1];\r
-//\r
-//                t0 = (bayer[2] + bayer[bayer_step*2+2] + 1) >> 1;\r
-//                t1 = (bayer[bayer_step+1] + bayer[bayer_step+3] + 1) >> 1;\r
-//                dst[4] = (uchar)t0;\r
-//                dst[3] = bayer[bayer_step+2];\r
-//                dst[2] = (uchar)t1;\r
-//            }\r
-//        }\r
-//\r
-//        if( bayer < bayer_end )\r
-//        {\r
-//            t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] +\r
-//                  bayer[bayer_step*2+2] + 2) >> 2;\r
-//            t1 = (bayer[1] + bayer[bayer_step] +\r
-//                  bayer[bayer_step+2] + bayer[bayer_step*2+1]+2) >> 2;\r
-//            dst[-blue] = (uchar)t0;\r
-//            dst[0] = (uchar)t1;\r
-//            dst[blue] = bayer[bayer_step+1];\r
-//            bayer++;\r
-//            dst += 3;\r
-//        }\r
-//\r
-//        blue = -blue;\r
-//        start_with_green = !start_with_green;\r
-//    }\r
-//}\r
-//\r
-//\r
-///////////////////// Demosaicing using Variable Number of Gradients ///////////////////////\r
-//\r
-//static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )\r
-//{\r
-//    const uchar* bayer = srcmat.data;\r
-//    int bstep = (int)srcmat.step;\r
-//    uchar* dst = dstmat.data;\r
-//    int dststep = (int)dstmat.step;\r
-//    Size size = srcmat.size();\r
-//\r
-//    int blueIdx = code == CV_BayerBG2BGR_VNG || code == CV_BayerGB2BGR_VNG ? 0 : 2;\r
-//    bool greenCell0 = code != CV_BayerBG2BGR_VNG && code != CV_BayerRG2BGR_VNG;\r
-//\r
-//    // for too small images use the simple interpolation algorithm\r
-//    if( MIN(size.width, size.height) < 8 )\r
-//    {\r
-//        Bayer2RGB_8u( srcmat, dstmat, code );\r
-//        return;\r
-//    }\r
-//\r
-//    const int brows = 3, bcn = 7;\r
-//    int N = size.width, N2 = N*2, N3 = N*3, N4 = N*4, N5 = N*5, N6 = N*6, N7 = N*7;\r
-//    int i, bufstep = N7*bcn;\r
-//    cv::AutoBuffer<unsigned short> _buf(bufstep*brows);\r
-//    unsigned short* buf = (unsigned short*)_buf;\r
-//\r
-//    bayer += bstep*2;\r
-//\r
-//#if CV_SSE2\r
-//    bool haveSSE = cv::checkHardwareSupport(CV_CPU_SSE2);\r
-//    #define _mm_absdiff_epu16(a,b) _mm_adds_epu16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a))\r
-//#endif\r
-//\r
-//    for( int y = 2; y < size.height - 4; y++ )\r
-//    {\r
-//        uchar* dstrow = dst + dststep*y + 6;\r
-//        const uchar* srow;\r
-//\r
-//        for( int dy = (y == 2 ? -1 : 1); dy <= 1; dy++ )\r
-//        {\r
-//            unsigned short* brow = buf + ((y + dy - 1)%brows)*bufstep + 1;\r
-//            srow = bayer + (y+dy)*bstep + 1;\r
-//\r
-//            for( i = 0; i < bcn; i++ )\r
-//                brow[N*i-1] = brow[(N-2) + N*i] = 0;\r
-//\r
-//            i = 1;\r
-//\r
-//#if CV_SSE2\r
-//            if( haveSSE )\r
-//            {\r
-//                __m128i z = _mm_setzero_si128();\r
-//                for( ; i <= N-9; i += 8, srow += 8, brow += 8 )\r
-//                {\r
-//                    __m128i s1, s2, s3, s4, s6, s7, s8, s9;\r
-//\r
-//                    s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1-bstep)),z);\r
-//                    s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep)),z);\r
-//                    s3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1-bstep)),z);\r
-//\r
-//                    s4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1)),z);\r
-//                    s6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1)),z);\r
-//\r
-//                    s7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1+bstep)),z);\r
-//                    s8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep)),z);\r
-//                    s9 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1+bstep)),z);\r
-//\r
-//                    __m128i b0, b1, b2, b3, b4, b5, b6;\r
-//\r
-//                    b0 = _mm_adds_epu16(_mm_slli_epi16(_mm_absdiff_epu16(s2,s8),1),\r
-//                                        _mm_adds_epu16(_mm_absdiff_epu16(s1, s7),\r
-//                                                       _mm_absdiff_epu16(s3, s9)));\r
-//                    b1 = _mm_adds_epu16(_mm_slli_epi16(_mm_absdiff_epu16(s4,s6),1),\r
-//                                        _mm_adds_epu16(_mm_absdiff_epu16(s1, s3),\r
-//                                                       _mm_absdiff_epu16(s7, s9)));\r
-//                    b2 = _mm_slli_epi16(_mm_absdiff_epu16(s3,s7),1);\r
-//                    b3 = _mm_slli_epi16(_mm_absdiff_epu16(s1,s9),1);\r
-//\r
-//                    _mm_storeu_si128((__m128i*)brow, b0);\r
-//                    _mm_storeu_si128((__m128i*)(brow + N), b1);\r
-//                    _mm_storeu_si128((__m128i*)(brow + N2), b2);\r
-//                    _mm_storeu_si128((__m128i*)(brow + N3), b3);\r
-//\r
-//                    b4 = _mm_adds_epu16(b2,_mm_adds_epu16(_mm_absdiff_epu16(s2, s4),\r
-//                                                          _mm_absdiff_epu16(s6, s8)));\r
-//                    b5 = _mm_adds_epu16(b3,_mm_adds_epu16(_mm_absdiff_epu16(s2, s6),\r
-//                                                          _mm_absdiff_epu16(s4, s8)));\r
-//                    b6 = _mm_adds_epu16(_mm_adds_epu16(s2, s4), _mm_adds_epu16(s6, s8));\r
-//                    b6 = _mm_srli_epi16(b6, 1);\r
-//\r
-//                    _mm_storeu_si128((__m128i*)(brow + N4), b4);\r
-//                    _mm_storeu_si128((__m128i*)(brow + N5), b5);\r
-//                    _mm_storeu_si128((__m128i*)(brow + N6), b6);\r
-//                }\r
-//            }\r
-//#endif\r
-//\r
-//            for( ; i < N-1; i++, srow++, brow++ )\r
-//            {\r
-//                brow[0] = (unsigned short)(std::abs(srow[-1-bstep] - srow[-1+bstep]) +\r
-//                                   std::abs(srow[-bstep] - srow[+bstep])*2 +\r
-//                                   std::abs(srow[1-bstep] - srow[1+bstep]));\r
-//                brow[N] = (unsigned short)(std::abs(srow[-1-bstep] - srow[1-bstep]) +\r
-//                                   std::abs(srow[-1] - srow[1])*2 +\r
-//                                   std::abs(srow[-1+bstep] - srow[1+bstep]));\r
-//                brow[N2] = (unsigned short)(std::abs(srow[+1-bstep] - srow[-1+bstep])*2);\r
-//                brow[N3] = (unsigned short)(std::abs(srow[-1-bstep] - srow[1+bstep])*2);\r
-//                brow[N4] = (unsigned short)(brow[N2] + std::abs(srow[-bstep] - srow[-1]) +\r
-//                                    std::abs(srow[+bstep] - srow[1]));\r
-//                brow[N5] = (unsigned short)(brow[N3] + std::abs(srow[-bstep] - srow[1]) +\r
-//                                    std::abs(srow[+bstep] - srow[-1]));\r
-//                brow[N6] = (unsigned short)((srow[-bstep] + srow[-1] + srow[1] + srow[+bstep])>>1);\r
-//            }\r
-//        }\r
-//\r
-//        const unsigned short* brow0 = buf + ((y - 2) % brows)*bufstep + 2;\r
-//        const unsigned short* brow1 = buf + ((y - 1) % brows)*bufstep + 2;\r
-//        const unsigned short* brow2 = buf + (y % brows)*bufstep + 2;\r
-//        static const float scale[] = { 0.f, 0.5f, 0.25f, 0.1666666666667f, 0.125f, 0.1f, 0.08333333333f, 0.0714286f, 0.0625f };\r
-//        srow = bayer + y*bstep + 2;\r
-//        bool greenCell = greenCell0;\r
-//\r
-//        i = 2;\r
-//#if CV_SSE2\r
-//        int limit = !haveSSE ? N-2 : greenCell ? std::min(3, N-2) : 2;\r
-//#else\r
-//        int limit = N - 2;\r
-//#endif\r
-//\r
-//        do\r
-//        {\r
-//            for( ; i < limit; i++, srow++, brow0++, brow1++, brow2++, dstrow += 3 )\r
-//            {\r
-//                int gradN = brow0[0] + brow1[0];\r
-//                int gradS = brow1[0] + brow2[0];\r
-//                int gradW = brow1[N-1] + brow1[N];\r
-//                int gradE = brow1[N] + brow1[N+1];\r
-//                int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE);\r
-//                int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE);\r
-//                int R, G, B;\r
-//\r
-//                if( !greenCell )\r
-//                {\r
-//                    int gradNE = brow0[N4+1] + brow1[N4];\r
-//                    int gradSW = brow1[N4] + brow2[N4-1];\r
-//                    int gradNW = brow0[N5-1] + brow1[N5];\r
-//                    int gradSE = brow1[N5] + brow2[N5+1];\r
-//\r
-//                    minGrad = std::min(std::min(std::min(std::min(minGrad, gradNE), gradSW), gradNW), gradSE);\r
-//                    maxGrad = std::max(std::max(std::max(std::max(maxGrad, gradNE), gradSW), gradNW), gradSE);\r
-//                    int T = minGrad + maxGrad/2;\r
-//\r
-//                    int Rs = 0, Gs = 0, Bs = 0, ng = 0;\r
-//                    if( gradN < T )\r
-//                    {\r
-//                        Rs += srow[-bstep*2] + srow[0];\r
-//                        Gs += srow[-bstep]*2;\r
-//                        Bs += srow[-bstep-1] + srow[-bstep+1];\r
-//                        ng++;\r
-//                    }\r
-//                    if( gradS < T )\r
-//                    {\r
-//                        Rs += srow[bstep*2] + srow[0];\r
-//                        Gs += srow[bstep]*2;\r
-//                        Bs += srow[bstep-1] + srow[bstep+1];\r
-//                        ng++;\r
-//                    }\r
-//                    if( gradW < T )\r
-//                    {\r
-//                        Rs += srow[-2] + srow[0];\r
-//                        Gs += srow[-1]*2;\r
-//                        Bs += srow[-bstep-1] + srow[bstep-1];\r
-//                        ng++;\r
-//                    }\r
-//                    if( gradE < T )\r
-//                    {\r
-//                        Rs += srow[2] + srow[0];\r
-//                        Gs += srow[1]*2;\r
-//                        Bs += srow[-bstep+1] + srow[bstep+1];\r
-//                        ng++;\r
-//                    }\r
-//                    if( gradNE < T )\r
-//                    {\r
-//                        Rs += srow[-bstep*2+2] + srow[0];\r
-//                        Gs += brow0[N6+1];\r
-//                        Bs += srow[-bstep+1]*2;\r
-//                        ng++;\r
-//                    }\r
-//                    if( gradSW < T )\r
-//                    {\r
-//                        Rs += srow[bstep*2-2] + srow[0];\r
-//                        Gs += brow2[N6-1];\r
-//                        Bs += srow[bstep-1]*2;\r
-//                        ng++;\r
-//                    }\r
-//                    if( gradNW < T )\r
-//                    {\r
-//                        Rs += srow[-bstep*2-2] + srow[0];\r
-//                        Gs += brow0[N6-1];\r
-//                        Bs += srow[-bstep+1]*2;\r
-//                        ng++;\r
-//                    }\r
-//                    if( gradSE < T )\r
-//                    {\r
-//                        Rs += srow[bstep*2+2] + srow[0];\r
-//                        Gs += brow2[N6+1];\r
-//                        Bs += srow[-bstep+1]*2;\r
-//                        ng++;\r
-//                    }\r
-//                    R = srow[0];\r
-//                    G = R + cvRound((Gs - Rs)*scale[ng]);\r
-//                    B = R + cvRound((Bs - Rs)*scale[ng]);\r
-//                }\r
-//                else\r
-//                {\r
-//                    int gradNE = brow0[N2] + brow0[N2+1] + brow1[N2] + brow1[N2+1];\r
-//                    int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1];\r
-//                    int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1];\r
-//                    int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1];\r
-//\r
-//                    minGrad = std::min(std::min(std::min(std::min(minGrad, gradNE), gradSW), gradNW), gradSE);\r
-//                    maxGrad = std::max(std::max(std::max(std::max(maxGrad, gradNE), gradSW), gradNW), gradSE);\r
-//                    int T = minGrad + maxGrad/2;\r
-//\r
-//                    int Rs = 0, Gs = 0, Bs = 0, ng = 0;\r
-//                    if( gradN < T )\r
-//                    {\r
-//                        Rs += srow[-bstep*2-1] + srow[-bstep*2+1];\r
-//                        Gs += srow[-bstep*2] + srow[0];\r
-//                        Bs += srow[-bstep]*2;\r
-//                        ng++;\r
-//                    }\r
-//                    if( gradS < T )\r
-//                    {\r
-//                        Rs += srow[bstep*2-1] + srow[bstep*2+1];\r
-//                        Gs += srow[bstep*2] + srow[0];\r
-//                        Bs += srow[bstep]*2;\r
-//                        ng++;\r
-//                    }\r
-//                    if( gradW < T )\r
-//                    {\r
-//                        Rs += srow[-1]*2;\r
-//                        Gs += srow[-2] + srow[0];\r
-//                        Bs += srow[-bstep-2]+srow[bstep-2];\r
-//                        ng++;\r
-//                    }\r
-//                    if( gradE < T )\r
-//                    {\r
-//                        Rs += srow[1]*2;\r
-//                        Gs += srow[2] + srow[0];\r
-//                        Bs += srow[-bstep+2]+srow[bstep+2];\r
-//                        ng++;\r
-//                    }\r
-//                    if( gradNE < T )\r
-//                    {\r
-//                        Rs += srow[-bstep*2+1] + srow[1];\r
-//                        Gs += srow[-bstep+1]*2;\r
-//                        Bs += srow[-bstep] + srow[-bstep+2];\r
-//                        ng++;\r
-//                    }\r
-//                    if( gradSW < T )\r
-//                    {\r
-//                        Rs += srow[bstep*2-1] + srow[-1];\r
-//                        Gs += srow[bstep-1]*2;\r
-//                        Bs += srow[bstep] + srow[bstep-2];\r
-//                        ng++;\r
-//                    }\r
-//                    if( gradNW < T )\r
-//                    {\r
-//                        Rs += srow[-bstep*2-1] + srow[-1];\r
-//                        Gs += srow[-bstep-1]*2;\r
-//                        Bs += srow[-bstep-2]+srow[-bstep];\r
-//                        ng++;\r
-//                    }\r
-//                    if( gradSE < T )\r
-//                    {\r
-//                        Rs += srow[bstep*2+1] + srow[1];\r
-//                        Gs += srow[bstep+1]*2;\r
-//                        Bs += srow[bstep+2]+srow[bstep];\r
-//                        ng++;\r
-//                    }\r
-//                    G = srow[0];\r
-//                    R = G + cvRound((Rs - Gs)*scale[ng]);\r
-//                    B = G + cvRound((Bs - Gs)*scale[ng]);\r
-//                }\r
-//                dstrow[blueIdx] = CV_CAST_8U(B);\r
-//                dstrow[1] = CV_CAST_8U(G);\r
-//                dstrow[blueIdx^2] = CV_CAST_8U(R);\r
-//                greenCell = !greenCell;\r
-//            }\r
-//\r
-//#if CV_SSE2\r
-//            if( !haveSSE )\r
-//                break;\r
-//\r
-//            __m128i emask = _mm_set1_epi32(0x0000ffff),\r
-//            omask = _mm_set1_epi32(0xffff0000),\r
-//            z = _mm_setzero_si128();\r
-//            __m128 _0_5 = _mm_set1_ps(0.5f);\r
-//\r
-//            #define _mm_merge_epi16(a, b) _mm_or_si128(_mm_and_si128(a, emask), _mm_and_si128(b, omask))\r
-//            #define _mm_cvtloepi16_ps(a)  _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(a,a), 16))\r
-//            #define _mm_cvthiepi16_ps(a)  _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(a,a), 16))\r
-//\r
-//            // process 8 pixels at once\r
-//            for( ; i <= N - 10; i += 8, srow += 8, brow0 += 8, brow1 += 8, brow2 += 8 )\r
-//            {\r
-//                __m128i gradN, gradS, gradW, gradE, gradNE, gradSW, gradNW, gradSE;\r
-//                gradN = _mm_adds_epu16(_mm_loadu_si128((__m128i*)brow0),\r
-//                                       _mm_loadu_si128((__m128i*)brow1));\r
-//                gradS = _mm_adds_epu16(_mm_loadu_si128((__m128i*)brow1),\r
-//                                       _mm_loadu_si128((__m128i*)brow2));\r
-//                gradW = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N-1)),\r
-//                                       _mm_loadu_si128((__m128i*)(brow1+N)));\r
-//                gradE = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N+1)),\r
-//                                       _mm_loadu_si128((__m128i*)(brow1+N)));\r
-//\r
-//                __m128i minGrad, maxGrad, T;\r
-//                minGrad = _mm_min_epi16(_mm_min_epi16(_mm_min_epi16(gradN, gradS), gradW), gradE);\r
-//                maxGrad = _mm_max_epi16(_mm_max_epi16(_mm_max_epi16(gradN, gradS), gradW), gradE);\r
-//\r
-//                __m128i grad0, grad1;\r
-//\r
-//                grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N4+1)),\r
-//                                       _mm_loadu_si128((__m128i*)(brow1+N4)));\r
-//                grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N2)),\r
-//                                                      _mm_loadu_si128((__m128i*)(brow0+N2+1))),\r
-//                                       _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N2)),\r
-//                                                      _mm_loadu_si128((__m128i*)(brow1+N2+1))));\r
-//                gradNE = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1);\r
-//\r
-//                grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N4-1)),\r
-//                                       _mm_loadu_si128((__m128i*)(brow1+N4)));\r
-//                grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N2)),\r
-//                                                      _mm_loadu_si128((__m128i*)(brow2+N2-1))),\r
-//                                       _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N2)),\r
-//                                                      _mm_loadu_si128((__m128i*)(brow1+N2-1))));\r
-//                gradSW = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1);\r
-//\r
-//                minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNE), gradSW);\r
-//                maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNE), gradSW);\r
-//\r
-//                grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N5-1)),\r
-//                                       _mm_loadu_si128((__m128i*)(brow1+N5)));\r
-//                grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N3)),\r
-//                                                      _mm_loadu_si128((__m128i*)(brow0+N3-1))),\r
-//                                       _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N3)),\r
-//                                                      _mm_loadu_si128((__m128i*)(brow1+N3-1))));\r
-//                gradNW = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1);\r
-//\r
-//                grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N5+1)),\r
-//                                       _mm_loadu_si128((__m128i*)(brow1+N5)));\r
-//                grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N3)),\r
-//                                                      _mm_loadu_si128((__m128i*)(brow2+N3+1))),\r
-//                                       _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N3)),\r
-//                                                      _mm_loadu_si128((__m128i*)(brow1+N3+1))));\r
-//                gradSE = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1);\r
-//\r
-//                minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNW), gradSE);\r
-//                maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNW), gradSE);\r
-//\r
-//                T = _mm_add_epi16(_mm_srli_epi16(maxGrad, 1), minGrad);\r
-//                __m128i RGs = z, GRs = z, Bs = z, ng = z, mask;\r
-//\r
-//                __m128i t0, t1, x0, x1, x2, x3, x4, x5, x6, x7, x8,\r
-//                x9, x10, x11, x12, x13, x14, x15, x16;\r
-//\r
-//                x0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)srow), z);\r
-//\r
-//                x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep-1)), z);\r
-//                x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2-1)), z);\r
-//                x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep)), z);\r
-//                x4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2+1)), z);\r
-//                x5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep+1)), z);\r
-//                x6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep+2)), z);\r
-//                x7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1)), z);\r
-//                x8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep+2)), z);\r
-//                x9 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep+1)), z);\r
-//                x10 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2+1)), z);\r
-//                x11 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep)), z);\r
-//                x12 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2-1)), z);\r
-//                x13 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep-1)), z);\r
-//                x14 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep-2)), z);\r
-//                x15 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1)), z);\r
-//                x16 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep-2)), z);\r
-//\r
-//                // gradN\r
-//                mask = _mm_cmpgt_epi16(T, gradN);\r
-//                ng = _mm_sub_epi16(ng, mask);\r
-//\r
-//                t0 = _mm_slli_epi16(x3, 1);\r
-//                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2)), z), x0);\r
-//\r
-//                RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask));\r
-//                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epu16(x2,x4)), mask));\r
-//                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x1,x5), t0), mask));\r
-//\r
-//                // gradNE\r
-//                mask = _mm_cmpgt_epi16(T, gradNE);\r
-//                ng = _mm_sub_epi16(ng, mask);\r
-//\r
-//                t0 = _mm_slli_epi16(x5, 1);\r
-//                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2+2)), z), x0);\r
-//\r
-//                RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));\r
-//                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6+1)),\r
-//                                                                        _mm_adds_epu16(x4,x7)), mask));\r
-//                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epu16(x3,x6)), mask));\r
-//\r
-//                // gradE\r
-//                mask = _mm_cmpgt_epi16(T, gradE);\r
-//                ng = _mm_sub_epi16(ng, mask);\r
-//\r
-//                t0 = _mm_slli_epi16(x7, 1);\r
-//                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+2)), z), x0);\r
-//\r
-//                RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask));\r
-//                GRs = _mm_adds_epu16(GRs, _mm_and_si128(t0, mask));\r
-//                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x5,x9),\r
-//                                                                      _mm_adds_epu16(x6,x8)), mask));\r
-//\r
-//                // gradSE\r
-//                mask = _mm_cmpgt_epi16(T, gradSE);\r
-//                ng = _mm_sub_epi16(ng, mask);\r
-//\r
-//                t0 = _mm_slli_epi16(x9, 1);\r
-//                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2+2)), z), x0);\r
-//\r
-//                RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));\r
-//                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6+1)),\r
-//                                                                        _mm_adds_epu16(x7,x10)), mask));\r
-//                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epu16(x8,x11)), mask));\r
-//\r
-//                // gradS\r
-//                mask = _mm_cmpgt_epi16(T, gradS);\r
-//                ng = _mm_sub_epi16(ng, mask);\r
-//\r
-//                t0 = _mm_slli_epi16(x11, 1);\r
-//                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2)), z), x0);\r
-//\r
-//                RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask));\r
-//                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epu16(x10,x12)), mask));\r
-//                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x9,x13), t0), mask));\r
-//\r
-//                // gradSW\r
-//                mask = _mm_cmpgt_epi16(T, gradSW);\r
-//                ng = _mm_sub_epi16(ng, mask);\r
-//\r
-//                t0 = _mm_slli_epi16(x13, 1);\r
-//                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2-2)), z), x0);\r
-//\r
-//                RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));\r
-//                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6-1)),\r
-//                                                                        _mm_adds_epu16(x12,x15)), mask));\r
-//                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epu16(x11,x14)), mask));\r
-//\r
-//                // gradW\r
-//                mask = _mm_cmpgt_epi16(T, gradW);\r
-//                ng = _mm_sub_epi16(ng, mask);\r
-//\r
-//                t0 = _mm_slli_epi16(x15, 1);\r
-//                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-2)), z), x0);\r
-//\r
-//                RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask));\r
-//                GRs = _mm_adds_epu16(GRs, _mm_and_si128(t0, mask));\r
-//                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x1,x13),\r
-//                                                                      _mm_adds_epu16(x14,x16)), mask));\r
-//\r
-//                // gradNW\r
-//                mask = _mm_cmpgt_epi16(T, gradNW);\r
-//                ng = _mm_sub_epi16(ng, mask);\r
-//\r
-//                __m128 ngf0, ngf1;\r
-//                ngf0 = _mm_div_ps(_0_5, _mm_cvtloepi16_ps(ng));\r
-//                ngf1 = _mm_div_ps(_0_5, _mm_cvthiepi16_ps(ng));\r
-//\r
-//                t0 = _mm_slli_epi16(x1, 1);\r
-//                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2-2)), z), x0);\r
-//\r
-//                RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));\r
-//                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6-1)),\r
-//                                                                        _mm_adds_epu16(x2,x15)), mask));\r
-//                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epu16(x3,x16)), mask));\r
-//\r
-//                // now interpolate r, g & b\r
-//                t0 = _mm_sub_epi16(GRs, RGs);\r
-//                t1 = _mm_sub_epi16(Bs, RGs);\r
-//\r
-//                t0 = _mm_add_epi16(x0, _mm_packs_epi32(\r
-//                                                       _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtloepi16_ps(t0), ngf0)),\r
-//                                                       _mm_cvtps_epi32(_mm_mul_ps(_mm_cvthiepi16_ps(t0), ngf1))));\r
-//\r
-//                t1 = _mm_add_epi16(x0, _mm_packs_epi32(\r
-//                                                       _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtloepi16_ps(t1), ngf0)),\r
-//                                                       _mm_cvtps_epi32(_mm_mul_ps(_mm_cvthiepi16_ps(t1), ngf1))));\r
-//\r
-//                x1 = _mm_merge_epi16(x0, t0);\r
-//                x2 = _mm_merge_epi16(t0, x0);\r
-//\r
-//                uchar R[8], G[8], B[8];\r
-//\r
-//                _mm_storel_epi64(blueIdx ? (__m128i*)B : (__m128i*)R, _mm_packus_epi16(x1, z));\r
-//                _mm_storel_epi64((__m128i*)G, _mm_packus_epi16(x2, z));\r
-//                _mm_storel_epi64(blueIdx ? (__m128i*)R : (__m128i*)B, _mm_packus_epi16(t1, z));\r
-//\r
-//                for( int j = 0; j < 8; j++, dstrow += 3 )\r
-//                {\r
-//                    dstrow[0] = B[j]; dstrow[1] = G[j]; dstrow[2] = R[j];\r
-//                }\r
-//            }\r
-//#endif\r
-//\r
-//            limit = N - 2;\r
-//        }\r
-//        while( i < N - 2 );\r
-//\r
-//        for( i = 0; i < 6; i++ )\r
-//        {\r
-//            dst[dststep*y + 5 - i] = dst[dststep*y + 8 - i];\r
-//            dst[dststep*y + (N - 2)*3 + i] = dst[dststep*y + (N - 3)*3 + i];\r
-//        }\r
-//\r
-//        greenCell0 = !greenCell0;\r
-//        blueIdx ^= 2;\r
-//    }\r
-//\r
-//    for( i = 0; i < size.width*3; i++ )\r
-//    {\r
-//        dst[i] = dst[i + dststep] = dst[i + dststep*2];\r
-//        dst[i + dststep*(size.height-4)] =\r
-//        dst[i + dststep*(size.height-3)] =\r
-//        dst[i + dststep*(size.height-2)] =\r
-//        dst[i + dststep*(size.height-1)] = dst[i + dststep*(size.height-5)];\r
-//    }\r
-//}\r
+\r
+namespace imgproc\r
+{\r
+    template<typename T, int HR> struct RGB2HLSConvertor;\r
+    // RGB -> HLS conversion for float pixels. HR is the hue range: the hue,\r
+    // computed in degrees, is multiplied by HR/360 before it is stored.\r
+    template<int HR> struct RGB2HLSConvertor<float, HR>\r
+    {\r
+        // Reads blue from src[bidx], green from src[1] and red from src[bidx^2],\r
+        // then stores hue, lightness and saturation into dst.x, dst.y and dst.z.\r
+        // Every src element is read before dst is written, so src may point into dst.\r
+        template <typename D>\r
+        static __device__ void cvt(const float* src, D& dst, int bidx)\r
+        {\r
+            const float hueScale = HR * (1.f/360.f);\r
+\r
+            const float blue = src[bidx];\r
+            const float green = src[1];\r
+            const float red = src[bidx^2];\r
+\r
+            const float hi = fmax(fmax(red, green), blue);\r
+            const float lo = fmin(fmin(red, green), blue);\r
+\r
+            const float range = hi - lo;\r
+            const float lightness = (hi + lo) * 0.5f;\r
+\r
+            float hue = 0.f;\r
+            float saturation = 0.f;\r
+\r
+            // Hue and saturation stay zero when the channel spread is within FLT_EPSILON.\r
+            if (range > FLT_EPSILON)\r
+            {\r
+                if (lightness < 0.5f)\r
+                    saturation = range / (hi + lo);\r
+                else\r
+                    saturation = range / (2.0f - hi - lo);\r
+\r
+                const float degreesPerUnit = 60.f / range;\r
+\r
+                // The dominant channel selects the 120-degree segment of the hue circle.\r
+                if (hi == red)\r
+                    hue = (green - blue)*degreesPerUnit;\r
+                else if (hi == green)\r
+                    hue = (blue - red)*degreesPerUnit + 120.f;\r
+                else\r
+                    hue = (red - green)*degreesPerUnit + 240.f;\r
+\r
+                // Shift a negative hue up by one full turn.\r
+                if (hue < 0.f) hue += 360.f;\r
+            }\r
+\r
+            dst.x = hue * hueScale;\r
+            dst.y = lightness;\r
+            dst.z = saturation;\r
+        }\r
+    };\r
+    // RGB -> HLS conversion for 8-bit pixels, implemented on top of the float convertor.\r
+    template<int HR> struct RGB2HLSConvertor<uchar, HR>\r
+    {\r
+        // Scales the first three source bytes by 1/255, converts them with\r
+        // RGB2HLSConvertor<float, HR>, and stores the hue as is and the lightness\r
+        // and saturation multiplied by 255, each through saturate_cast<uchar>.\r
+        template <typename D>\r
+        static __device__ void cvt(const uchar* src, D& dst, int bidx)\r
+        {\r
+            const float toUnit = 1.f/255.f;\r
+\r
+            float3 pix = make_float3(src[0]*toUnit, src[1]*toUnit, src[2]*toUnit);\r
+\r
+            // The float convertor reads all inputs before writing, so converting in place is fine.\r
+            RGB2HLSConvertor<float, HR>::cvt(&pix.x, pix, bidx);\r
+\r
+            dst.x = saturate_cast<uchar>(pix.x);\r
+            dst.y = saturate_cast<uchar>(pix.y*255.f);\r
+            dst.z = saturate_cast<uchar>(pix.z*255.f);\r
+        }\r
+    };\r
+\r
+    // Kernel converting an RGB image to HLS, one pixel per thread.\r
+    // The pixel column comes from the x launch dimension and the row from the y\r
+    // dimension; src_step and dst_step are row strides in bytes.\r
+    // NOTE(review): only RGB2HLSConvertor fills the output pixel here; when\r
+    // DSTCN == 4 nothing in this kernel assigns its 4th component before the\r
+    // store - confirm whether callers rely on that channel.\r
+    template <int SRCCN, int DSTCN, int HR, typename T>\r
+    __global__ void RGB2HLS(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)\r
+    {\r
+        typedef typename TypeVec<T, SRCCN>::vec_t src_t;\r
+        typedef typename TypeVec<T, DSTCN>::vec_t dst_t;\r
+\r
+        const int col = blockDim.x * blockIdx.x + threadIdx.x;\r
+        const int row = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+        // Threads that fall outside the image have nothing to do.\r
+        if (row >= rows || col >= cols)\r
+            return;\r
+\r
+        const uchar* srcPixel = src_ + row * src_step + col * SRCCN * sizeof(T);\r
+        uchar* dstPixel = dst_ + row * dst_step + col * DSTCN * sizeof(T);\r
+\r
+        src_t in = *(const src_t*)srcPixel;\r
+\r
+        dst_t out;\r
+        RGB2HLSConvertor<T, HR>::cvt(&in.x, out, bidx);\r
+\r
+        *(dst_t*)dstPixel = out;\r
+    }\r
+    \r
+    __constant__ int cHlsSectorData[6][3];\r
+\r
+    template<typename T, int HR> struct HLS2RGBConvertor;    \r
+    // HLS -> RGB conversion for float pixels. HR is the hue range of the input:\r
+    // the hue is multiplied by 6/HR so that one unit corresponds to one sector.\r
+    template<int HR> struct HLS2RGBConvertor<float, HR>\r
+    {\r
+        // Reads hue, lightness and saturation from src.x, src.y and src.z and writes\r
+        // blue to dst[bidx], green to dst[1] and red to dst[bidx^2].\r
+        // All of src is read before dst is written, so dst may point into src.\r
+        // The channel order per sector comes from the constant table cHlsSectorData.\r
+        template <typename T>\r
+        static __device__ void cvt(const T& src, float* dst, int bidx)\r
+        {\r
+            const float hscale = 6.0f / HR;\r
+\r
+            float h = src.x, l = src.y, s = src.z;\r
+            float b, g, r;\r
+\r
+            // Zero saturation: all three channels equal the lightness.\r
+            if (s == 0)\r
+                b = g = r = l;\r
+            else\r
+            {\r
+                float tab[4];\r
+                int sector;\r
+\r
+                float p2 = l <= 0.5f ? l * (1 + s) : l + s - l * s;\r
+                float p1 = 2 * l - p2;\r
+\r
+                h *= hscale;\r
+\r
+                // Bring the scaled hue into the range of the six sectors.\r
+                if( h < 0 )\r
+                    do h += 6; while( h < 0 );\r
+                else if( h >= 6 )\r
+                    do h -= 6; while( h >= 6 );\r
+\r
+                sector = __float2int_rd(h);\r
+                h -= sector;\r
+\r
+                // A tiny negative hue rounds to exactly 6 after "h += 6", which would\r
+                // index past the 6-row sector table. Hue 6 is the same angle as hue 0,\r
+                // so fall back to the start of sector 0.\r
+                if ((unsigned)sector >= 6u)\r
+                {\r
+                    sector = 0;\r
+                    h = 0.f;\r
+                }\r
+\r
+                // Candidate channel values; h is now the fractional position in the sector.\r
+                tab[0] = p2;\r
+                tab[1] = p1;\r
+                tab[2] = p1 + (p2 - p1) * (1 - h);\r
+                tab[3] = p1 + (p2 - p1) * h;\r
+\r
+                b = tab[cHlsSectorData[sector][0]];\r
+                g = tab[cHlsSectorData[sector][1]];\r
+                r = tab[cHlsSectorData[sector][2]];\r
+            }\r
+\r
+            dst[bidx] = b;\r
+            dst[1] = g;\r
+            dst[bidx^2] = r;\r
+        }\r
+    };\r
+    // HLS -> RGB conversion for 8-bit pixels, implemented on top of the float convertor.\r
+    template<int HR> struct HLS2RGBConvertor<uchar, HR>\r
+    {\r
+        // Passes the hue through unchanged, scales lightness and saturation by 1/255,\r
+        // converts with HLS2RGBConvertor<float, HR>, and stores the three results\r
+        // multiplied by 255 through saturate_cast<uchar> into dst[0..2].\r
+        template <typename T>\r
+        static __device__ void cvt(const T& src, uchar* dst, int bidx)\r
+        {\r
+            const float toUnit = 1.f/255.f;\r
+\r
+            float3 pix = make_float3(src.x, src.y*toUnit, src.z*toUnit);\r
+\r
+            // The float convertor reads all inputs before writing, so converting in place is fine.\r
+            HLS2RGBConvertor<float, HR>::cvt(pix, &pix.x, bidx);\r
+\r
+            dst[0] = saturate_cast<uchar>(pix.x*255.f);\r
+            dst[1] = saturate_cast<uchar>(pix.y*255.f);\r
+            dst[2] = saturate_cast<uchar>(pix.z*255.f);\r
+        }\r
+    };\r
+\r
+    // Kernel converting an HLS image to RGB, one pixel per thread.\r
+    // The pixel column comes from the x launch dimension and the row from the y\r
+    // dimension; src_step and dst_step are row strides in bytes.\r
+    // After the conversion setAlpha is called with ColorChannel<T>::max().\r
+    template <int SRCCN, int DSTCN, int HR, typename T>\r
+    __global__ void HLS2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)\r
+    {\r
+        typedef typename TypeVec<T, SRCCN>::vec_t src_t;\r
+        typedef typename TypeVec<T, DSTCN>::vec_t dst_t;\r
+\r
+        const int col = blockDim.x * blockIdx.x + threadIdx.x;\r
+        const int row = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+        // Threads that fall outside the image have nothing to do.\r
+        if (row >= rows || col >= cols)\r
+            return;\r
+\r
+        const uchar* srcPixel = src_ + row * src_step + col * SRCCN * sizeof(T);\r
+        uchar* dstPixel = dst_ + row * dst_step + col * DSTCN * sizeof(T);\r
+\r
+        src_t in = *(const src_t*)srcPixel;\r
+\r
+        dst_t out;\r
+        HLS2RGBConvertor<T, HR>::cvt(in, &out.x, bidx);\r
+        setAlpha(out, ColorChannel<T>::max());\r
+\r
+        *(dst_t*)dstPixel = out;\r
+    }\r
+}\r
+\r
+namespace cv { namespace gpu { namespace improc\r
+{\r
+    // Launches imgproc::RGB2HLS over the whole source image with 32x8 thread blocks.\r
+    //   src, dst - source and destination images (dimensions are taken from src)\r
+    //   bidx     - forwarded to the kernel (index of the blue channel in a source pixel)\r
+    //   hrange   - 180 selects the HR=180 instantiation, any other value selects HR=255\r
+    //   stream   - stream to launch on; for stream 0 the call also waits for completion\r
+    // CUDA errors are reported through cudaSafeCall.\r
+    template <typename T, int SRCCN, int DSTCN>\r
+    void RGB2HLS_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)\r
+    {\r
+        dim3 threads(32, 8, 1);\r
+        dim3 grid(1, 1, 1);\r
+\r
+        grid.x = divUp(src.cols, threads.x);\r
+        grid.y = divUp(src.rows, threads.y);\r
+\r
+        if (hrange == 180)\r
+            imgproc::RGB2HLS<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,\r
+                dst.ptr, dst.step, src.rows, src.cols, bidx);\r
+        else\r
+            imgproc::RGB2HLS<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,\r
+                dst.ptr, dst.step, src.rows, src.cols, bidx);\r
+\r
+        // A kernel launch returns no status; a failed launch is only visible through\r
+        // the last-error state, so check it even when running asynchronously.\r
+        cudaSafeCall( cudaGetLastError() );\r
+\r
+        if (stream == 0)\r
+            cudaSafeCall( cudaThreadSynchronize() );\r
+    }\r
+\r
+    // 8-bit RGB -> HLS entry point: picks the RGB2HLS_caller<uchar, ...> instantiation\r
+    // matching the channel counts and forwards the remaining arguments to it.\r
+    // srccn and dstcn index a 2x2 table as (value - 3); no range check is done here.\r
+    void RGB2HLS_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)\r
+    {\r
+        typedef void (*launcher_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream);\r
+\r
+        // rows: source channels 3 / 4, columns: destination channels 3 / 4\r
+        static const launcher_t launchers[2][2] =\r
+        {\r
+            {RGB2HLS_caller<uchar, 3, 3>, RGB2HLS_caller<uchar, 3, 4>},\r
+            {RGB2HLS_caller<uchar, 4, 3>, RGB2HLS_caller<uchar, 4, 4>}\r
+        };\r
+\r
+        const launcher_t launch = launchers[srccn-3][dstcn-3];\r
+        launch(src, dst, bidx, hrange, stream);\r
+    }\r
+\r
+    // Float RGB -> HLS entry point: picks the RGB2HLS_caller<float, ...> instantiation\r
+    // matching the channel counts and forwards the remaining arguments to it.\r
+    // srccn and dstcn index a 2x2 table as (value - 3); no range check is done here.\r
+    void RGB2HLS_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)\r
+    {\r
+        typedef void (*launcher_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream);\r
+\r
+        // rows: source channels 3 / 4, columns: destination channels 3 / 4\r
+        static const launcher_t launchers[2][2] =\r
+        {\r
+            {RGB2HLS_caller<float, 3, 3>, RGB2HLS_caller<float, 3, 4>},\r
+            {RGB2HLS_caller<float, 4, 3>, RGB2HLS_caller<float, 4, 4>}\r
+        };\r
+\r
+        const launcher_t launch = launchers[srccn-3][dstcn-3];\r
+        launch(src, dst, bidx, hrange, stream);\r
+    }\r
+\r
+    \r
+    // Launches imgproc::HLS2RGB over the whole source image with 32x8 thread blocks.\r
+    //   src, dst - source and destination images (dimensions are taken from src)\r
+    //   bidx     - forwarded to the kernel (index of the blue channel in a destination pixel)\r
+    //   hrange   - 180 selects the HR=180 instantiation, any other value selects HR=255\r
+    //   stream   - stream to launch on; for stream 0 the call also waits for completion\r
+    // CUDA errors are reported through cudaSafeCall.\r
+    template <typename T, int SRCCN, int DSTCN>\r
+    void HLS2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)\r
+    {\r
+        dim3 threads(32, 8, 1);\r
+        dim3 grid(1, 1, 1);\r
+\r
+        grid.x = divUp(src.cols, threads.x);\r
+        grid.y = divUp(src.rows, threads.y);\r
+\r
+        if (hrange == 180)\r
+            imgproc::HLS2RGB<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,\r
+                dst.ptr, dst.step, src.rows, src.cols, bidx);\r
+        else\r
+            imgproc::HLS2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,\r
+                dst.ptr, dst.step, src.rows, src.cols, bidx);\r
+\r
+        // A kernel launch returns no status; a failed launch is only visible through\r
+        // the last-error state, so check it even when running asynchronously.\r
+        cudaSafeCall( cudaGetLastError() );\r
+\r
+        if (stream == 0)\r
+            cudaSafeCall( cudaThreadSynchronize() );\r
+    }\r
+\r
+    // 8-bit HLS -> RGB entry point. On every call it copies the sector table into\r
+    // the constant symbol imgproc::cHlsSectorData, then runs the\r
+    // HLS2RGB_caller<uchar, ...> instantiation matching the channel counts.\r
+    // srccn and dstcn index a 2x2 table as (value - 3); no range check is done here.\r
+    void HLS2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)\r
+    {\r
+        typedef void (*launcher_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream);\r
+\r
+        // rows: source channels 3 / 4, columns: destination channels 3 / 4\r
+        static const launcher_t launchers[2][2] =\r
+        {\r
+            {HLS2RGB_caller<uchar, 3, 3>, HLS2RGB_caller<uchar, 3, 4>},\r
+            {HLS2RGB_caller<uchar, 4, 3>, HLS2RGB_caller<uchar, 4, 4>}\r
+        };\r
+\r
+        // One row per hue sector; the entries are the indices the float convertor\r
+        // uses to pick the b, g and r values from its 4-element table.\r
+        static const int sectors[][3] =\r
+        {\r
+            {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}\r
+        };\r
+\r
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHlsSectorData, sectors, sizeof(sectors)) );\r
+\r
+        const launcher_t launch = launchers[srccn-3][dstcn-3];\r
+        launch(src, dst, bidx, hrange, stream);\r
+    }\r
+\r
+    // Float HLS -> RGB entry point. On every call it copies the sector table into\r
+    // the constant symbol imgproc::cHlsSectorData, then runs the\r
+    // HLS2RGB_caller<float, ...> instantiation matching the channel counts.\r
+    // srccn and dstcn index a 2x2 table as (value - 3); no range check is done here.\r
+    void HLS2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)\r
+    {\r
+        typedef void (*launcher_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream);\r
+\r
+        // rows: source channels 3 / 4, columns: destination channels 3 / 4\r
+        static const launcher_t launchers[2][2] =\r
+        {\r
+            {HLS2RGB_caller<float, 3, 3>, HLS2RGB_caller<float, 3, 4>},\r
+            {HLS2RGB_caller<float, 4, 3>, HLS2RGB_caller<float, 4, 4>}\r
+        };\r
+\r
+        // One row per hue sector; the entries are the indices the float convertor\r
+        // uses to pick the b, g and r values from its 4-element table.\r
+        static const int sectors[][3] =\r
+        {\r
+            {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}\r
+        };\r
+\r
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHlsSectorData, sectors, sizeof(sectors)) );\r
+\r
+        const launcher_t launch = launchers[srccn-3][dstcn-3];\r
+        launch(src, dst, bidx, hrange, stream);\r
+    }\r
+}}}\r
index 84d9516..14df108 100644 (file)
@@ -105,21 +105,33 @@ namespace cv { namespace gpu
         void RGB2Gray_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream);\r
         void RGB5x52Gray_gpu(const DevMem2D& src, int green_bits, const DevMem2D& dst, cudaStream_t stream);\r
 \r
-        void RGB2YCrCb_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream);\r
-        void RGB2YCrCb_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream);\r
-        void RGB2YCrCb_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const float* coeffs, cudaStream_t stream);\r
+        void RGB2YCrCb_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream);\r
+        void RGB2YCrCb_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream);\r
+        void RGB2YCrCb_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream);\r
 \r
-        void YCrCb2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream);\r
-        void YCrCb2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream);\r
-        void YCrCb2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const float* coeffs, cudaStream_t stream);\r
+        void YCrCb2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream);\r
+        void YCrCb2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream);\r
+        void YCrCb2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream);\r
 \r
-        void RGB2XYZ_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream);\r
-        void RGB2XYZ_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream);\r
-        void RGB2XYZ_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const float* coeffs, cudaStream_t stream);\r
+        void RGB2XYZ_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);\r
+        void RGB2XYZ_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);\r
+        void RGB2XYZ_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);\r
 \r
-        void XYZ2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream);\r
-        void XYZ2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream);\r
-        void XYZ2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const float* coeffs, cudaStream_t stream);\r
+        void XYZ2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);\r
+        void XYZ2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);\r
+        void XYZ2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);\r
+\r
+        void RGB2HSV_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);\r
+        void RGB2HSV_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);\r
+\r
+        void HSV2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);\r
+        void HSV2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);\r
+\r
+        void RGB2HLS_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);\r
+        void RGB2HLS_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);\r
+\r
+        void HLS2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);\r
+        void HLS2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);\r
     }\r
 }}\r
 \r
@@ -294,109 +306,122 @@ namespace
         \r
         CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32F);\r
 \r
-        GpuMat out;\r
-        if (dst.data != src.data)\r
-            out = dst;\r
-\r
-        NppiSize nppsz;\r
-        nppsz.height = src.rows;\r
-        nppsz.width = src.cols;\r
-\r
         switch (code)\r
         {\r
             case CV_BGR2BGRA: case CV_RGB2BGRA: case CV_BGRA2BGR:\r
-            case CV_RGBA2BGR: case CV_RGB2BGR: case CV_BGRA2RGBA:\r
-                CV_Assert(scn == 3 || scn == 4);\r
+            case CV_RGBA2BGR: case CV_RGB2BGR: case CV_BGRA2RGBA:                \r
+                {\r
+                    typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream);\r
+                    static const func_t funcs[] = {improc::RGB2RGB_gpu_8u, 0, improc::RGB2RGB_gpu_16u, 0, 0, improc::RGB2RGB_gpu_32f};\r
 \r
-                dcn = code == CV_BGR2BGRA || code == CV_RGB2BGRA || code == CV_BGRA2RGBA ? 4 : 3;\r
-                bidx = code == CV_BGR2BGRA || code == CV_BGRA2BGR ? 0 : 2;\r
-                \r
-                out.create(sz, CV_MAKETYPE(depth, dcn));\r
-                if( depth == CV_8U )\r
-                    improc::RGB2RGB_gpu_8u(src, scn, out, dcn, bidx, stream);\r
-                else if( depth == CV_16U )\r
-                    improc::RGB2RGB_gpu_16u(src, scn, out, dcn, bidx, stream);\r
-                else\r
-                    improc::RGB2RGB_gpu_32f(src, scn, out, dcn, bidx, stream);\r
-                break;\r
+                    CV_Assert(scn == 3 || scn == 4);\r
+\r
+                    dcn = code == CV_BGR2BGRA || code == CV_RGB2BGRA || code == CV_BGRA2RGBA ? 4 : 3;\r
+                    bidx = code == CV_BGR2BGRA || code == CV_BGRA2BGR ? 0 : 2;\r
+                    \r
+                    dst.create(sz, CV_MAKETYPE(depth, dcn));\r
+\r
+                    funcs[depth](src, scn, dst, dcn, bidx, stream);\r
+                    break;\r
+                }\r
                 \r
             case CV_BGR2BGR565: case CV_BGR2BGR555: case CV_RGB2BGR565: case CV_RGB2BGR555:\r
             case CV_BGRA2BGR565: case CV_BGRA2BGR555: case CV_RGBA2BGR565: case CV_RGBA2BGR555:\r
-                CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );\r
-                out.create(sz, CV_8UC2);\r
-\r
-                improc::RGB2RGB5x5_gpu(src, scn, out, code == CV_BGR2BGR565 || code == CV_RGB2BGR565 ||\r
-                          code == CV_BGRA2BGR565 || code == CV_RGBA2BGR565 ? 6 : 5,\r
-                          code == CV_BGR2BGR565 || code == CV_BGR2BGR555 ||\r
-                          code == CV_BGRA2BGR565 || code == CV_BGRA2BGR555 ? 0 : 2,\r
-                          stream);\r
-                break;\r
+                {\r
+                    CV_Assert((scn == 3 || scn == 4) && depth == CV_8U);\r
+\r
+                    int green_bits = code == CV_BGR2BGR565 || code == CV_RGB2BGR565 \r
+                        || code == CV_BGRA2BGR565 || code == CV_RGBA2BGR565 ? 6 : 5;\r
+                    bidx = code == CV_BGR2BGR565 || code == CV_BGR2BGR555 \r
+                        || code == CV_BGRA2BGR565 || code == CV_BGRA2BGR555 ? 0 : 2;\r
+\r
+                    dst.create(sz, CV_8UC2);\r
+\r
+                    improc::RGB2RGB5x5_gpu(src, scn, dst, green_bits, bidx, stream);\r
+                    break;\r
+                }\r
             \r
             case CV_BGR5652BGR: case CV_BGR5552BGR: case CV_BGR5652RGB: case CV_BGR5552RGB:\r
             case CV_BGR5652BGRA: case CV_BGR5552BGRA: case CV_BGR5652RGBA: case CV_BGR5552RGBA:\r
-                if(dcn <= 0) dcn = 3;\r
-                CV_Assert( (dcn == 3 || dcn == 4) && scn == 2 && depth == CV_8U );\r
-                out.create(sz, CV_MAKETYPE(depth, dcn));\r
-\r
-                improc::RGB5x52RGB_gpu(src, \r
-                          code == CV_BGR5652BGR || code == CV_BGR5652RGB ||\r
-                          code == CV_BGR5652BGRA || code == CV_BGR5652RGBA ? 6 : 5, \r
-                          out, dcn,\r
-                          code == CV_BGR5652BGR || code == CV_BGR5552BGR ||\r
-                          code == CV_BGR5652BGRA || code == CV_BGR5552BGRA ? 0 : 2,\r
-                          stream);\r
-                break;\r
+                {\r
+                    if (dcn <= 0) dcn = 3;\r
+\r
+                    CV_Assert((dcn == 3 || dcn == 4) && scn == 2 && depth == CV_8U);\r
+\r
+                    int green_bits = code == CV_BGR5652BGR || code == CV_BGR5652RGB \r
+                        || code == CV_BGR5652BGRA || code == CV_BGR5652RGBA ? 6 : 5;\r
+                    bidx = code == CV_BGR5652BGR || code == CV_BGR5552BGR \r
+                        || code == CV_BGR5652BGRA || code == CV_BGR5552BGRA ? 0 : 2;\r
+\r
+                    dst.create(sz, CV_MAKETYPE(depth, dcn));\r
+\r
+                    improc::RGB5x52RGB_gpu(src, green_bits, dst, dcn, bidx, stream);\r
+                    break;\r
+                }\r
                         \r
             case CV_BGR2GRAY: case CV_BGRA2GRAY: case CV_RGB2GRAY: case CV_RGBA2GRAY:\r
-                CV_Assert(scn == 3 || scn == 4);\r
+                {\r
+                    typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream);\r
+                    static const func_t funcs[] = {improc::RGB2Gray_gpu_8u, 0, improc::RGB2Gray_gpu_16u, 0, 0, improc::RGB2Gray_gpu_32f};\r
 \r
-                out.create(sz, CV_MAKETYPE(depth, 1));\r
-                bidx = code == CV_BGR2GRAY || code == CV_BGRA2GRAY ? 0 : 2;\r
-                \r
-                if( depth == CV_8U )\r
-                    improc::RGB2Gray_gpu_8u(src, scn, out, bidx, stream);\r
-                else if( depth == CV_16U )\r
-                    improc::RGB2Gray_gpu_16u(src, scn, out, bidx, stream);\r
-                else\r
-                    improc::RGB2Gray_gpu_32f(src, scn, out, bidx, stream);\r
-                break;\r
+                    CV_Assert(scn == 3 || scn == 4);\r
+                    \r
+                    bidx = code == CV_BGR2GRAY || code == CV_BGRA2GRAY ? 0 : 2;\r
+\r
+                    dst.create(sz, CV_MAKETYPE(depth, 1));\r
+\r
+                    funcs[depth](src, scn, dst, bidx, stream);\r
+                    break;\r
+                }\r
             \r
             case CV_BGR5652GRAY: case CV_BGR5552GRAY:\r
-                CV_Assert( scn == 2 && depth == CV_8U );\r
+                {\r
+                    CV_Assert(scn == 2 && depth == CV_8U);\r
+\r
+                    int green_bits = code == CV_BGR5652GRAY ? 6 : 5;\r
 \r
-                out.create(sz, CV_8UC1);\r
+                    dst.create(sz, CV_8UC1);\r
 \r
-                improc::RGB5x52Gray_gpu(src, code == CV_BGR5652GRAY ? 6 : 5, out, stream);\r
-                break;\r
+                    improc::RGB5x52Gray_gpu(src, green_bits, dst, stream);\r
+                    break;\r
+                }\r
             \r
             case CV_GRAY2BGR: case CV_GRAY2BGRA:\r
-                if (dcn <= 0) \r
-                    dcn = 3;\r
-                CV_Assert(scn == 1 && (dcn == 3 || dcn == 4));\r
+                {\r
+                    typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream);\r
+                    static const func_t funcs[] = {improc::Gray2RGB_gpu_8u, 0, improc::Gray2RGB_gpu_16u, 0, 0, improc::Gray2RGB_gpu_32f};\r
 \r
-                out.create(sz, CV_MAKETYPE(depth, dcn));\r
-                \r
-                if( depth == CV_8U )\r
-                    improc::Gray2RGB_gpu_8u(src, out, dcn, stream);\r
-                else if( depth == CV_16U )\r
-                    improc::Gray2RGB_gpu_16u(src, out, dcn, stream);\r
-                else\r
-                    improc::Gray2RGB_gpu_32f(src, out, dcn, stream);\r
-                break;\r
+                    if (dcn <= 0) dcn = 3;\r
+\r
+                    CV_Assert(scn == 1 && (dcn == 3 || dcn == 4));\r
+\r
+                    dst.create(sz, CV_MAKETYPE(depth, dcn));\r
+\r
+                    funcs[depth](src, dst, dcn, stream);\r
+                    break;\r
+                }\r
                 \r
             case CV_GRAY2BGR565: case CV_GRAY2BGR555:\r
-                CV_Assert( scn == 1 && depth == CV_8U );\r
+                {\r
+                    CV_Assert(scn == 1 && depth == CV_8U);\r
 \r
-                out.create(sz, CV_8UC2);\r
-                \r
-                improc::Gray2RGB5x5_gpu(src, out, code == CV_GRAY2BGR565 ? 6 : 5, stream);\r
-                break;\r
+                    int green_bits =  code == CV_GRAY2BGR565 ? 6 : 5;\r
+\r
+                    dst.create(sz, CV_8UC2);\r
+                    \r
+                    improc::Gray2RGB5x5_gpu(src, dst, green_bits, stream);\r
+                    break;\r
+                }\r
 \r
             case CV_BGR2YCrCb: case CV_RGB2YCrCb:\r
             case CV_BGR2YUV: case CV_RGB2YUV:\r
                 {\r
-                    if(dcn <= 0) dcn = 3;\r
-                    CV_Assert( (scn == 3 || scn == 4) && (dcn == 3 || dcn == 4) );\r
+                    typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, \r
+                        const void* coeffs, cudaStream_t stream);\r
+                    static const func_t funcs[] = {improc::RGB2YCrCb_gpu_8u, 0, improc::RGB2YCrCb_gpu_16u, 0, 0, improc::RGB2YCrCb_gpu_32f};\r
+\r
+                    if (dcn <= 0) dcn = 3;\r
+                    CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4));\r
 \r
                     bidx = code == CV_BGR2YCrCb || code == CV_RGB2YUV ? 0 : 2;\r
 \r
@@ -408,32 +433,33 @@ namespace
 \r
                     float coeffs_f[5];\r
                     int coeffs_i[5];\r
-                    ::memcpy(coeffs_f, code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? YCrCb_f : yuv_f, 5 * sizeof(float));\r
-                    ::memcpy(coeffs_i, code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? YCrCb_i : yuv_i, 5 * sizeof(int));\r
+                    ::memcpy(coeffs_f, code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? YCrCb_f : yuv_f, sizeof(yuv_f));\r
+                    ::memcpy(coeffs_i, code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? YCrCb_i : yuv_i, sizeof(yuv_i));\r
 \r
-                    if (bidx==0) \r
+                    if (bidx == 0) \r
                     {\r
                         std::swap(coeffs_f[0], coeffs_f[2]);\r
                         std::swap(coeffs_i[0], coeffs_i[2]);\r
                     }\r
                         \r
-                    out.create(sz, CV_MAKETYPE(depth, dcn));\r
-                    \r
-                    if( depth == CV_8U )\r
-                        improc::RGB2YCrCb_gpu_8u(src, scn, out, dcn, bidx, coeffs_i, stream);\r
-                    else if( depth == CV_16U )\r
-                        improc::RGB2YCrCb_gpu_16u(src, scn, out, dcn, bidx, coeffs_i, stream);\r
-                    else\r
-                        improc::RGB2YCrCb_gpu_32f(src, scn, out, dcn, bidx, coeffs_f, stream);\r
+                    dst.create(sz, CV_MAKETYPE(depth, dcn));\r
+\r
+                    const void* coeffs = depth == CV_32F ? (void*)coeffs_f : (void*)coeffs_i;\r
+\r
+                    funcs[depth](src, scn, dst, dcn, bidx, coeffs, stream);\r
+                    break;\r
                 }\r
-                break;\r
                 \r
             case CV_YCrCb2BGR: case CV_YCrCb2RGB:\r
             case CV_YUV2BGR: case CV_YUV2RGB:\r
                 {\r
+                    typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, \r
+                        const void* coeffs, cudaStream_t stream);\r
+                    static const func_t funcs[] = {improc::YCrCb2RGB_gpu_8u, 0, improc::YCrCb2RGB_gpu_16u, 0, 0, improc::YCrCb2RGB_gpu_32f};\r
+\r
                     if (dcn <= 0) dcn = 3;\r
 \r
-                    CV_Assert( (scn == 3 || scn == 4) && (dcn == 3 || dcn == 4) );\r
+                    CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4));\r
 \r
                     bidx = code == CV_YCrCb2BGR || code == CV_YUV2RGB ? 0 : 2;\r
 \r
@@ -446,21 +472,23 @@ namespace
                     const float* coeffs_f = code == CV_YCrCb2BGR || code == CV_YCrCb2RGB ? YCrCb_f : yuv_f;\r
                     const int* coeffs_i = code == CV_YCrCb2BGR || code == CV_YCrCb2RGB ? YCrCb_i : yuv_i;\r
                     \r
-                    out.create(sz, CV_MAKETYPE(depth, dcn));\r
+                    dst.create(sz, CV_MAKETYPE(depth, dcn));\r
                     \r
-                    if( depth == CV_8U )\r
-                        improc::YCrCb2RGB_gpu_8u(src, scn, out, dcn, bidx, coeffs_i, stream);\r
-                    else if( depth == CV_16U )\r
-                        improc::YCrCb2RGB_gpu_16u(src, scn, out, dcn, bidx, coeffs_i, stream);\r
-                    else\r
-                        improc::YCrCb2RGB_gpu_32f(src, scn, out, dcn, bidx, coeffs_f, stream);\r
+                    const void* coeffs = depth == CV_32F ? (void*)coeffs_f : (void*)coeffs_i;\r
+\r
+                    funcs[depth](src, scn, dst, dcn, bidx, coeffs, stream);\r
+                    break;\r
                 }\r
-                break;\r
             \r
             case CV_BGR2XYZ: case CV_RGB2XYZ:\r
-                {                    \r
-                    if(dcn <= 0) dcn = 3;\r
-                    CV_Assert( (scn == 3 || scn == 4) && (dcn == 3 || dcn == 4) );\r
+                {\r
+                    typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, \r
+                        const void* coeffs, cudaStream_t stream);\r
+                    static const func_t funcs[] = {improc::RGB2XYZ_gpu_8u, 0, improc::RGB2XYZ_gpu_16u, 0, 0, improc::RGB2XYZ_gpu_32f};\r
+\r
+                    if (dcn <= 0) dcn = 3;\r
+\r
+                    CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4));\r
 \r
                     bidx = code == CV_BGR2XYZ ? 0 : 2;\r
 \r
@@ -479,8 +507,8 @@ namespace
 \r
                     float coeffs_f[9];\r
                     int coeffs_i[9];\r
-                    ::memcpy(coeffs_f, RGB2XYZ_D65f, 9 * sizeof(float));\r
-                    ::memcpy(coeffs_i, RGB2XYZ_D65i, 9 * sizeof(int));\r
+                    ::memcpy(coeffs_f, RGB2XYZ_D65f, sizeof(RGB2XYZ_D65f));\r
+                    ::memcpy(coeffs_i, RGB2XYZ_D65i, sizeof(RGB2XYZ_D65i));\r
 \r
                     if (bidx == 0) \r
                     {\r
@@ -493,21 +521,24 @@ namespace
                         std::swap(coeffs_i[6], coeffs_i[8]);\r
                     }\r
                         \r
-                    out.create(sz, CV_MAKETYPE(depth, dcn));\r
+                    dst.create(sz, CV_MAKETYPE(depth, dcn));\r
                     \r
-                    if( depth == CV_8U )\r
-                        improc::RGB2XYZ_gpu_8u(src, scn, out, dcn, coeffs_i, stream);\r
-                    else if( depth == CV_16U )\r
-                        improc::RGB2XYZ_gpu_16u(src, scn, out, dcn, coeffs_i, stream);\r
-                    else\r
-                        improc::RGB2XYZ_gpu_32f(src, scn, out, dcn, coeffs_f, stream);\r
+                    const void* coeffs = depth == CV_32F ? (void*)coeffs_f : (void*)coeffs_i;\r
+                    \r
+                    funcs[depth](src, scn, dst, dcn, coeffs, stream);\r
+                    break;\r
                 }\r
-                break;\r
             \r
             case CV_XYZ2BGR: case CV_XYZ2RGB:\r
                 {\r
+                    typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, \r
+                        const void* coeffs, cudaStream_t stream);\r
+                    static const func_t funcs[] = {improc::XYZ2RGB_gpu_8u, 0, improc::XYZ2RGB_gpu_16u, 0, 0, improc::XYZ2RGB_gpu_32f};\r
+\r
                     if (dcn <= 0) dcn = 3;\r
-                    CV_Assert( (scn == 3 || scn == 4) && (dcn == 3 || dcn == 4) );\r
+\r
+                    CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4));\r
+\r
                     bidx = code == CV_XYZ2BGR ? 0 : 2;\r
 \r
                     static const float XYZ2sRGB_D65f[] =\r
@@ -525,8 +556,8 @@ namespace
 \r
                     float coeffs_f[9];\r
                     int coeffs_i[9];\r
-                    ::memcpy(coeffs_f, XYZ2sRGB_D65f, 9 * sizeof(float));\r
-                    ::memcpy(coeffs_i, XYZ2sRGB_D65i, 9 * sizeof(int));\r
+                    ::memcpy(coeffs_f, XYZ2sRGB_D65f, sizeof(XYZ2sRGB_D65f));\r
+                    ::memcpy(coeffs_i, XYZ2sRGB_D65i, sizeof(XYZ2sRGB_D65i));\r
 \r
                     if (bidx == 0) \r
                     {\r
@@ -539,153 +570,69 @@ namespace
                         std::swap(coeffs_i[2], coeffs_i[8]);\r
                     }\r
                         \r
-                    out.create(sz, CV_MAKETYPE(depth, dcn));\r
+                    dst.create(sz, CV_MAKETYPE(depth, dcn));\r
                     \r
-                    if( depth == CV_8U )\r
-                        improc::XYZ2RGB_gpu_8u(src, scn, out, dcn, coeffs_i, stream);\r
-                    else if( depth == CV_16U )\r
-                        improc::XYZ2RGB_gpu_16u(src, scn, out, dcn, coeffs_i, stream);\r
-                    else\r
-                        improc::XYZ2RGB_gpu_32f(src, scn, out, dcn, coeffs_f, stream);\r
+                    const void* coeffs = depth == CV_32F ? (void*)coeffs_f : (void*)coeffs_i;\r
+\r
+                    funcs[depth](src, scn, dst, dcn, coeffs_i, stream);\r
+                    break;\r
                 }\r
-                break;\r
-                \r
-            //case CV_BGR2HSV: case CV_RGB2HSV: case CV_BGR2HSV_FULL: case CV_RGB2HSV_FULL:\r
-            //case CV_BGR2HLS: case CV_RGB2HLS: case CV_BGR2HLS_FULL: case CV_RGB2HLS_FULL:\r
-            //    {\r
-            //    CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );\r
-            //    bidx = code == CV_BGR2HSV || code == CV_BGR2HLS ||\r
-            //        code == CV_BGR2HSV_FULL || code == CV_BGR2HLS_FULL ? 0 : 2;\r
-            //    int hrange = depth == CV_32F ? 360 : code == CV_BGR2HSV || code == CV_RGB2HSV ||\r
-            //        code == CV_BGR2HLS || code == CV_RGB2HLS ? 180 : 255;\r
-            //    \r
-            //    dst.create(sz, CV_MAKETYPE(depth, 3));\r
-            //    \r
-            //    if( code == CV_BGR2HSV || code == CV_RGB2HSV ||\r
-            //        code == CV_BGR2HSV_FULL || code == CV_RGB2HSV_FULL )\r
-            //    {\r
-            //        if( depth == CV_8U )\r
-            //            CvtColorLoop(src, dst, RGB2HSV_b(scn, bidx, hrange));\r
-            //        else\r
-            //            CvtColorLoop(src, dst, RGB2HSV_f(scn, bidx, (float)hrange));\r
-            //    }\r
-            //    else\r
-            //    {\r
-            //        if( depth == CV_8U )\r
-            //            CvtColorLoop(src, dst, RGB2HLS_b(scn, bidx, hrange));\r
-            //        else\r
-            //            CvtColorLoop(src, dst, RGB2HLS_f(scn, bidx, (float)hrange));\r
-            //    }\r
-            //    }\r
-            //    break;\r
-            \r
-            //case CV_HSV2BGR: case CV_HSV2RGB: case CV_HSV2BGR_FULL: case CV_HSV2RGB_FULL:\r
-            //case CV_HLS2BGR: case CV_HLS2RGB: case CV_HLS2BGR_FULL: case CV_HLS2RGB_FULL:\r
-            //    {\r
-            //    if( dcn <= 0 ) dcn = 3;\r
-            //    CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );\r
-            //    bidx = code == CV_HSV2BGR || code == CV_HLS2BGR ||\r
-            //        code == CV_HSV2BGR_FULL || code == CV_HLS2BGR_FULL ? 0 : 2;\r
-            //    int hrange = depth == CV_32F ? 360 : code == CV_HSV2BGR || code == CV_HSV2RGB ||\r
-            //        code == CV_HLS2BGR || code == CV_HLS2RGB ? 180 : 255;\r
-            //    \r
-            //    dst.create(sz, CV_MAKETYPE(depth, dcn));\r
-            //    \r
-            //    if( code == CV_HSV2BGR || code == CV_HSV2RGB ||\r
-            //        code == CV_HSV2BGR_FULL || code == CV_HSV2RGB_FULL )\r
-            //    {\r
-            //        if( depth == CV_8U )\r
-            //            CvtColorLoop(src, dst, HSV2RGB_b(dcn, bidx, hrange));\r
-            //        else\r
-            //            CvtColorLoop(src, dst, HSV2RGB_f(dcn, bidx, (float)hrange));\r
-            //    }\r
-            //    else\r
-            //    {\r
-            //        if( depth == CV_8U )\r
-            //            CvtColorLoop(src, dst, HLS2RGB_b(dcn, bidx, hrange));\r
-            //        else\r
-            //            CvtColorLoop(src, dst, HLS2RGB_f(dcn, bidx, (float)hrange));\r
-            //    }\r
-            //    }\r
-            //    break;\r
-                \r
-            //case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab:\r
-            //case CV_BGR2Luv: case CV_RGB2Luv: case CV_LBGR2Luv: case CV_LRGB2Luv:\r
-            //    {\r
-            //    CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );\r
-            //    bidx = code == CV_BGR2Lab || code == CV_BGR2Luv ||\r
-            //           code == CV_LBGR2Lab || code == CV_LBGR2Luv ? 0 : 2;\r
-            //    bool srgb = code == CV_BGR2Lab || code == CV_RGB2Lab ||\r
-            //                code == CV_BGR2Luv || code == CV_RGB2Luv;\r
-            //    \r
-            //    dst.create(sz, CV_MAKETYPE(depth, 3));\r
-            //    \r
-            //    if( code == CV_BGR2Lab || code == CV_RGB2Lab ||\r
-            //        code == CV_LBGR2Lab || code == CV_LRGB2Lab )\r
-            //    {\r
-            //        if( depth == CV_8U )\r
-            //            CvtColorLoop(src, dst, RGB2Lab_b(scn, bidx, 0, 0, srgb));\r
-            //        else\r
-            //            CvtColorLoop(src, dst, RGB2Lab_f(scn, bidx, 0, 0, srgb));\r
-            //    }\r
-            //    else\r
-            //    {\r
-            //        if( depth == CV_8U )\r
-            //            CvtColorLoop(src, dst, RGB2Luv_b(scn, bidx, 0, 0, srgb));\r
-            //        else\r
-            //            CvtColorLoop(src, dst, RGB2Luv_f(scn, bidx, 0, 0, srgb));\r
-            //    }\r
-            //    }\r
-            //    break;\r
-            \r
-            //case CV_Lab2BGR: case CV_Lab2RGB: case CV_Lab2LBGR: case CV_Lab2LRGB:\r
-            //case CV_Luv2BGR: case CV_Luv2RGB: case CV_Luv2LBGR: case CV_Luv2LRGB:\r
-            //    {\r
-            //    if( dcn <= 0 ) dcn = 3;\r
-            //    CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );\r
-            //    bidx = code == CV_Lab2BGR || code == CV_Luv2BGR ||\r
-            //           code == CV_Lab2LBGR || code == CV_Luv2LBGR ? 0 : 2;\r
-            //    bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB ||\r
-            //            code == CV_Luv2BGR || code == CV_Luv2RGB;\r
-            //    \r
-            //    dst.create(sz, CV_MAKETYPE(depth, dcn));\r
-            //    \r
-            //    if( code == CV_Lab2BGR || code == CV_Lab2RGB ||\r
-            //        code == CV_Lab2LBGR || code == CV_Lab2LRGB )\r
-            //    {\r
-            //        if( depth == CV_8U )\r
-            //            CvtColorLoop(src, dst, Lab2RGB_b(dcn, bidx, 0, 0, srgb));\r
-            //        else\r
-            //            CvtColorLoop(src, dst, Lab2RGB_f(dcn, bidx, 0, 0, srgb));\r
-            //    }\r
-            //    else\r
-            //    {\r
-            //        if( depth == CV_8U )\r
-            //            CvtColorLoop(src, dst, Luv2RGB_b(dcn, bidx, 0, 0, srgb));\r
-            //        else\r
-            //            CvtColorLoop(src, dst, Luv2RGB_f(dcn, bidx, 0, 0, srgb));\r
-            //    }\r
-            //    }\r
-            //    break;\r
+\r
+            case CV_BGR2HSV: case CV_RGB2HSV: case CV_BGR2HSV_FULL: case CV_RGB2HSV_FULL:\r
+            case CV_BGR2HLS: case CV_RGB2HLS: case CV_BGR2HLS_FULL: case CV_RGB2HLS_FULL: // both families share this block and dispatch to improc::RGB2HSV_gpu_* or improc::RGB2HLS_gpu_*\r
+                {\r
+                    typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, \r
+                        int hrange, cudaStream_t stream);\r
+                    static const func_t funcs_hsv[] = {improc::RGB2HSV_gpu_8u, 0, 0, 0, 0, improc::RGB2HSV_gpu_32f}; // indexed by depth; only slots 0 and 5 are non-null\r
+                    static const func_t funcs_hls[] = {improc::RGB2HLS_gpu_8u, 0, 0, 0, 0, improc::RGB2HLS_gpu_32f}; // same layout as funcs_hsv\r
+\r
+                    if (dcn <= 0) dcn = 3; // non-positive dcn falls back to 3 destination channels\r
+\r
+                    CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F)); // depth check keeps the lookup below off the null table slots\r
+\r
+                    bidx = code == CV_BGR2HSV || code == CV_BGR2HLS ||\r
+                        code == CV_BGR2HSV_FULL || code == CV_BGR2HLS_FULL ? 0 : 2; // 0 for the BGR* codes, 2 for the RGB* codes\r
+                    int hrange = depth == CV_32F ? 360 : code == CV_BGR2HSV || code == CV_RGB2HSV ||\r
+                        code == CV_BGR2HLS || code == CV_RGB2HLS ? 180 : 255; // 360 for CV_32F; otherwise 180 for the plain codes and 255 for the *_FULL codes\r
                 \r
-            //case CV_BayerBG2BGR: case CV_BayerGB2BGR: case CV_BayerRG2BGR: case CV_BayerGR2BGR:\r
-            //case CV_BayerBG2BGR_VNG: case CV_BayerGB2BGR_VNG: case CV_BayerRG2BGR_VNG: case CV_BayerGR2BGR_VNG:\r
-            //    if(dcn <= 0) dcn = 3;\r
-            //    CV_Assert( scn == 1 && dcn == 3 && depth == CV_8U );\r
-            //    dst.create(sz, CV_8UC3);\r
-            //    \r
-            //    if( code == CV_BayerBG2BGR || code == CV_BayerGB2BGR ||\r
-            //        code == CV_BayerRG2BGR || code == CV_BayerGR2BGR )\r
-            //        Bayer2RGB_8u(src, dst, code);\r
-            //    else\r
-            //        Bayer2RGB_VNG_8u(src, dst, code);\r
-            //    break;\r
+                    dst.create(sz, CV_MAKETYPE(depth, dcn)); // destination keeps the source depth and has dcn channels\r
+\r
+                    if (code == CV_BGR2HSV || code == CV_RGB2HSV || code == CV_BGR2HSV_FULL || code == CV_RGB2HSV_FULL) // the four *2HSV* codes use the HSV table\r
+                        funcs_hsv[depth](src, scn, dst, dcn, bidx, hrange, stream);\r
+                    else // remaining codes of this case are the *2HLS* ones\r
+                        funcs_hls[depth](src, scn, dst, dcn, bidx, hrange, stream);\r
+                    break;\r
+                }\r
+\r
+            case CV_HSV2BGR: case CV_HSV2RGB: case CV_HSV2BGR_FULL: case CV_HSV2RGB_FULL:\r
+            case CV_HLS2BGR: case CV_HLS2RGB: case CV_HLS2BGR_FULL: case CV_HLS2RGB_FULL: // both families share this block and dispatch to improc::HSV2RGB_gpu_* or improc::HLS2RGB_gpu_*\r
+                {\r
+                    typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, \r
+                        int hrange, cudaStream_t stream);\r
+                    static const func_t funcs_hsv[] = {improc::HSV2RGB_gpu_8u, 0, 0, 0, 0, improc::HSV2RGB_gpu_32f}; // indexed by depth; only slots 0 and 5 are non-null\r
+                    static const func_t funcs_hls[] = {improc::HLS2RGB_gpu_8u, 0, 0, 0, 0, improc::HLS2RGB_gpu_32f}; // same layout as funcs_hsv\r
+\r
+                    if (dcn <= 0) dcn = 3; // non-positive dcn falls back to 3 destination channels\r
+\r
+                    CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F)); // depth check keeps the lookup below off the null table slots\r
+\r
+                    bidx = code == CV_HSV2BGR || code == CV_HLS2BGR ||\r
+                        code == CV_HSV2BGR_FULL || code == CV_HLS2BGR_FULL ? 0 : 2; // 0 for the *2BGR* codes, 2 for the *2RGB* codes\r
+                    int hrange = depth == CV_32F ? 360 : code == CV_HSV2BGR || code == CV_HSV2RGB ||\r
+                        code == CV_HLS2BGR || code == CV_HLS2RGB ? 180 : 255; // 360 for CV_32F; otherwise 180 for the plain codes and 255 for the *_FULL codes\r
+                    \r
+                    dst.create(sz, CV_MAKETYPE(depth, dcn)); // destination keeps the source depth and has dcn channels\r
+\r
+                    if (code == CV_HSV2BGR || code == CV_HSV2RGB || code == CV_HSV2BGR_FULL || code == CV_HSV2RGB_FULL) // the four HSV2* codes use the HSV table\r
+                        funcs_hsv[depth](src, scn, dst, dcn, bidx, hrange, stream);\r
+                    else // remaining codes of this case are the HLS2* ones\r
+                        funcs_hls[depth](src, scn, dst, dcn, bidx, hrange, stream);\r
+                    break;\r
+                }\r
 \r
             default:\r
                 CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );\r
         }\r
-\r
-        dst = out;\r
     }\r
 }\r
 \r
index c7fc60d..fae1f93 100644 (file)
@@ -459,7 +459,7 @@ int CV_GpuCvtColorTest::CheckNorm(const Mat& m1, const Mat& m2)
 {\r
     double ret = norm(m1, m2, NORM_INF);\r
 \r
-    if (ret < std::numeric_limits<double>::epsilon())\r
+    if (ret <= 2)\r
     {\r
         return CvTS::OK;\r
     }\r
@@ -472,7 +472,6 @@ int CV_GpuCvtColorTest::CheckNorm(const Mat& m1, const Mat& m2)
 \r
 void CV_GpuCvtColorTest::run( int )\r
 {\r
-    //load image\r
     cv::Mat img = cv::imread(std::string(ts->get_data_path()) + "stereobp/aloe-L.png");\r
 \r
     if (img.empty())\r
@@ -486,17 +485,20 @@ void CV_GpuCvtColorTest::run( int )
     cv::gpu::GpuMat gpuImg(img), gpuRes;\r
     try\r
     {\r
-        //run tests\r
         int codes[] = { CV_BGR2RGB, CV_RGB2BGRA, CV_BGRA2RGB,\r
                         CV_RGB2BGR555, CV_BGR5552BGR, CV_BGR2BGR565, CV_BGR5652RGB, \r
                         CV_RGB2YCrCb, CV_YCrCb2BGR, CV_BGR2YUV, CV_YUV2RGB,\r
                         CV_RGB2XYZ, CV_XYZ2BGR, CV_BGR2XYZ, CV_XYZ2RGB,\r
+                        CV_RGB2HSV, CV_HSV2BGR, CV_BGR2HSV_FULL, CV_HSV2RGB_FULL,\r
+                        CV_RGB2HLS, CV_HLS2BGR, CV_BGR2HLS_FULL, CV_HLS2RGB_FULL,\r
                         CV_RGB2GRAY, CV_GRAY2BGRA, CV_BGRA2GRAY,\r
                         CV_GRAY2BGR555, CV_BGR5552GRAY, CV_GRAY2BGR565, CV_BGR5652GRAY};\r
         const char* codes_str[] = { "CV_BGR2RGB", "CV_RGB2BGRA", "CV_BGRA2RGB",\r
                                     "CV_RGB2BGR555", "CV_BGR5552BGR", "CV_BGR2BGR565", "CV_BGR5652RGB", \r
                                     "CV_RGB2YCrCb", "CV_YCrCb2BGR", "CV_BGR2YUV", "CV_YUV2RGB",\r
                                     "CV_RGB2XYZ", "CV_XYZ2BGR", "CV_BGR2XYZ", "CV_XYZ2RGB",\r
+                                    "CV_RGB2HSV", "CV_HSV2RGB", "CV_BGR2HSV_FULL", "CV_HSV2RGB_FULL",\r
+                                    "CV_RGB2HLS", "CV_HLS2RGB", "CV_BGR2HLS_FULL", "CV_HLS2RGB_FULL",\r
                                     "CV_RGB2GRAY", "CV_GRAY2BGRA", "CV_BGRA2GRAY",\r
                                     "CV_GRAY2BGR555", "CV_BGR5552GRAY", "CV_GRAY2BGR565", "CV_BGR5652GRAY"};\r
         int codes_num = sizeof(codes) / sizeof(int);\r