////////////////////////////////////////////////////////////////////////\r
// Polar <-> Cart\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace mathfunc \r
+namespace cv { namespace gpu { namespace device \r
{\r
- void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);\r
- void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace mathfunc \r
+ {\r
+ void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);\r
+ void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
namespace\r
{\r
inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;\r
+ using namespace ::cv::gpu::device::mathfunc;\r
\r
CV_DbgAssert(x.size() == y.size() && x.type() == y.type());\r
CV_Assert(x.depth() == CV_32F);\r
\r
inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;\r
+ using namespace ::cv::gpu::device::mathfunc;\r
\r
CV_DbgAssert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());\r
CV_Assert(mag.depth() == CV_32F);\r
\r
#else /* !defined (HAVE_CUDA) */\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace bilateral_filter\r
+namespace cv { namespace gpu { namespace device \r
{\r
- void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc);\r
-\r
- void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);\r
- void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);\r
-}\r
+ namespace bilateral_filter\r
+ {\r
+ void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);\r
+ void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
-using namespace OPENCV_DEVICE_NAMESPACE_ bilateral_filter;\r
+using namespace ::cv::gpu::device::bilateral_filter;\r
\r
namespace\r
{\r
\r
#else\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace blend\r
+namespace cv { namespace gpu { namespace device \r
{\r
- template <typename T>\r
- void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);\r
-\r
- void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);\r
-}\r
+ namespace blend\r
+ {\r
+ template <typename T>\r
+ void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
-using namespace OPENCV_DEVICE_NAMESPACE_ blend;\r
+using namespace ::cv::gpu::device::blend;\r
\r
void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2, \r
GpuMat& result, Stream& stream)\r
\r
#else /* !defined (HAVE_CUDA) */\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace bf_match\r
-{\r
- template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
- int cc, cudaStream_t stream);\r
- template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
- int cc, cudaStream_t stream);\r
- template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
- int cc, cudaStream_t stream);\r
-\r
- template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
- int cc, cudaStream_t stream);\r
- template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
- int cc, cudaStream_t stream);\r
- template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,\r
- int cc, cudaStream_t stream);\r
-}\r
-\r
-namespace bf_knnmatch\r
+namespace cv { namespace gpu { namespace device \r
{\r
- template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
- const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
- int cc, cudaStream_t stream);\r
- template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
- const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
- int cc, cudaStream_t stream);\r
- template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
- const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
- int cc, cudaStream_t stream);\r
-\r
- template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
- const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
- int cc, cudaStream_t stream);\r
- template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
- const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
- int cc, cudaStream_t stream);\r
- template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
- const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
- int cc, cudaStream_t stream);\r
-}\r
+ namespace bf_match\r
+ {\r
+ template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
+ int cc, cudaStream_t stream);\r
+ template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
+ int cc, cudaStream_t stream);\r
+ template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
+ int cc, cudaStream_t stream);\r
+\r
+ template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+ int cc, cudaStream_t stream);\r
+ template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+ int cc, cudaStream_t stream);\r
+ template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,\r
+ int cc, cudaStream_t stream);\r
+ }\r
\r
-namespace bf_radius_match \r
-{\r
- template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- int cc, cudaStream_t stream);\r
- template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- int cc, cudaStream_t stream);\r
- template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- int cc, cudaStream_t stream);\r
-\r
- template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- int cc, cudaStream_t stream);\r
-\r
- template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- int cc, cudaStream_t stream);\r
-\r
- template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- int cc, cudaStream_t stream);\r
-}\r
+ namespace bf_knnmatch\r
+ {\r
+ template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
+ const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
+ int cc, cudaStream_t stream);\r
+ template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
+ const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
+ int cc, cudaStream_t stream);\r
+ template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
+ const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
+ int cc, cudaStream_t stream);\r
+\r
+ template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+ const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
+ int cc, cudaStream_t stream);\r
+ template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+ const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
+ int cc, cudaStream_t stream);\r
+ template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+ const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
+ int cc, cudaStream_t stream);\r
+ }\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace bf_radius_match \r
+ {\r
+ template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ int cc, cudaStream_t stream);\r
+ template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ int cc, cudaStream_t stream);\r
+ template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ int cc, cudaStream_t stream);\r
+\r
+ template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ int cc, cudaStream_t stream);\r
+\r
+ template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ int cc, cudaStream_t stream);\r
+\r
+ template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ int cc, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
////////////////////////////////////////////////////////////////////\r
// Train collection\r
if (query.empty() || train.empty())\r
return;\r
\r
- using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;\r
+ using namespace ::cv::gpu::device::bf_match;\r
\r
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
const DevMem2Di& trainIdx, const DevMem2Df& distance,\r
if (query.empty() || trainCollection.empty())\r
return;\r
\r
- using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;\r
+ using namespace ::cv::gpu::device::bf_match;\r
\r
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
if (query.empty() || train.empty())\r
return;\r
\r
- using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;\r
+ using namespace ::cv::gpu::device::bf_knnmatch;\r
\r
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
if (query.empty() || trainCollection.empty())\r
return;\r
\r
- using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;\r
+ using namespace ::cv::gpu::device::bf_knnmatch;\r
\r
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
if (query.empty() || train.empty())\r
return;\r
\r
- using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;\r
+ using namespace ::cv::gpu::device::bf_radius_match;\r
\r
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
if (query.empty() || empty())\r
return;\r
\r
- using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;\r
+ using namespace ::cv::gpu::device::bf_radius_match;\r
\r
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
\r
#else\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace transform_points \r
-{\r
- void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);\r
-}\r
-\r
-namespace project_points \r
+namespace cv { namespace gpu { namespace device \r
{\r
- void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);\r
-}\r
+ namespace transform_points \r
+ {\r
+ void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);\r
+ }\r
\r
-namespace solve_pnp_ransac\r
-{\r
- int maxNumIters();\r
+ namespace project_points \r
+ {\r
+ void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);\r
+ }\r
\r
- void computeHypothesisScores(\r
- const int num_hypotheses, const int num_points, const float* rot_matrices,\r
- const float3* transl_vectors, const float3* object, const float2* image,\r
- const float dist_threshold, int* hypothesis_scores);\r
-}\r
+ namespace solve_pnp_ransac\r
+ {\r
+ int maxNumIters();\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ void computeHypothesisScores(\r
+ const int num_hypotheses, const int num_points, const float* rot_matrices,\r
+ const float3* transl_vectors, const float3* object, const float2* image,\r
+ const float dist_threshold, int* hypothesis_scores);\r
+ }\r
+}}}\r
\r
-using namespace OPENCV_DEVICE_NAMESPACE;\r
+using namespace ::cv::gpu::device;\r
\r
namespace\r
{\r
\r
#else /* !defined (HAVE_CUDA) */\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
+namespace cv { namespace gpu { namespace device \r
+{\r
#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \\r
void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);\r
\r
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \\r
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f)\r
\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)\r
-\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)\r
-\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)\r
-\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)\r
-\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)\r
-\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)\r
-\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)\r
-\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)\r
-\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)\r
-\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)\r
-\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)\r
-\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)\r
-\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)\r
-\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4)\r
-\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra)\r
-\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4)\r
-\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr)\r
-OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra)\r
-\r
-#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE\r
-#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL\r
-#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
-\r
-using namespace OPENCV_DEVICE_NAMESPACE;\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)\r
+\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)\r
+\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)\r
+\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)\r
+\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)\r
+\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)\r
+\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)\r
+\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)\r
+\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)\r
+\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)\r
+\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)\r
+\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)\r
+\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)\r
+\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4)\r
+\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra)\r
+\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4)\r
+\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr)\r
+ OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra)\r
+\r
+ #undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE\r
+ #undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL\r
+ #undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F\r
+}}}\r
+\r
+using namespace ::cv::gpu::device;\r
\r
namespace\r
{\r
#include "opencv2/gpu/device/vec_distance.hpp"\r
#include "opencv2/gpu/device/datamov_utils.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace bf_knnmatch {\r
-\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Reduction\r
-\r
-template <int BLOCK_SIZE> \r
-__device__ void findBestMatch(float& bestDistance1, float& bestDistance2, \r
- int& bestTrainIdx1, int& bestTrainIdx2, \r
- float* s_distance, int* s_trainIdx)\r
+namespace cv { namespace gpu { namespace device \r
{\r
- float myBestDistance1 = numeric_limits<float>::max(); \r
- float myBestDistance2 = numeric_limits<float>::max();\r
- int myBestTrainIdx1 = -1;\r
- int myBestTrainIdx2 = -1;\r
+ namespace bf_knnmatch \r
+ {\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Reduction\r
\r
- s_distance += threadIdx.y * BLOCK_SIZE;\r
- s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
+ template <int BLOCK_SIZE> \r
+ __device__ void findBestMatch(float& bestDistance1, float& bestDistance2, \r
+ int& bestTrainIdx1, int& bestTrainIdx2, \r
+ float* s_distance, int* s_trainIdx)\r
+ {\r
+ float myBestDistance1 = numeric_limits<float>::max(); \r
+ float myBestDistance2 = numeric_limits<float>::max();\r
+ int myBestTrainIdx1 = -1;\r
+ int myBestTrainIdx2 = -1;\r
\r
- s_distance[threadIdx.x] = bestDistance1;\r
- s_trainIdx[threadIdx.x] = bestTrainIdx1;\r
+ s_distance += threadIdx.y * BLOCK_SIZE;\r
+ s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
\r
- __syncthreads();\r
+ s_distance[threadIdx.x] = bestDistance1;\r
+ s_trainIdx[threadIdx.x] = bestTrainIdx1;\r
\r
- if (threadIdx.x == 0)\r
- {\r
- #pragma unroll\r
- for (int i = 0; i < BLOCK_SIZE; ++i)\r
- {\r
- float val = s_distance[i];\r
+ __syncthreads();\r
\r
- if (val < myBestDistance1)\r
+ if (threadIdx.x == 0)\r
{\r
- myBestDistance2 = myBestDistance1;\r
- myBestTrainIdx2 = myBestTrainIdx1;\r
-\r
- myBestDistance1 = val;\r
- myBestTrainIdx1 = s_trainIdx[i];\r
+ #pragma unroll\r
+ for (int i = 0; i < BLOCK_SIZE; ++i)\r
+ {\r
+ float val = s_distance[i];\r
+\r
+ if (val < myBestDistance1)\r
+ {\r
+ myBestDistance2 = myBestDistance1;\r
+ myBestTrainIdx2 = myBestTrainIdx1;\r
+\r
+ myBestDistance1 = val;\r
+ myBestTrainIdx1 = s_trainIdx[i];\r
+ }\r
+ else if (val < myBestDistance2)\r
+ {\r
+ myBestDistance2 = val;\r
+ myBestTrainIdx2 = s_trainIdx[i];\r
+ }\r
+ }\r
}\r
- else if (val < myBestDistance2)\r
- {\r
- myBestDistance2 = val;\r
- myBestTrainIdx2 = s_trainIdx[i];\r
- }\r
- }\r
- }\r
-\r
- __syncthreads();\r
\r
- s_distance[threadIdx.x] = bestDistance2;\r
- s_trainIdx[threadIdx.x] = bestTrainIdx2;\r
+ __syncthreads();\r
\r
- __syncthreads();\r
+ s_distance[threadIdx.x] = bestDistance2;\r
+ s_trainIdx[threadIdx.x] = bestTrainIdx2;\r
\r
- if (threadIdx.x == 0)\r
- {\r
- #pragma unroll\r
- for (int i = 0; i < BLOCK_SIZE; ++i)\r
- {\r
- float val = s_distance[i];\r
+ __syncthreads();\r
\r
- if (val < myBestDistance2)\r
+ if (threadIdx.x == 0)\r
{\r
- myBestDistance2 = val;\r
- myBestTrainIdx2 = s_trainIdx[i];\r
+ #pragma unroll\r
+ for (int i = 0; i < BLOCK_SIZE; ++i)\r
+ {\r
+ float val = s_distance[i];\r
+\r
+ if (val < myBestDistance2)\r
+ {\r
+ myBestDistance2 = val;\r
+ myBestTrainIdx2 = s_trainIdx[i];\r
+ }\r
+ }\r
}\r
- }\r
- }\r
-\r
- bestDistance1 = myBestDistance1;\r
- bestDistance2 = myBestDistance2;\r
\r
- bestTrainIdx1 = myBestTrainIdx1;\r
- bestTrainIdx2 = myBestTrainIdx2;\r
-}\r
+ bestDistance1 = myBestDistance1;\r
+ bestDistance2 = myBestDistance2;\r
\r
-template <int BLOCK_SIZE> \r
-__device__ void findBestMatch(float& bestDistance1, float& bestDistance2, \r
- int& bestTrainIdx1, int& bestTrainIdx2, \r
- int& bestImgIdx1, int& bestImgIdx2, \r
- float* s_distance, int* s_trainIdx, int* s_imgIdx)\r
-{\r
- float myBestDistance1 = numeric_limits<float>::max(); \r
- float myBestDistance2 = numeric_limits<float>::max();\r
- int myBestTrainIdx1 = -1;\r
- int myBestTrainIdx2 = -1;\r
- int myBestImgIdx1 = -1;\r
- int myBestImgIdx2 = -1;\r
+ bestTrainIdx1 = myBestTrainIdx1;\r
+ bestTrainIdx2 = myBestTrainIdx2;\r
+ }\r
\r
- s_distance += threadIdx.y * BLOCK_SIZE;\r
- s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
- s_imgIdx += threadIdx.y * BLOCK_SIZE;\r
+ template <int BLOCK_SIZE> \r
+ __device__ void findBestMatch(float& bestDistance1, float& bestDistance2, \r
+ int& bestTrainIdx1, int& bestTrainIdx2, \r
+ int& bestImgIdx1, int& bestImgIdx2, \r
+ float* s_distance, int* s_trainIdx, int* s_imgIdx)\r
+ {\r
+ float myBestDistance1 = numeric_limits<float>::max(); \r
+ float myBestDistance2 = numeric_limits<float>::max();\r
+ int myBestTrainIdx1 = -1;\r
+ int myBestTrainIdx2 = -1;\r
+ int myBestImgIdx1 = -1;\r
+ int myBestImgIdx2 = -1;\r
\r
- s_distance[threadIdx.x] = bestDistance1;\r
- s_trainIdx[threadIdx.x] = bestTrainIdx1;\r
- s_imgIdx[threadIdx.x] = bestImgIdx1;\r
+ s_distance += threadIdx.y * BLOCK_SIZE;\r
+ s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
+ s_imgIdx += threadIdx.y * BLOCK_SIZE;\r
\r
- __syncthreads();\r
+ s_distance[threadIdx.x] = bestDistance1;\r
+ s_trainIdx[threadIdx.x] = bestTrainIdx1;\r
+ s_imgIdx[threadIdx.x] = bestImgIdx1;\r
\r
- if (threadIdx.x == 0)\r
- {\r
- #pragma unroll\r
- for (int i = 0; i < BLOCK_SIZE; ++i)\r
- {\r
- float val = s_distance[i];\r
-\r
- if (val < myBestDistance1)\r
- {\r
- myBestDistance2 = myBestDistance1;\r
- myBestTrainIdx2 = myBestTrainIdx1;\r
- myBestImgIdx2 = myBestImgIdx1;\r
+ __syncthreads();\r
\r
- myBestDistance1 = val;\r
- myBestTrainIdx1 = s_trainIdx[i];\r
- myBestImgIdx1 = s_imgIdx[i];\r
- }\r
- else if (val < myBestDistance2)\r
+ if (threadIdx.x == 0)\r
{\r
- myBestDistance2 = val;\r
- myBestTrainIdx2 = s_trainIdx[i];\r
- myBestImgIdx2 = s_imgIdx[i];\r
+ #pragma unroll\r
+ for (int i = 0; i < BLOCK_SIZE; ++i)\r
+ {\r
+ float val = s_distance[i];\r
+\r
+ if (val < myBestDistance1)\r
+ {\r
+ myBestDistance2 = myBestDistance1;\r
+ myBestTrainIdx2 = myBestTrainIdx1;\r
+ myBestImgIdx2 = myBestImgIdx1;\r
+\r
+ myBestDistance1 = val;\r
+ myBestTrainIdx1 = s_trainIdx[i];\r
+ myBestImgIdx1 = s_imgIdx[i];\r
+ }\r
+ else if (val < myBestDistance2)\r
+ {\r
+ myBestDistance2 = val;\r
+ myBestTrainIdx2 = s_trainIdx[i];\r
+ myBestImgIdx2 = s_imgIdx[i];\r
+ }\r
+ }\r
}\r
- }\r
- }\r
-\r
- __syncthreads();\r
\r
- s_distance[threadIdx.x] = bestDistance2;\r
- s_trainIdx[threadIdx.x] = bestTrainIdx2;\r
- s_imgIdx[threadIdx.x] = bestImgIdx2;\r
+ __syncthreads();\r
\r
- __syncthreads();\r
+ s_distance[threadIdx.x] = bestDistance2;\r
+ s_trainIdx[threadIdx.x] = bestTrainIdx2;\r
+ s_imgIdx[threadIdx.x] = bestImgIdx2;\r
\r
- if (threadIdx.x == 0)\r
- {\r
- #pragma unroll\r
- for (int i = 0; i < BLOCK_SIZE; ++i)\r
- {\r
- float val = s_distance[i];\r
+ __syncthreads();\r
\r
- if (val < myBestDistance2)\r
+ if (threadIdx.x == 0)\r
{\r
- myBestDistance2 = val;\r
- myBestTrainIdx2 = s_trainIdx[i];\r
- myBestImgIdx2 = s_imgIdx[i];\r
+ #pragma unroll\r
+ for (int i = 0; i < BLOCK_SIZE; ++i)\r
+ {\r
+ float val = s_distance[i];\r
+\r
+ if (val < myBestDistance2)\r
+ {\r
+ myBestDistance2 = val;\r
+ myBestTrainIdx2 = s_trainIdx[i];\r
+ myBestImgIdx2 = s_imgIdx[i];\r
+ }\r
+ }\r
}\r
- }\r
- }\r
\r
- bestDistance1 = myBestDistance1;\r
- bestDistance2 = myBestDistance2;\r
+ bestDistance1 = myBestDistance1;\r
+ bestDistance2 = myBestDistance2;\r
\r
- bestTrainIdx1 = myBestTrainIdx1;\r
- bestTrainIdx2 = myBestTrainIdx2;\r
+ bestTrainIdx1 = myBestTrainIdx1;\r
+ bestTrainIdx2 = myBestTrainIdx2;\r
\r
- bestImgIdx1 = myBestImgIdx1;\r
- bestImgIdx2 = myBestImgIdx2;\r
-}\r
+ bestImgIdx1 = myBestImgIdx1;\r
+ bestImgIdx2 = myBestImgIdx2;\r
+ }\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Match Unrolled Cached\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Match Unrolled Cached\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U> \r
-__device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)\r
-{\r
- #pragma unroll\r
- for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
- {\r
- const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
- s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;\r
- }\r
-}\r
-\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-__device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
- typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
- float& bestDistance1, float& bestDistance2, \r
- int& bestTrainIdx1, int& bestTrainIdx2, \r
- int& bestImgIdx1, int& bestImgIdx2)\r
-{\r
- for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
- {\r
- Dist dist;\r
-\r
- #pragma unroll\r
- for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U> \r
+ __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)\r
{\r
- const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
-\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
-\r
- if (loadX < train.cols)\r
+ #pragma unroll\r
+ for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
{\r
- T val;\r
-\r
- ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+ const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+ s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;\r
}\r
-\r
- __syncthreads();\r
-\r
- #pragma unroll\r
- for (int j = 0; j < BLOCK_SIZE; ++j)\r
- dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
-\r
- __syncthreads();\r
}\r
\r
- typename Dist::result_type distVal = dist;\r
-\r
- const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
-\r
- if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
+ typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
+ float& bestDistance1, float& bestDistance2, \r
+ int& bestTrainIdx1, int& bestTrainIdx2, \r
+ int& bestImgIdx1, int& bestImgIdx2)\r
{\r
- if (distVal < bestDistance1)\r
- {\r
- bestImgIdx2 = bestImgIdx1;\r
- bestDistance2 = bestDistance1;\r
- bestTrainIdx2 = bestTrainIdx1;\r
-\r
- bestImgIdx1 = imgIdx;\r
- bestDistance1 = distVal;\r
- bestTrainIdx1 = trainIdx;\r
- }\r
- else if (distVal < bestDistance2)\r
+ for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
{\r
- bestImgIdx2 = imgIdx;\r
- bestDistance2 = distVal;\r
- bestTrainIdx2 = trainIdx;\r
+ Dist dist;\r
+\r
+ #pragma unroll\r
+ for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
+ {\r
+ const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+\r
+ if (loadX < train.cols)\r
+ {\r
+ T val;\r
+\r
+ ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+ }\r
+\r
+ __syncthreads();\r
+\r
+ #pragma unroll\r
+ for (int j = 0; j < BLOCK_SIZE; ++j)\r
+ dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+\r
+ __syncthreads();\r
+ }\r
+\r
+ typename Dist::result_type distVal = dist;\r
+\r
+ const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
+\r
+ if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))\r
+ {\r
+ if (distVal < bestDistance1)\r
+ {\r
+ bestImgIdx2 = bestImgIdx1;\r
+ bestDistance2 = bestDistance1;\r
+ bestTrainIdx2 = bestTrainIdx1;\r
+\r
+ bestImgIdx1 = imgIdx;\r
+ bestDistance1 = distVal;\r
+ bestTrainIdx1 = trainIdx;\r
+ }\r
+ else if (distVal < bestDistance2)\r
+ {\r
+ bestImgIdx2 = imgIdx;\r
+ bestDistance2 = distVal;\r
+ bestTrainIdx2 = trainIdx;\r
+ }\r
+ }\r
}\r
}\r
- }\r
-}\r
-\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)\r
-{\r
- extern __shared__ int smem[];\r
-\r
- const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
-\r
- typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
- typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
\r
- loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
-\r
- float myBestDistance1 = numeric_limits<float>::max();\r
- float myBestDistance2 = numeric_limits<float>::max();\r
- int myBestTrainIdx1 = -1;\r
- int myBestTrainIdx2 = -1;\r
-\r
- loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);\r
-\r
- __syncthreads();\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)\r
+ {\r
+ extern __shared__ int smem[];\r
\r
- float* s_distance = (float*)(smem);\r
- int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
\r
- findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);\r
+ typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+ typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
\r
- if (queryIdx < query.rows && threadIdx.x == 0)\r
- {\r
- bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
- bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
- }\r
-}\r
-\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
- const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
- cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
- const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+ loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
\r
- const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ float myBestDistance1 = numeric_limits<float>::max();\r
+ float myBestDistance2 = numeric_limits<float>::max();\r
+ int myBestTrainIdx1 = -1;\r
+ int myBestTrainIdx2 = -1;\r
\r
- matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ __syncthreads();\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)\r
-{\r
- extern __shared__ int smem[];\r
+ float* s_distance = (float*)(smem);\r
+ int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
\r
- const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+ findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);\r
\r
- typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
- typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
+ if (queryIdx < query.rows && threadIdx.x == 0)\r
+ {\r
+ bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
+ bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
+ }\r
+ }\r
\r
- loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+ const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
+ cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
\r
- float myBestDistance1 = numeric_limits<float>::max();\r
- float myBestDistance2 = numeric_limits<float>::max();\r
- int myBestTrainIdx1 = -1;\r
- int myBestTrainIdx2 = -1;\r
- int myBestImgIdx1 = -1;\r
- int myBestImgIdx2 = -1;\r
+ const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
\r
- Mask m = mask;\r
+ matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
- {\r
- const DevMem2D_<T> train = trains[imgIdx];\r
- m.next();\r
- loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);\r
- }\r
-\r
- __syncthreads();\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- float* s_distance = (float*)(smem);\r
- int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
- int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)\r
+ {\r
+ extern __shared__ int smem[];\r
\r
- findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);\r
+ const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
\r
- if (queryIdx < query.rows && threadIdx.x == 0)\r
- {\r
- bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
- bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);\r
- bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
- }\r
-}\r
-\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
- const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, \r
- cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
- const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+ typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+ typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
\r
- const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
\r
- matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ float myBestDistance1 = numeric_limits<float>::max();\r
+ float myBestDistance2 = numeric_limits<float>::max();\r
+ int myBestTrainIdx1 = -1;\r
+ int myBestTrainIdx2 = -1;\r
+ int myBestImgIdx1 = -1;\r
+ int myBestImgIdx2 = -1;\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ Mask m = mask;\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Match Unrolled\r
+ for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
+ {\r
+ const DevMem2D_<T> train = trains[imgIdx];\r
+ m.next();\r
+ loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);\r
+ }\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-__device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
- typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
- float& bestDistance1, float& bestDistance2, \r
- int& bestTrainIdx1, int& bestTrainIdx2, \r
- int& bestImgIdx1, int& bestImgIdx2)\r
-{\r
- for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
- {\r
- Dist dist;\r
+ __syncthreads();\r
\r
- #pragma unroll\r
- for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
- {\r
- const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+ float* s_distance = (float*)(smem);\r
+ int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
\r
- s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+ findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);\r
\r
- if (loadX < query.cols)\r
+ if (queryIdx < query.rows && threadIdx.x == 0)\r
{\r
- T val;\r
-\r
- ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);\r
- s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
-\r
- ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+ bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
+ bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);\r
+ bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
}\r
+ }\r
\r
- __syncthreads();\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
+ const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, \r
+ cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
\r
- #pragma unroll\r
- for (int j = 0; j < BLOCK_SIZE; ++j)\r
- dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+ const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
\r
- __syncthreads();\r
- }\r
+ matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- typename Dist::result_type distVal = dist;\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Match Unrolled\r
\r
- if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ __device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
+ typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
+ float& bestDistance1, float& bestDistance2, \r
+ int& bestTrainIdx1, int& bestTrainIdx2, \r
+ int& bestImgIdx1, int& bestImgIdx2)\r
{\r
- if (distVal < bestDistance1)\r
- {\r
- bestImgIdx2 = bestImgIdx1;\r
- bestDistance2 = bestDistance1;\r
- bestTrainIdx2 = bestTrainIdx1;\r
-\r
- bestImgIdx1 = imgIdx;\r
- bestDistance1 = distVal;\r
- bestTrainIdx1 = trainIdx;\r
- }\r
- else if (distVal < bestDistance2)\r
+ for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
{\r
- bestImgIdx2 = imgIdx;\r
- bestDistance2 = distVal;\r
- bestTrainIdx2 = trainIdx;\r
+ Dist dist;\r
+\r
+ #pragma unroll\r
+ for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
+ {\r
+ const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+\r
+ s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+\r
+ if (loadX < query.cols)\r
+ {\r
+ T val;\r
+\r
+ ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);\r
+ s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
+\r
+ ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+ }\r
+\r
+ __syncthreads();\r
+\r
+ #pragma unroll\r
+ for (int j = 0; j < BLOCK_SIZE; ++j)\r
+ dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+\r
+ __syncthreads();\r
+ }\r
+\r
+ typename Dist::result_type distVal = dist;\r
+\r
+ const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
+\r
+ if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))\r
+ {\r
+ if (distVal < bestDistance1)\r
+ {\r
+ bestImgIdx2 = bestImgIdx1;\r
+ bestDistance2 = bestDistance1;\r
+ bestTrainIdx2 = bestTrainIdx1;\r
+\r
+ bestImgIdx1 = imgIdx;\r
+ bestDistance1 = distVal;\r
+ bestTrainIdx1 = trainIdx;\r
+ }\r
+ else if (distVal < bestDistance2)\r
+ {\r
+ bestImgIdx2 = imgIdx;\r
+ bestDistance2 = distVal;\r
+ bestTrainIdx2 = trainIdx;\r
+ }\r
+ }\r
}\r
}\r
- }\r
-}\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)\r
-{\r
- extern __shared__ int smem[];\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)\r
+ {\r
+ extern __shared__ int smem[];\r
\r
- const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+ const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
\r
- typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
- typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+ typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
\r
- float myBestDistance1 = numeric_limits<float>::max();\r
- float myBestDistance2 = numeric_limits<float>::max();\r
- int myBestTrainIdx1 = -1;\r
- int myBestTrainIdx2 = -1;\r
+ float myBestDistance1 = numeric_limits<float>::max();\r
+ float myBestDistance2 = numeric_limits<float>::max();\r
+ int myBestTrainIdx1 = -1;\r
+ int myBestTrainIdx2 = -1;\r
\r
- loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);\r
+ loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);\r
\r
- __syncthreads();\r
+ __syncthreads();\r
\r
- float* s_distance = (float*)(smem);\r
- int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ float* s_distance = (float*)(smem);\r
+ int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
\r
- findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);\r
+ findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);\r
\r
- if (queryIdx < query.rows && threadIdx.x == 0)\r
- {\r
- bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
- bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
- }\r
-}\r
+ if (queryIdx < query.rows && threadIdx.x == 0)\r
+ {\r
+ bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
+ bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
+ }\r
+ }\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
- const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
- cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
- const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+ const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
+ cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
\r
- const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
\r
- matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)\r
-{\r
- extern __shared__ int smem[];\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)\r
+ {\r
+ extern __shared__ int smem[];\r
\r
- const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+ const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
\r
- typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
- typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+ typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
\r
- float myBestDistance1 = numeric_limits<float>::max();\r
- float myBestDistance2 = numeric_limits<float>::max();\r
- int myBestTrainIdx1 = -1;\r
- int myBestTrainIdx2 = -1;\r
- int myBestImgIdx1 = -1;\r
- int myBestImgIdx2 = -1;\r
+ float myBestDistance1 = numeric_limits<float>::max();\r
+ float myBestDistance2 = numeric_limits<float>::max();\r
+ int myBestTrainIdx1 = -1;\r
+ int myBestTrainIdx2 = -1;\r
+ int myBestImgIdx1 = -1;\r
+ int myBestImgIdx2 = -1;\r
\r
- Mask m = mask;\r
+ Mask m = mask;\r
\r
- for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
- {\r
- const DevMem2D_<T> train = trains[imgIdx];\r
- m.next();\r
- loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);\r
- }\r
+ for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
+ {\r
+ const DevMem2D_<T> train = trains[imgIdx];\r
+ m.next();\r
+ loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);\r
+ }\r
\r
- __syncthreads();\r
+ __syncthreads();\r
\r
- float* s_distance = (float*)(smem);\r
- int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
- int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
+ float* s_distance = (float*)(smem);\r
+ int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
\r
- findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);\r
+ findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);\r
\r
- if (queryIdx < query.rows && threadIdx.x == 0)\r
- {\r
- bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
- bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);\r
- bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
- }\r
-}\r
-\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
- const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, \r
- cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
- const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+ if (queryIdx < query.rows && threadIdx.x == 0)\r
+ {\r
+ bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
+ bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);\r
+ bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
+ }\r
+ }\r
\r
- const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
+ const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, \r
+ cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
\r
- matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Match\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-__device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
- typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
- float& bestDistance1, float& bestDistance2, \r
- int& bestTrainIdx1, int& bestTrainIdx2, \r
- int& bestImgIdx1, int& bestImgIdx2)\r
-{\r
- for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
- {\r
- Dist dist;\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Match\r
\r
- for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
+ template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+ __device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
+ typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
+ float& bestDistance1, float& bestDistance2, \r
+ int& bestTrainIdx1, int& bestTrainIdx2, \r
+ int& bestImgIdx1, int& bestImgIdx2)\r
{\r
- const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
-\r
- s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
-\r
- if (loadX < query.cols)\r
+ for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
{\r
- T val;\r
+ Dist dist;\r
+\r
+ for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
+ {\r
+ const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+\r
+ s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+\r
+ if (loadX < query.cols)\r
+ {\r
+ T val;\r
+\r
+ ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);\r
+ s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
+\r
+ ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+ }\r
+\r
+ __syncthreads();\r
+\r
+ #pragma unroll\r
+ for (int j = 0; j < BLOCK_SIZE; ++j)\r
+ dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+\r
+ __syncthreads();\r
+ }\r
+\r
+ typename Dist::result_type distVal = dist;\r
+\r
+ const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
+\r
+ if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))\r
+ {\r
+ if (distVal < bestDistance1)\r
+ {\r
+ bestImgIdx2 = bestImgIdx1;\r
+ bestDistance2 = bestDistance1;\r
+ bestTrainIdx2 = bestTrainIdx1;\r
+\r
+ bestImgIdx1 = imgIdx;\r
+ bestDistance1 = distVal;\r
+ bestTrainIdx1 = trainIdx;\r
+ }\r
+ else if (distVal < bestDistance2)\r
+ {\r
+ bestImgIdx2 = imgIdx;\r
+ bestDistance2 = distVal;\r
+ bestTrainIdx2 = trainIdx;\r
+ }\r
+ }\r
+ }\r
+ }\r
\r
- ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);\r
- s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
+ template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+ __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)\r
+ {\r
+ extern __shared__ int smem[];\r
\r
- ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
- }\r
+ const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
\r
- __syncthreads();\r
+ typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+ typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
\r
- #pragma unroll\r
- for (int j = 0; j < BLOCK_SIZE; ++j)\r
- dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+ float myBestDistance1 = numeric_limits<float>::max();\r
+ float myBestDistance2 = numeric_limits<float>::max();\r
+ int myBestTrainIdx1 = -1;\r
+ int myBestTrainIdx2 = -1;\r
+\r
+ loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);\r
\r
__syncthreads();\r
- }\r
\r
- typename Dist::result_type distVal = dist;\r
+ float* s_distance = (float*)(smem);\r
+ int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
\r
- const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
+ findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);\r
\r
- if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))\r
- {\r
- if (distVal < bestDistance1)\r
+ if (queryIdx < query.rows && threadIdx.x == 0)\r
{\r
- bestImgIdx2 = bestImgIdx1;\r
- bestDistance2 = bestDistance1;\r
- bestTrainIdx2 = bestTrainIdx1;\r
-\r
- bestImgIdx1 = imgIdx;\r
- bestDistance1 = distVal;\r
- bestTrainIdx1 = trainIdx;\r
- }\r
- else if (distVal < bestDistance2)\r
- {\r
- bestImgIdx2 = imgIdx;\r
- bestDistance2 = distVal;\r
- bestTrainIdx2 = trainIdx;\r
+ bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
+ bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
}\r
}\r
- }\r
-}\r
\r
-template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)\r
-{\r
- extern __shared__ int smem[];\r
+ template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+ void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+ const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
+ cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
\r
- const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+ const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
\r
- typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
- typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- float myBestDistance1 = numeric_limits<float>::max();\r
- float myBestDistance2 = numeric_limits<float>::max();\r
- int myBestTrainIdx1 = -1;\r
- int myBestTrainIdx2 = -1;\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);\r
+ template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+ __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)\r
+ {\r
+ extern __shared__ int smem[];\r
\r
- __syncthreads();\r
+ const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
\r
- float* s_distance = (float*)(smem);\r
- int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+ typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
\r
- findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);\r
+ float myBestDistance1 = numeric_limits<float>::max();\r
+ float myBestDistance2 = numeric_limits<float>::max();\r
+ int myBestTrainIdx1 = -1;\r
+ int myBestTrainIdx2 = -1;\r
+ int myBestImgIdx1 = -1;\r
+ int myBestImgIdx2 = -1;\r
\r
- if (queryIdx < query.rows && threadIdx.x == 0)\r
- {\r
- bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
- bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
- }\r
-}\r
-\r
-template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
- const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
- cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
- const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+ Mask m = mask;\r
\r
- const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
+ {\r
+ const DevMem2D_<T> train = trains[imgIdx];\r
+ m.next();\r
+ loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);\r
+ }\r
\r
- match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ __syncthreads();\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ float* s_distance = (float*)(smem);\r
+ int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
\r
-template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)\r
-{\r
- extern __shared__ int smem[];\r
+ findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);\r
\r
- const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+ if (queryIdx < query.rows && threadIdx.x == 0)\r
+ {\r
+ bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
+ bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);\r
+ bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
+ }\r
+ }\r
\r
- typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
- typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+ void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
+ const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, \r
+ cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
\r
- float myBestDistance1 = numeric_limits<float>::max();\r
- float myBestDistance2 = numeric_limits<float>::max();\r
- int myBestTrainIdx1 = -1;\r
- int myBestTrainIdx2 = -1;\r
- int myBestImgIdx1 = -1;\r
- int myBestImgIdx2 = -1;\r
+ const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
\r
- Mask m = mask;\r
+ match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
- {\r
- const DevMem2D_<T> train = trains[imgIdx];\r
- m.next();\r
- loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);\r
- }\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- __syncthreads();\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // knnMatch 2 dispatcher\r
\r
- float* s_distance = (float*)(smem);\r
- int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
- int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
+ template <typename Dist, typename T, typename Mask> \r
+ void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+ const DevMem2Db& trainIdx, const DevMem2Db& distance, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (query.cols <= 64)\r
+ {\r
+ matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+ }\r
+ else if (query.cols <= 128)\r
+ {\r
+ matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+ }\r
+ /*else if (query.cols <= 256)\r
+ {\r
+ matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+ }\r
+ else if (query.cols <= 512)\r
+ { \r
+ matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+ }\r
+ else if (query.cols <= 1024)\r
+ { \r
+ matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+ }*/\r
+ else\r
+ {\r
+ match<16, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+ }\r
+ }\r
\r
- findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);\r
+ template <typename Dist, typename T, typename Mask> \r
+ void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
+ const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (query.cols <= 64)\r
+ {\r
+ matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+ }\r
+ else if (query.cols <= 128)\r
+ {\r
+ matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+ }\r
+ /*else if (query.cols <= 256)\r
+ {\r
+ matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+ }\r
+ else if (query.cols <= 512)\r
+ { \r
+ matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+ }\r
+ else if (query.cols <= 1024)\r
+ { \r
+ matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+ }*/\r
+ else\r
+ {\r
+ match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+ }\r
+ }\r
\r
- if (queryIdx < query.rows && threadIdx.x == 0)\r
- {\r
- bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
- bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);\r
- bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
- }\r
-}\r
-\r
-template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
- const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, \r
- cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
- const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Calc distance kernel\r
\r
- const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>\r
+ __global__ void calcDistanceUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)\r
+ {\r
+ extern __shared__ int smem[];\r
\r
- match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;\r
+ const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+ typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// knnMatch 2 dispatcher\r
+ Dist dist;\r
\r
-template <typename Dist, typename T, typename Mask> \r
-void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
- const DevMem2Db& trainIdx, const DevMem2Db& distance, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (query.cols <= 64)\r
- {\r
- matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
- }\r
- else if (query.cols <= 128)\r
- {\r
- matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
- }\r
- /*else if (query.cols <= 256)\r
- {\r
- matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
- }\r
- else if (query.cols <= 512)\r
- { \r
- matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
- }\r
- else if (query.cols <= 1024)\r
- { \r
- matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
- }*/\r
- else\r
- {\r
- match<16, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
- }\r
-}\r
-\r
-template <typename Dist, typename T, typename Mask> \r
-void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
- const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (query.cols <= 64)\r
- {\r
- matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
- }\r
- else if (query.cols <= 128)\r
- {\r
- matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
- }\r
- /*else if (query.cols <= 256)\r
- {\r
- matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
- }\r
- else if (query.cols <= 512)\r
- { \r
- matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
- }\r
- else if (query.cols <= 1024)\r
- { \r
- matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
- }*/\r
- else\r
- {\r
- match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
- }\r
-}\r
+ #pragma unroll\r
+ for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
+ {\r
+ const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+\r
+ if (loadX < query.cols)\r
+ {\r
+ s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];\r
+ }\r
+ else\r
+ { \r
+ s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+ }\r
+\r
+ __syncthreads();\r
+\r
+ #pragma unroll\r
+ for (int j = 0; j < BLOCK_SIZE; ++j)\r
+ dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+\r
+ __syncthreads();\r
+ }\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Calc distance kernel\r
+ if (queryIdx < query.rows && trainIdx < train.rows)\r
+ {\r
+ float distVal = numeric_limits<float>::max();\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>\r
-__global__ void calcDistanceUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)\r
-{\r
- extern __shared__ int smem[];\r
+ if (mask(queryIdx, trainIdx))\r
+ distVal = (typename Dist::result_type)dist;\r
\r
- const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;\r
- const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;\r
+ allDist.ptr(queryIdx)[trainIdx] = distVal;\r
+ }\r
+ }\r
\r
- typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
- typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ void calcDistanceUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
\r
- Dist dist;\r
+ const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
\r
- #pragma unroll\r
- for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
- {\r
- const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+ calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (loadX < query.cols)\r
- {\r
- s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];\r
- }\r
- else\r
- { \r
- s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
}\r
\r
- __syncthreads();\r
-\r
- #pragma unroll\r
- for (int j = 0; j < BLOCK_SIZE; ++j)\r
- dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
-\r
- __syncthreads();\r
- }\r
+ template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>\r
+ __global__ void calcDistance(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)\r
+ {\r
+ extern __shared__ int smem[];\r
\r
- if (queryIdx < query.rows && trainIdx < train.rows)\r
- {\r
- float distVal = numeric_limits<float>::max();\r
+ const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;\r
+ const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;\r
\r
- if (mask(queryIdx, trainIdx))\r
- distVal = (typename Dist::result_type)dist;\r
+ typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+ typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
\r
- allDist.ptr(queryIdx)[trainIdx] = distVal;\r
- }\r
-}\r
+ Dist dist;\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-void calcDistanceUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
- const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
+ for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
+ {\r
+ const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+\r
+ if (loadX < query.cols)\r
+ {\r
+ s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];\r
+ }\r
+ else\r
+ { \r
+ s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+ }\r
+\r
+ __syncthreads();\r
+\r
+ #pragma unroll\r
+ for (int j = 0; j < BLOCK_SIZE; ++j)\r
+ dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+\r
+ __syncthreads();\r
+ }\r
\r
- const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ if (queryIdx < query.rows && trainIdx < train.rows)\r
+ {\r
+ float distVal = numeric_limits<float>::max();\r
\r
- calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (mask(queryIdx, trainIdx))\r
+ distVal = (typename Dist::result_type)dist;\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ allDist.ptr(queryIdx)[trainIdx] = distVal;\r
+ }\r
+ }\r
\r
-template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>\r
-__global__ void calcDistance(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)\r
-{\r
- extern __shared__ int smem[];\r
+ template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+ void calcDistance(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
\r
- const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;\r
- const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;\r
+ const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
\r
- typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
- typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- Dist dist;\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
- {\r
- const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Calc Distance dispatcher\r
\r
- if (loadX < query.cols)\r
+ template <typename Dist, typename T, typename Mask> \r
+ void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+ const DevMem2Df& allDist, \r
+ int cc, cudaStream_t stream)\r
{\r
- s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];\r
- }\r
- else\r
- { \r
- s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+ if (query.cols <= 64)\r
+ {\r
+ calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);\r
+ }\r
+ else if (query.cols <= 128)\r
+ {\r
+ calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);\r
+ }\r
+ /*else if (query.cols <= 256)\r
+ {\r
+ calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);\r
+ }\r
+ else if (query.cols <= 512)\r
+ { \r
+ calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);\r
+ }\r
+ else if (query.cols <= 1024)\r
+ { \r
+ calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);\r
+ }*/\r
+ else\r
+ {\r
+ calcDistance<16, Dist>(query, train, mask, allDist, stream);\r
+ }\r
}\r
\r
- __syncthreads();\r
-\r
- #pragma unroll\r
- for (int j = 0; j < BLOCK_SIZE; ++j)\r
- dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // find knn match kernel\r
\r
- __syncthreads();\r
- }\r
+ template <int BLOCK_SIZE> \r
+ __global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance)\r
+ {\r
+ const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;\r
+ __shared__ float s_dist[SMEM_SIZE];\r
+ __shared__ int s_trainIdx[SMEM_SIZE];\r
\r
- if (queryIdx < query.rows && trainIdx < train.rows)\r
- {\r
- float distVal = numeric_limits<float>::max();\r
+ const int queryIdx = blockIdx.x;\r
\r
- if (mask(queryIdx, trainIdx))\r
- distVal = (typename Dist::result_type)dist;\r
+ float* allDistRow = allDist.ptr(queryIdx);\r
\r
- allDist.ptr(queryIdx)[trainIdx] = distVal;\r
- }\r
-}\r
+ float dist = numeric_limits<float>::max();\r
+ int bestIdx = -1;\r
+ \r
+ for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE)\r
+ {\r
+ float reg = allDistRow[i];\r
+ if (reg < dist)\r
+ {\r
+ dist = reg;\r
+ bestIdx = i;\r
+ }\r
+ }\r
\r
-template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-void calcDistance(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
- const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
+ s_dist[threadIdx.x] = dist;\r
+ s_trainIdx[threadIdx.x] = bestIdx;\r
+ __syncthreads();\r
\r
- const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());\r
\r
- calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (threadIdx.x == 0)\r
+ {\r
+ if (dist < numeric_limits<float>::max())\r
+ {\r
+ allDistRow[bestIdx] = numeric_limits<float>::max();\r
+ trainIdx.ptr(queryIdx)[i] = bestIdx;\r
+ distance.ptr(queryIdx)[i] = dist;\r
+ }\r
+ }\r
+ }\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ template <int BLOCK_SIZE> \r
+ void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, 1, 1);\r
+ const dim3 grid(trainIdx.rows, 1, 1);\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Calc Distance dispatcher\r
+ for (int i = 0; i < k; ++i)\r
+ {\r
+ findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);\r
+ cudaSafeCall( cudaGetLastError() );\r
+ }\r
\r
-template <typename Dist, typename T, typename Mask> \r
-void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
- const DevMem2Df& allDist, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (query.cols <= 64)\r
- {\r
- calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);\r
- }\r
- else if (query.cols <= 128)\r
- {\r
- calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);\r
- }\r
- /*else if (query.cols <= 256)\r
- {\r
- calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);\r
- }\r
- else if (query.cols <= 512)\r
- { \r
- calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);\r
- }\r
- else if (query.cols <= 1024)\r
- { \r
- calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);\r
- }*/\r
- else\r
- {\r
- calcDistance<16, Dist>(query, train, mask, allDist, stream);\r
- }\r
-}\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// find knn match kernel\r
+ void findKnnMatchDispatcher(int k, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream)\r
+ {\r
+ findKnnMatch<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), allDist, stream);\r
+ }\r
\r
-template <int BLOCK_SIZE> \r
-__global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance)\r
-{\r
- const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;\r
- __shared__ float s_dist[SMEM_SIZE];\r
- __shared__ int s_trainIdx[SMEM_SIZE];\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // knn match Dispatcher\r
\r
- const int queryIdx = blockIdx.x;\r
+ template <typename Dist, typename T, typename Mask>\r
+ void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const Mask& mask, \r
+ const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (k == 2)\r
+ {\r
+ match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);\r
+ }\r
+ else\r
+ {\r
+ calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);\r
+ findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);\r
+ }\r
+ } \r
\r
- float* allDistRow = allDist.ptr(queryIdx);\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // knn match caller\r
\r
- float dist = numeric_limits<float>::max();\r
- int bestIdx = -1;\r
- \r
- for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE)\r
- {\r
- float reg = allDistRow[i];\r
- if (reg < dist)\r
+ template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
+ const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
+ int cc, cudaStream_t stream)\r
{\r
- dist = reg;\r
- bestIdx = i;\r
+ if (mask.data)\r
+ matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);\r
+ else\r
+ matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);\r
}\r
- }\r
\r
- s_dist[threadIdx.x] = dist;\r
- s_trainIdx[threadIdx.x] = bestIdx;\r
- __syncthreads();\r
+ template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+ //template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
\r
- reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());\r
-\r
- if (threadIdx.x == 0)\r
- {\r
- if (dist < numeric_limits<float>::max())\r
+ template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
+ const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,\r
+ int cc, cudaStream_t stream)\r
{\r
- allDistRow[bestIdx] = numeric_limits<float>::max();\r
- trainIdx.ptr(queryIdx)[i] = bestIdx;\r
- distance.ptr(queryIdx)[i] = dist;\r
+ if (mask.data)\r
+ matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);\r
+ else\r
+ matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);\r
}\r
- }\r
-}\r
-\r
-template <int BLOCK_SIZE> \r
-void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, 1, 1);\r
- const dim3 grid(trainIdx.rows, 1, 1);\r
-\r
- for (int i = 0; i < k; ++i)\r
- {\r
- findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);\r
- cudaSafeCall( cudaGetLastError() );\r
- }\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ //template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+ template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
\r
-void findKnnMatchDispatcher(int k, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream)\r
-{\r
- findKnnMatch<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), allDist, stream);\r
-}\r
+ template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,\r
+ const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (mask.data)\r
+ matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);\r
+ else\r
+ matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);\r
+ }\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// knn match Dispatcher\r
+ template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+ //template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+ template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+ //template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+ template void matchHamming_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
\r
-template <typename Dist, typename T, typename Mask>\r
-void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const Mask& mask, \r
- const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (k == 2)\r
- {\r
- match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);\r
- }\r
- else\r
- {\r
- calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);\r
- findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);\r
- }\r
-} \r
+ template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+ const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (masks.data)\r
+ match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);\r
+ else\r
+ match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);\r
+ }\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// knn match caller\r
+ template void match2L1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+ //template void match2L1_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+ template void match2L1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+ template void match2L1_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+ template void match2L1_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+ template void match2L1_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
\r
-template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
- const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (mask.data)\r
- matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);\r
- else\r
- matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);\r
-}\r
-\r
-template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-\r
-template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
- const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,\r
- int cc, cudaStream_t stream)\r
-{\r
- if (mask.data)\r
- matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);\r
- else\r
- matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);\r
-}\r
-\r
-//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-\r
-template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,\r
- const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (mask.data)\r
- matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);\r
- else\r
- matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);\r
-}\r
-\r
-template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-template void matchHamming_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-\r
-template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
- const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (masks.data)\r
- match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);\r
- else\r
- match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);\r
-}\r
-\r
-template void match2L1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-//template void match2L1_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-template void match2L1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-template void match2L1_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-template void match2L1_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-template void match2L1_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-\r
-template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
- const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (masks.data)\r
- match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);\r
- else\r
- match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);\r
-}\r
-\r
-//template void match2L2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-//template void match2L2_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-//template void match2L2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-//template void match2L2_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-//template void match2L2_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Di& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-template void match2L2_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-\r
-template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
- const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (masks.data)\r
- match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);\r
- else\r
- match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);\r
-}\r
+ template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+ const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (masks.data)\r
+ match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);\r
+ else\r
+ match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);\r
+ }\r
\r
-template void match2Hamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-//template void match2Hamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-template void match2Hamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-//template void match2Hamming_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-template void match2Hamming_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+ //template void match2L2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+ //template void match2L2_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+ //template void match2L2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+ //template void match2L2_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+ //template void match2L2_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Di& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+ template void match2L2_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
\r
-} // namespace bf_knnmatch\r
+ template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+ const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (masks.data)\r
+ match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);\r
+ else\r
+ match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);\r
+ }\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template void match2Hamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+ //template void match2Hamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+ template void match2Hamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+ //template void match2Hamming_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+ template void match2Hamming_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+ } // namespace bf_knnmatch\r
+}}} // namespace cv { namespace gpu { namespace device {\r
#include "opencv2/gpu/device/vec_distance.hpp"\r
#include "opencv2/gpu/device/datamov_utils.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ namespace bf_match \r
+ {\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Reduction\r
\r
-namespace bf_match {\r
+ template <int BLOCK_SIZE> \r
+ __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)\r
+ {\r
+ s_distance += threadIdx.y * BLOCK_SIZE;\r
+ s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Reduction\r
+ s_distance[threadIdx.x] = bestDistance;\r
+ s_trainIdx[threadIdx.x] = bestTrainIdx;\r
\r
-template <int BLOCK_SIZE> \r
-__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)\r
-{\r
- s_distance += threadIdx.y * BLOCK_SIZE;\r
- s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
+ __syncthreads();\r
\r
- s_distance[threadIdx.x] = bestDistance;\r
- s_trainIdx[threadIdx.x] = bestTrainIdx;\r
+ reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());\r
+ }\r
\r
- __syncthreads();\r
+ template <int BLOCK_SIZE> \r
+ __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)\r
+ {\r
+ s_distance += threadIdx.y * BLOCK_SIZE;\r
+ s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
+ s_imgIdx += threadIdx.y * BLOCK_SIZE;\r
\r
- reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());\r
-}\r
+ s_distance[threadIdx.x] = bestDistance;\r
+ s_trainIdx[threadIdx.x] = bestTrainIdx;\r
+ s_imgIdx [threadIdx.x] = bestImgIdx;\r
\r
-template <int BLOCK_SIZE> \r
-__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)\r
-{\r
- s_distance += threadIdx.y * BLOCK_SIZE;\r
- s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
- s_imgIdx += threadIdx.y * BLOCK_SIZE;\r
+ __syncthreads();\r
\r
- s_distance[threadIdx.x] = bestDistance;\r
- s_trainIdx[threadIdx.x] = bestTrainIdx;\r
- s_imgIdx [threadIdx.x] = bestImgIdx;\r
+ reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());\r
+ }\r
\r
- __syncthreads();\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Match Unrolled Cached\r
\r
- reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());\r
-}\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U> \r
+ __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)\r
+ {\r
+ #pragma unroll\r
+ for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
+ {\r
+ const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+ s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;\r
+ }\r
+ }\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Match Unrolled Cached\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
+ typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
+ float& bestDistance, int& bestTrainIdx, int& bestImgIdx)\r
+ {\r
+ for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
+ {\r
+ Dist dist;\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U> \r
-__device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)\r
-{\r
- #pragma unroll\r
- for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
- {\r
- const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
- s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;\r
- }\r
-}\r
-\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-__device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
- typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
- float& bestDistance, int& bestTrainIdx, int& bestImgIdx)\r
-{\r
- for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
- {\r
- Dist dist;\r
+ #pragma unroll\r
+ for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
+ {\r
+ const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
\r
- #pragma unroll\r
- for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
- {\r
- const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+ if (loadX < train.cols)\r
+ {\r
+ T val;\r
\r
- if (loadX < train.cols)\r
- {\r
- T val;\r
+ ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+ }\r
\r
- ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
- }\r
+ __syncthreads();\r
\r
- __syncthreads();\r
+ #pragma unroll\r
+ for (int j = 0; j < BLOCK_SIZE; ++j)\r
+ dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
\r
- #pragma unroll\r
- for (int j = 0; j < BLOCK_SIZE; ++j)\r
- dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+ __syncthreads();\r
+ }\r
\r
- __syncthreads();\r
- }\r
+ typename Dist::result_type distVal = dist;\r
\r
- typename Dist::result_type distVal = dist;\r
+ const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
\r
- const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
+ if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))\r
+ {\r
+ bestImgIdx = imgIdx;\r
+ bestDistance = distVal;\r
+ bestTrainIdx = trainIdx;\r
+ }\r
+ }\r
+ }\r
\r
- if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)\r
{\r
- bestImgIdx = imgIdx;\r
- bestDistance = distVal;\r
- bestTrainIdx = trainIdx;\r
- }\r
- }\r
-}\r
+ extern __shared__ int smem[];\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)\r
-{\r
- extern __shared__ int smem[];\r
+ const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
\r
- const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+ typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+ typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
\r
- typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
- typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
+ loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
\r
- loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
+ float myBestDistance = numeric_limits<float>::max();\r
+ int myBestTrainIdx = -1;\r
\r
- float myBestDistance = numeric_limits<float>::max();\r
- int myBestTrainIdx = -1;\r
+ loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);\r
\r
- loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);\r
+ __syncthreads();\r
\r
- __syncthreads();\r
+ float* s_distance = (float*)(smem);\r
+ int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
\r
- float* s_distance = (float*)(smem);\r
- int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);\r
\r
- findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);\r
+ if (queryIdx < query.rows && threadIdx.x == 0)\r
+ {\r
+ bestTrainIdx[queryIdx] = myBestTrainIdx;\r
+ bestDistance[queryIdx] = myBestDistance;\r
+ }\r
+ }\r
\r
- if (queryIdx < query.rows && threadIdx.x == 0)\r
- {\r
- bestTrainIdx[queryIdx] = myBestTrainIdx;\r
- bestDistance[queryIdx] = myBestDistance;\r
- }\r
-}\r
-\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
- cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
- const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
+ cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
\r
- const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
\r
- matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, \r
- int* bestTrainIdx, int* bestImgIdx, float* bestDistance)\r
-{\r
- extern __shared__ int smem[];\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, \r
+ int* bestTrainIdx, int* bestImgIdx, float* bestDistance)\r
+ {\r
+ extern __shared__ int smem[];\r
\r
- const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+ const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
\r
- typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
- typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
+ typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+ typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
\r
- loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
+ loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
\r
- float myBestDistance = numeric_limits<float>::max();\r
- int myBestTrainIdx = -1;\r
- int myBestImgIdx = -1;\r
+ float myBestDistance = numeric_limits<float>::max();\r
+ int myBestTrainIdx = -1;\r
+ int myBestImgIdx = -1;\r
\r
- Mask m = mask;\r
+ Mask m = mask;\r
\r
- for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
- {\r
- const DevMem2D_<T> train = trains[imgIdx];\r
- m.next();\r
- loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);\r
- }\r
+ for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
+ {\r
+ const DevMem2D_<T> train = trains[imgIdx];\r
+ m.next();\r
+ loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);\r
+ }\r
\r
- __syncthreads();\r
+ __syncthreads();\r
\r
- float* s_distance = (float*)(smem);\r
- int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
- int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
+ float* s_distance = (float*)(smem);\r
+ int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
\r
- findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);\r
+ findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);\r
\r
- if (queryIdx < query.rows && threadIdx.x == 0)\r
- {\r
- bestTrainIdx[queryIdx] = myBestTrainIdx;\r
- bestImgIdx[queryIdx] = myBestImgIdx;\r
- bestDistance[queryIdx] = myBestDistance;\r
- }\r
-}\r
-\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
- cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
- const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+ if (queryIdx < query.rows && threadIdx.x == 0)\r
+ {\r
+ bestTrainIdx[queryIdx] = myBestTrainIdx;\r
+ bestImgIdx[queryIdx] = myBestImgIdx;\r
+ bestDistance[queryIdx] = myBestDistance;\r
+ }\r
+ }\r
\r
- const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+ cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
\r
- matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Match Unrolled\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-__device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
- typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
- float& bestDistance, int& bestTrainIdx, int& bestImgIdx)\r
-{\r
- for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
- {\r
- Dist dist;\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Match Unrolled\r
\r
- #pragma unroll\r
- for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ __device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
+ typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
+ float& bestDistance, int& bestTrainIdx, int& bestImgIdx)\r
{\r
- const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+ for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
+ {\r
+ Dist dist;\r
\r
- s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+ #pragma unroll\r
+ for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
+ {\r
+ const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
\r
- if (loadX < query.cols)\r
- {\r
- T val;\r
+ s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+\r
+ if (loadX < query.cols)\r
+ {\r
+ T val;\r
+\r
+ ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);\r
+ s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
+\r
+ ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+ }\r
+\r
+ __syncthreads();\r
+\r
+ #pragma unroll\r
+ for (int j = 0; j < BLOCK_SIZE; ++j)\r
+ dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+\r
+ __syncthreads();\r
+ }\r
\r
- ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);\r
- s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
+ typename Dist::result_type distVal = dist;\r
\r
- ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+ const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
+\r
+ if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))\r
+ {\r
+ bestImgIdx = imgIdx;\r
+ bestDistance = distVal;\r
+ bestTrainIdx = trainIdx;\r
+ }\r
}\r
+ }\r
\r
- __syncthreads();\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>\r
+ __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)\r
+ {\r
+ extern __shared__ int smem[];\r
\r
- #pragma unroll\r
- for (int j = 0; j < BLOCK_SIZE; ++j)\r
- dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+ const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+\r
+ float myBestDistance = numeric_limits<float>::max();\r
+ int myBestTrainIdx = -1;\r
+\r
+ typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+ typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ \r
+ loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);\r
\r
__syncthreads();\r
- }\r
\r
- typename Dist::result_type distVal = dist;\r
+ float* s_distance = (float*)(smem);\r
+ int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
\r
- const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
+ findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);\r
\r
- if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))\r
+ if (queryIdx < query.rows && threadIdx.x == 0)\r
+ {\r
+ bestTrainIdx[queryIdx] = myBestTrainIdx;\r
+ bestDistance[queryIdx] = myBestDistance;\r
+ }\r
+ }\r
+\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
+ cudaStream_t stream)\r
{\r
- bestImgIdx = imgIdx;\r
- bestDistance = distVal;\r
- bestTrainIdx = trainIdx;\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+\r
+ const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+\r
+ matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
}\r
- }\r
-}\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>\r
-__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)\r
-{\r
- extern __shared__ int smem[];\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>\r
+ __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, \r
+ int* bestTrainIdx, int* bestImgIdx, float* bestDistance)\r
+ {\r
+ extern __shared__ int smem[];\r
\r
- const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+ const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
\r
- float myBestDistance = numeric_limits<float>::max();\r
- int myBestTrainIdx = -1;\r
+ float myBestDistance = numeric_limits<float>::max();\r
+ int myBestTrainIdx = -1;\r
+ int myBestImgIdx = -1;\r
\r
- typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
- typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
- \r
- loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);\r
+ typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+ typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
\r
- __syncthreads();\r
+ Mask m = mask;\r
+ \r
+ for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
+ {\r
+ const DevMem2D_<T> train = trains[imgIdx];\r
+ m.next();\r
+ loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);\r
+ }\r
\r
- float* s_distance = (float*)(smem);\r
- int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ __syncthreads();\r
\r
- findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);\r
+ float* s_distance = (float*)(smem);\r
+ int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
\r
- if (queryIdx < query.rows && threadIdx.x == 0)\r
- {\r
- bestTrainIdx[queryIdx] = myBestTrainIdx;\r
- bestDistance[queryIdx] = myBestDistance;\r
- }\r
-}\r
+ findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
- cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
- const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+ if (queryIdx < query.rows && threadIdx.x == 0)\r
+ {\r
+ bestTrainIdx[queryIdx] = myBestTrainIdx;\r
+ bestImgIdx[queryIdx] = myBestImgIdx;\r
+ bestDistance[queryIdx] = myBestDistance;\r
+ }\r
+ }\r
\r
- const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+ cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
\r
- matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>\r
-__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, \r
- int* bestTrainIdx, int* bestImgIdx, float* bestDistance)\r
-{\r
- extern __shared__ int smem[];\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Match\r
\r
- float myBestDistance = numeric_limits<float>::max();\r
- int myBestTrainIdx = -1;\r
- int myBestImgIdx = -1;\r
+ template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+ __device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
+ typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
+ float& bestDistance, int& bestTrainIdx, int& bestImgIdx)\r
+ {\r
+ for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
+ {\r
+ Dist dist;\r
\r
- typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
- typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
+ {\r
+ const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
\r
- Mask m = mask;\r
- \r
- for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
- {\r
- const DevMem2D_<T> train = trains[imgIdx];\r
- m.next();\r
- loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);\r
- }\r
+ s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
\r
- __syncthreads();\r
+ if (loadX < query.cols)\r
+ {\r
+ T val;\r
\r
- float* s_distance = (float*)(smem);\r
- int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
- int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
+ ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);\r
+ s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
\r
- findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);\r
+ ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+ }\r
\r
- if (queryIdx < query.rows && threadIdx.x == 0)\r
- {\r
- bestTrainIdx[queryIdx] = myBestTrainIdx;\r
- bestImgIdx[queryIdx] = myBestImgIdx;\r
- bestDistance[queryIdx] = myBestDistance;\r
- }\r
-}\r
-\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
- cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
- const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+ __syncthreads();\r
\r
- const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ #pragma unroll\r
+ for (int j = 0; j < BLOCK_SIZE; ++j)\r
+ dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
\r
- matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ __syncthreads();\r
+ }\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ typename Dist::result_type distVal = dist;\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Match\r
+ const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
\r
-template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-__device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
- typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
- float& bestDistance, int& bestTrainIdx, int& bestImgIdx)\r
-{\r
- for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
- {\r
- Dist dist;\r
+ if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))\r
+ {\r
+ bestImgIdx = imgIdx;\r
+ bestDistance = distVal;\r
+ bestTrainIdx = trainIdx;\r
+ }\r
+ }\r
+ }\r
\r
- for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
+ template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>\r
+ __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)\r
{\r
- const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+ extern __shared__ int smem[];\r
\r
- s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+ const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
\r
- if (loadX < query.cols)\r
- {\r
- T val;\r
+ float myBestDistance = numeric_limits<float>::max();\r
+ int myBestTrainIdx = -1;\r
\r
- ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);\r
- s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
-\r
- ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
- }\r
+ typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+ typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ \r
+ loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);\r
\r
__syncthreads();\r
\r
- #pragma unroll\r
- for (int j = 0; j < BLOCK_SIZE; ++j)\r
- dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+ float* s_distance = (float*)(smem);\r
+ int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
\r
- __syncthreads();\r
+ findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);\r
+\r
+ if (queryIdx < query.rows && threadIdx.x == 0)\r
+ {\r
+ bestTrainIdx[queryIdx] = myBestTrainIdx;\r
+ bestDistance[queryIdx] = myBestDistance;\r
+ }\r
}\r
\r
- typename Dist::result_type distVal = dist;\r
+ template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+ void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
+ cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+\r
+ const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
\r
- const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
+ match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))\r
- {\r
- bestImgIdx = imgIdx;\r
- bestDistance = distVal;\r
- bestTrainIdx = trainIdx;\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
}\r
- }\r
-}\r
\r
-template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>\r
-__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)\r
-{\r
- extern __shared__ int smem[];\r
+ template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>\r
+ __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, \r
+ int* bestTrainIdx, int* bestImgIdx, float* bestDistance)\r
+ {\r
+ extern __shared__ int smem[];\r
\r
- const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+ const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
\r
- float myBestDistance = numeric_limits<float>::max();\r
- int myBestTrainIdx = -1;\r
+ float myBestDistance = numeric_limits<float>::max();\r
+ int myBestTrainIdx = -1;\r
+ int myBestImgIdx = -1;\r
\r
- typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
- typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
- \r
- loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);\r
+ typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+ typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
\r
- __syncthreads();\r
+ Mask m = mask;\r
+ for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
+ {\r
+ const DevMem2D_<T> train = trains[imgIdx];\r
+ m.next();\r
+ loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);\r
+ }\r
\r
- float* s_distance = (float*)(smem);\r
- int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ __syncthreads();\r
\r
- findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);\r
+ float* s_distance = (float*)(smem);\r
+ int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
\r
- if (queryIdx < query.rows && threadIdx.x == 0)\r
- {\r
- bestTrainIdx[queryIdx] = myBestTrainIdx;\r
- bestDistance[queryIdx] = myBestDistance;\r
- }\r
-}\r
-\r
-template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
- cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
- const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
-\r
- const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);\r
\r
- match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (queryIdx < query.rows && threadIdx.x == 0)\r
+ {\r
+ bestTrainIdx[queryIdx] = myBestTrainIdx;\r
+ bestImgIdx[queryIdx] = myBestImgIdx;\r
+ bestDistance[queryIdx] = myBestDistance;\r
+ }\r
+ }\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+ void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+ cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
\r
-template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>\r
-__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, \r
- int* bestTrainIdx, int* bestImgIdx, float* bestDistance)\r
-{\r
- extern __shared__ int smem[];\r
+ const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
\r
- const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+ match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- float myBestDistance = numeric_limits<float>::max();\r
- int myBestTrainIdx = -1;\r
- int myBestImgIdx = -1;\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
- typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Match dispatcher\r
\r
- Mask m = mask;\r
- for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
- {\r
- const DevMem2D_<T> train = trains[imgIdx];\r
- m.next();\r
- loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);\r
- }\r
+ template <typename Dist, typename T, typename Mask> \r
+ void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (query.cols <= 64)\r
+ {\r
+ matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);\r
+ }\r
+ else if (query.cols <= 128)\r
+ {\r
+ matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream);\r
+ }\r
+ /*else if (query.cols <= 256)\r
+ {\r
+ matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);\r
+ }\r
+ else if (query.cols <= 512)\r
+ { \r
+ matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);\r
+ }\r
+ else if (query.cols <= 1024)\r
+ { \r
+ matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);\r
+ }*/\r
+ else\r
+ {\r
+ match<16, Dist>(query, train, mask, trainIdx, distance, stream);\r
+ }\r
+ }\r
\r
- __syncthreads();\r
+ template <typename Dist, typename T, typename Mask> \r
+ void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (query.cols <= 64)\r
+ {\r
+ matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
+ }\r
+ else if (query.cols <= 128)\r
+ {\r
+ matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
+ }\r
+ /*else if (query.cols <= 256)\r
+ {\r
+ matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
+ }\r
+ else if (query.cols <= 512)\r
+ { \r
+ matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
+ }\r
+ else if (query.cols <= 1024)\r
+ { \r
+ matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
+ }*/\r
+ else\r
+ {\r
+ match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
+ }\r
+ }\r
\r
- float* s_distance = (float*)(smem);\r
- int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
- int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Match caller\r
\r
- findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);\r
+ template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance,\r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (mask.data)\r
+ {\r
+ matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask), \r
+ trainIdx, distance, \r
+ cc, stream);\r
+ }\r
+ else\r
+ {\r
+ matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(), \r
+ trainIdx, distance, \r
+ cc, stream);\r
+ }\r
+ }\r
\r
- if (queryIdx < query.rows && threadIdx.x == 0)\r
- {\r
- bestTrainIdx[queryIdx] = myBestTrainIdx;\r
- bestImgIdx[queryIdx] = myBestImgIdx;\r
- bestDistance[queryIdx] = myBestDistance;\r
- }\r
-}\r
-\r
-template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
- cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
- const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+ template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ //template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
\r
- const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (mask.data)\r
+ {\r
+ matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask), \r
+ trainIdx, distance, \r
+ cc, stream);\r
+ }\r
+ else\r
+ {\r
+ matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(), \r
+ trainIdx, distance, \r
+ cc, stream);\r
+ }\r
+ }\r
\r
- match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ //template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (mask.data)\r
+ {\r
+ matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask), \r
+ trainIdx, distance, \r
+ cc, stream);\r
+ }\r
+ else\r
+ {\r
+ matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(), \r
+ trainIdx, distance, \r
+ cc, stream);\r
+ }\r
+ }\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Match dispatcher\r
+ template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ //template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ //template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ template void matchHamming_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
\r
-template <typename Dist, typename T, typename Mask> \r
-void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (query.cols <= 64)\r
- {\r
- matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);\r
- }\r
- else if (query.cols <= 128)\r
- {\r
- matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream);\r
- }\r
- /*else if (query.cols <= 256)\r
- {\r
- matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);\r
- }\r
- else if (query.cols <= 512)\r
- { \r
- matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);\r
- }\r
- else if (query.cols <= 1024)\r
- { \r
- matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);\r
- }*/\r
- else\r
- {\r
- match<16, Dist>(query, train, mask, trainIdx, distance, stream);\r
- }\r
-}\r
-\r
-template <typename Dist, typename T, typename Mask> \r
-void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (query.cols <= 64)\r
- {\r
- matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
- }\r
- else if (query.cols <= 128)\r
- {\r
- matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
- }\r
- /*else if (query.cols <= 256)\r
- {\r
- matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
- }\r
- else if (query.cols <= 512)\r
- { \r
- matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
- }\r
- else if (query.cols <= 1024)\r
- { \r
- matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
- }*/\r
- else\r
- {\r
- match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
- }\r
-}\r
+ template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (masks.data)\r
+ {\r
+ matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), \r
+ trainIdx, imgIdx, distance, \r
+ cc, stream);\r
+ }\r
+ else\r
+ {\r
+ matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), \r
+ trainIdx, imgIdx, distance, \r
+ cc, stream);\r
+ }\r
+ }\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Match caller\r
+ template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ //template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
\r
-template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance,\r
- int cc, cudaStream_t stream)\r
-{\r
- if (mask.data)\r
- {\r
- matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask), \r
- trainIdx, distance, \r
- cc, stream);\r
- }\r
- else\r
- {\r
- matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(), \r
- trainIdx, distance, \r
- cc, stream);\r
- }\r
-}\r
-\r
-template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-\r
-template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (mask.data)\r
- {\r
- matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask), \r
- trainIdx, distance, \r
- cc, stream);\r
- }\r
- else\r
- {\r
- matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(), \r
- trainIdx, distance, \r
- cc, stream);\r
- }\r
-}\r
-\r
-//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-\r
-template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (mask.data)\r
- {\r
- matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask), \r
- trainIdx, distance, \r
- cc, stream);\r
- }\r
- else\r
- {\r
- matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(), \r
- trainIdx, distance, \r
- cc, stream);\r
- }\r
-}\r
-\r
-template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-template void matchHamming_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-\r
-template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (masks.data)\r
- {\r
- matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), \r
- trainIdx, imgIdx, distance, \r
- cc, stream);\r
- }\r
- else\r
- {\r
- matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), \r
- trainIdx, imgIdx, distance, \r
- cc, stream);\r
- }\r
-}\r
-\r
-template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-//template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-\r
-template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (masks.data)\r
- {\r
- matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), \r
- trainIdx, imgIdx, distance, \r
- cc, stream);\r
- }\r
- else\r
- {\r
- matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), \r
- trainIdx, imgIdx, distance, \r
- cc, stream);\r
- }\r
-}\r
-\r
-//template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-\r
-template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (masks.data)\r
- {\r
- matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), \r
- trainIdx, imgIdx, distance, \r
- cc, stream);\r
- }\r
- else\r
- {\r
- matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), \r
- trainIdx, imgIdx, distance, \r
- cc, stream);\r
- }\r
-}\r
+ template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (masks.data)\r
+ {\r
+ matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), \r
+ trainIdx, imgIdx, distance, \r
+ cc, stream);\r
+ }\r
+ else\r
+ {\r
+ matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), \r
+ trainIdx, imgIdx, distance, \r
+ cc, stream);\r
+ }\r
+ }\r
\r
-template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-//template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-//template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-template void matchHamming_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
\r
-} // namespace bf_match\r
+ template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (masks.data)\r
+ {\r
+ matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), \r
+ trainIdx, imgIdx, distance, \r
+ cc, stream);\r
+ }\r
+ else\r
+ {\r
+ matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), \r
+ trainIdx, imgIdx, distance, \r
+ cc, stream);\r
+ }\r
+ }\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ //template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ //template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ template void matchHamming_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+ } // namespace bf_match\r
+}}} // namespace cv { namespace gpu { namespace device {\r
#include "opencv2/gpu/device/vec_distance.hpp"\r
#include "opencv2/gpu/device/datamov_utils.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace bf_radius_match {\r
-\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Match Unrolled\r
-\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>\r
-__global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,\r
- PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)\r
+namespace cv { namespace gpu { namespace device \r
{\r
- #if __CUDA_ARCH__ >= 110\r
-\r
- extern __shared__ int smem[];\r
-\r
- const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;\r
- const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;\r
-\r
- typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
- typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ namespace bf_radius_match \r
+ {\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Match Unrolled\r
\r
- Dist dist;\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>\r
+ __global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,\r
+ PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)\r
+ {\r
+ #if __CUDA_ARCH__ >= 110\r
\r
- #pragma unroll\r
- for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
- {\r
- const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+ extern __shared__ int smem[];\r
\r
- s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+ const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;\r
+ const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;\r
\r
- if (loadX < query.cols)\r
- {\r
- T val;\r
+ typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+ typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
\r
- ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);\r
- s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
+ Dist dist;\r
\r
- ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
- }\r
+ #pragma unroll\r
+ for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
+ {\r
+ const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
\r
- __syncthreads();\r
+ s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
\r
- #pragma unroll\r
- for (int j = 0; j < BLOCK_SIZE; ++j)\r
- dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+ if (loadX < query.cols)\r
+ {\r
+ T val;\r
\r
- __syncthreads();\r
- }\r
+ ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);\r
+ s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
\r
- float distVal = (typename Dist::result_type)dist;\r
+ ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+ }\r
\r
- if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)\r
- {\r
- unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);\r
- if (ind < maxCount)\r
- {\r
- bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;\r
- if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;\r
- bestDistance.ptr(queryIdx)[ind] = distVal;\r
- }\r
- }\r
+ __syncthreads();\r
\r
- #endif\r
-}\r
+ #pragma unroll\r
+ for (int j = 0; j < BLOCK_SIZE; ++j)\r
+ dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
- const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
+ __syncthreads();\r
+ }\r
\r
- const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ float distVal = (typename Dist::result_type)dist;\r
\r
- matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask, \r
- trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)\r
+ {\r
+ unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);\r
+ if (ind < maxCount)\r
+ {\r
+ bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;\r
+ if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;\r
+ bestDistance.ptr(queryIdx)[ind] = distVal;\r
+ }\r
+ }\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-} \r
+ #endif\r
+ }\r
\r
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T> \r
-void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+ void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
\r
- const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
\r
- for (int i = 0; i < n; ++i)\r
- {\r
- const DevMem2D_<T> train = trains[i];\r
+ matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask, \r
+ trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ } \r
\r
- if (masks != 0 && masks[i].data)\r
- {\r
- matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]), \r
- trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);\r
- }\r
- else\r
+ template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T> \r
+ void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ cudaStream_t stream)\r
{\r
- matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(), \r
- trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+\r
+ const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+\r
+ for (int i = 0; i < n; ++i)\r
+ {\r
+ const DevMem2D_<T> train = trains[i];\r
+\r
+ const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
+\r
+ if (masks != 0 && masks[i].data)\r
+ {\r
+ matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]), \r
+ trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);\r
+ }\r
+ else\r
+ {\r
+ matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(), \r
+ trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);\r
+ }\r
+ cudaSafeCall( cudaGetLastError() );\r
+ }\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
}\r
- cudaSafeCall( cudaGetLastError() );\r
- }\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Match\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Match\r
+ template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>\r
+ __global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,\r
+ PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)\r
+ {\r
+ #if __CUDA_ARCH__ >= 110\r
\r
-template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>\r
-__global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,\r
- PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)\r
-{\r
- #if __CUDA_ARCH__ >= 110\r
+ extern __shared__ int smem[];\r
\r
- extern __shared__ int smem[];\r
+ const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;\r
+ const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;\r
\r
- const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;\r
- const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;\r
+ typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+ typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
\r
- typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
- typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+ Dist dist;\r
\r
- Dist dist;\r
+ for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
+ {\r
+ const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
\r
- for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
- {\r
- const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+ s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
\r
- s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+ if (loadX < query.cols)\r
+ {\r
+ T val;\r
\r
- if (loadX < query.cols)\r
- {\r
- T val;\r
+ ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);\r
+ s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
\r
- ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);\r
- s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
+ ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
+ s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+ }\r
\r
- ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
- s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
- }\r
+ __syncthreads();\r
\r
- __syncthreads();\r
+ #pragma unroll\r
+ for (int j = 0; j < BLOCK_SIZE; ++j)\r
+ dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
\r
- #pragma unroll\r
- for (int j = 0; j < BLOCK_SIZE; ++j)\r
- dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+ __syncthreads();\r
+ }\r
\r
- __syncthreads();\r
- }\r
+ float distVal = (typename Dist::result_type)dist;\r
\r
- float distVal = (typename Dist::result_type)dist;\r
+ if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)\r
+ {\r
+ unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);\r
+ if (ind < maxCount)\r
+ {\r
+ bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;\r
+ if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;\r
+ bestDistance.ptr(queryIdx)[ind] = distVal;\r
+ }\r
+ }\r
\r
- if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)\r
- {\r
- unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);\r
- if (ind < maxCount)\r
- {\r
- bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;\r
- if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;\r
- bestDistance.ptr(queryIdx)[ind] = distVal;\r
+ #endif\r
}\r
- }\r
\r
- #endif\r
-}\r
+ template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+ void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
\r
-template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
- const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
+ const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
\r
- const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask, \r
+ trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask, \r
- trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ template <int BLOCK_SIZE, typename Dist, typename T> \r
+ void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ cudaStream_t stream)\r
+ {\r
+ const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+\r
+ const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+\r
+ for (int i = 0; i < n; ++i)\r
+ {\r
+ const DevMem2D_<T> train = trains[i];\r
+\r
+ const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
+\r
+ if (masks != 0 && masks[i].data)\r
+ {\r
+ match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]), \r
+ trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);\r
+ }\r
+ else\r
+ {\r
+ match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(), \r
+ trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);\r
+ }\r
+ cudaSafeCall( cudaGetLastError() );\r
+ }\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-template <int BLOCK_SIZE, typename Dist, typename T> \r
-void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- cudaStream_t stream)\r
-{\r
- const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Match dispatcher\r
\r
- const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+ template <typename Dist, typename T, typename Mask> \r
+ void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (query.cols <= 64)\r
+ {\r
+ matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
+ }\r
+ else if (query.cols <= 128)\r
+ {\r
+ matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
+ }\r
+ /*else if (query.cols <= 256)\r
+ {\r
+ matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
+ }\r
+ else if (query.cols <= 512)\r
+ { \r
+ matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
+ }\r
+ else if (query.cols <= 1024)\r
+ { \r
+ matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
+ }*/\r
+ else\r
+ {\r
+ match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
+ }\r
+ }\r
\r
- for (int i = 0; i < n; ++i)\r
- {\r
- const DevMem2D_<T> train = trains[i];\r
+ template <typename Dist, typename T> \r
+ void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (query.cols <= 64)\r
+ {\r
+ matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
+ }\r
+ else if (query.cols <= 128)\r
+ {\r
+ matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
+ }\r
+ /*else if (query.cols <= 256)\r
+ {\r
+ matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
+ }\r
+ else if (query.cols <= 512)\r
+ { \r
+ matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
+ }\r
+ else if (query.cols <= 1024)\r
+ { \r
+ matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
+ }*/\r
+ else\r
+ {\r
+ match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
+ }\r
+ } \r
+\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Radius Match caller\r
+\r
+ template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ if (mask.data)\r
+ {\r
+ matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), \r
+ trainIdx, distance, nMatches, \r
+ cc, stream);\r
+ }\r
+ else\r
+ {\r
+ matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), \r
+ trainIdx, distance, nMatches, \r
+ cc, stream);\r
+ }\r
+ }\r
\r
- const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
+ template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ //template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
\r
- if (masks != 0 && masks[i].data)\r
+ template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ int cc, cudaStream_t stream)\r
{\r
- match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]), \r
- trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);\r
+ if (mask.data)\r
+ {\r
+ matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), \r
+ trainIdx, distance, nMatches, \r
+ cc, stream);\r
+ }\r
+ else\r
+ {\r
+ matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), \r
+ trainIdx, distance, nMatches, \r
+ cc, stream);\r
+ }\r
}\r
- else\r
+\r
+ //template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+\r
+ template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ int cc, cudaStream_t stream)\r
{\r
- match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(), \r
- trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);\r
+ if (mask.data)\r
+ {\r
+ matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), \r
+ trainIdx, distance, nMatches, \r
+ cc, stream);\r
+ }\r
+ else\r
+ {\r
+ matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), \r
+ trainIdx, distance, nMatches, \r
+ cc, stream);\r
+ }\r
}\r
- cudaSafeCall( cudaGetLastError() );\r
- }\r
-\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Match dispatcher\r
+ template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ //template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ //template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ template void matchHamming_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
\r
-template <typename Dist, typename T, typename Mask> \r
-void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (query.cols <= 64)\r
- {\r
- matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
- }\r
- else if (query.cols <= 128)\r
- {\r
- matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
- }\r
- /*else if (query.cols <= 256)\r
- {\r
- matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
- }\r
- else if (query.cols <= 512)\r
- { \r
- matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
- }\r
- else if (query.cols <= 1024)\r
- { \r
- matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
- }*/\r
- else\r
- {\r
- match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
- }\r
-}\r
-\r
-template <typename Dist, typename T> \r
-void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (query.cols <= 64)\r
- {\r
- matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
- }\r
- else if (query.cols <= 128)\r
- {\r
- matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
- }\r
- /*else if (query.cols <= 256)\r
- {\r
- matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
- }\r
- else if (query.cols <= 512)\r
- { \r
- matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
- }\r
- else if (query.cols <= 1024)\r
- { \r
- matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
- }*/\r
- else\r
- {\r
- match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
- }\r
-} \r
+ template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks, \r
+ trainIdx, imgIdx, distance, nMatches, \r
+ cc, stream);\r
+ }\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Radius Match caller\r
+ template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ //template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
\r
-template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (mask.data)\r
- {\r
- matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), \r
- trainIdx, distance, nMatches, \r
- cc, stream);\r
- }\r
- else\r
- {\r
- matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), \r
- trainIdx, distance, nMatches, \r
- cc, stream);\r
- }\r
-}\r
-\r
-template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-\r
-template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (mask.data)\r
- {\r
- matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), \r
- trainIdx, distance, nMatches, \r
- cc, stream);\r
- }\r
- else\r
- {\r
- matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), \r
- trainIdx, distance, nMatches, \r
- cc, stream);\r
- }\r
-}\r
-\r
-//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-\r
-template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
- const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- int cc, cudaStream_t stream)\r
-{\r
- if (mask.data)\r
- {\r
- matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), \r
- trainIdx, distance, nMatches, \r
- cc, stream);\r
- }\r
- else\r
- {\r
- matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), \r
- trainIdx, distance, nMatches, \r
- cc, stream);\r
- }\r
-}\r
-\r
-template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-template void matchHamming_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-\r
-template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- int cc, cudaStream_t stream)\r
-{\r
- matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks, \r
- trainIdx, imgIdx, distance, nMatches, \r
- cc, stream);\r
-}\r
-\r
-template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-//template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-\r
-template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- int cc, cudaStream_t stream)\r
-{\r
- matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks, \r
- trainIdx, imgIdx, distance, nMatches, \r
- cc, stream);\r
-}\r
-\r
-//template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-//template void matchL2_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-\r
-template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
- const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
- int cc, cudaStream_t stream)\r
-{\r
- matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks, \r
- trainIdx, imgIdx, distance, nMatches, \r
- cc, stream);\r
-}\r
+ template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks, \r
+ trainIdx, imgIdx, distance, nMatches, \r
+ cc, stream);\r
+ }\r
\r
-template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-//template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-//template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-template void matchHamming_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ //template void matchL2_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
\r
-} // namespace bf_radius_match\r
+ template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+ int cc, cudaStream_t stream)\r
+ {\r
+ matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks, \r
+ trainIdx, imgIdx, distance, nMatches, \r
+ cc, stream);\r
+ }\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ //template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ //template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ template void matchHamming_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+ } // namespace bf_radius_match\r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "internal_shared.hpp"\r
#include "opencv2/gpu/device/limits.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace bilateral_filter {\r
-\r
-__constant__ float* ctable_color;\r
-__constant__ float* ctable_space;\r
-__constant__ size_t ctable_space_step;\r
-\r
-__constant__ int cndisp;\r
-__constant__ int cradius;\r
-\r
-__constant__ short cedge_disc;\r
-__constant__ short cmax_disc;\r
-\r
-void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc)\r
-{\r
- cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );\r
- size_t table_space_step = table_space.step / sizeof(float);\r
- cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );\r
-\r
- cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );\r
-\r
- cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );\r
-}\r
-\r
-template <int channels>\r
-struct DistRgbMax\r
-{\r
- static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)\r
- {\r
- uchar x = ::abs(a[0] - b[0]);\r
- uchar y = ::abs(a[1] - b[1]);\r
- uchar z = ::abs(a[2] - b[2]);\r
- return (::max(::max(x, y), z));\r
- }\r
-};\r
-\r
-template <>\r
-struct DistRgbMax<1>\r
+namespace cv { namespace gpu { namespace device \r
{\r
- static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)\r
+ namespace bilateral_filter \r
{\r
- return ::abs(a[0] - b[0]);\r
- }\r
-};\r
+ __constant__ float* ctable_color;\r
+ __constant__ float* ctable_space;\r
+ __constant__ size_t ctable_space_step;\r
\r
-template <int channels, typename T>\r
-__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)\r
-{\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
- const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);\r
-\r
- T dp[5];\r
+ __constant__ int cndisp;\r
+ __constant__ int cradius;\r
\r
- if (y > 0 && y < h - 1 && x > 0 && x < w - 1)\r
- {\r
- dp[0] = *(disp + (y ) * disp_step + x + 0);\r
- dp[1] = *(disp + (y-1) * disp_step + x + 0);\r
- dp[2] = *(disp + (y ) * disp_step + x - 1);\r
- dp[3] = *(disp + (y+1) * disp_step + x + 0);\r
- dp[4] = *(disp + (y ) * disp_step + x + 1);\r
+ __constant__ short cedge_disc;\r
+ __constant__ short cmax_disc;\r
\r
- if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc) \r
+ void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc)\r
{\r
- const int ymin = ::max(0, y - cradius);\r
- const int xmin = ::max(0, x - cradius);\r
- const int ymax = ::min(h - 1, y + cradius);\r
- const int xmax = ::min(w - 1, x + cradius);\r
+ cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );\r
+ size_t table_space_step = table_space.step / sizeof(float);\r
+ cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );\r
\r
- float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};\r
+ cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );\r
\r
- const uchar* ic = img + y * img_step + channels * x;\r
+ cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );\r
+ }\r
\r
- for(int yi = ymin; yi <= ymax; yi++)\r
+ template <int channels>\r
+ struct DistRgbMax\r
+ {\r
+ static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)\r
{\r
- const T* disp_y = disp + yi * disp_step;\r
+ uchar x = ::abs(a[0] - b[0]);\r
+ uchar y = ::abs(a[1] - b[1]);\r
+ uchar z = ::abs(a[2] - b[2]);\r
+ return (::max(::max(x, y), z));\r
+ }\r
+ };\r
\r
- for(int xi = xmin; xi <= xmax; xi++)\r
- {\r
- const uchar* in = img + yi * img_step + channels * xi;\r
+ template <>\r
+ struct DistRgbMax<1>\r
+ {\r
+ static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)\r
+ {\r
+ return ::abs(a[0] - b[0]);\r
+ }\r
+ };\r
\r
- uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);\r
+ template <int channels, typename T>\r
+ __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)\r
+ {\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);\r
\r
- const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];\r
+ T dp[5];\r
\r
- const T disp_reg = disp_y[xi];\r
+ if (y > 0 && y < h - 1 && x > 0 && x < w - 1)\r
+ {\r
+ dp[0] = *(disp + (y ) * disp_step + x + 0);\r
+ dp[1] = *(disp + (y-1) * disp_step + x + 0);\r
+ dp[2] = *(disp + (y ) * disp_step + x - 1);\r
+ dp[3] = *(disp + (y+1) * disp_step + x + 0);\r
+ dp[4] = *(disp + (y ) * disp_step + x + 1);\r
\r
- cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight;\r
- cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;\r
- cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;\r
- cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;\r
- cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;\r
+ if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc) \r
+ {\r
+ const int ymin = ::max(0, y - cradius);\r
+ const int xmin = ::max(0, x - cradius);\r
+ const int ymax = ::min(h - 1, y + cradius);\r
+ const int xmax = ::min(w - 1, x + cradius);\r
+\r
+ float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};\r
+\r
+ const uchar* ic = img + y * img_step + channels * x;\r
+\r
+ for(int yi = ymin; yi <= ymax; yi++)\r
+ {\r
+ const T* disp_y = disp + yi * disp_step;\r
+\r
+ for(int xi = xmin; xi <= xmax; xi++)\r
+ {\r
+ const uchar* in = img + yi * img_step + channels * xi;\r
+\r
+ uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);\r
+\r
+ const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];\r
+\r
+ const T disp_reg = disp_y[xi];\r
+\r
+ cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight;\r
+ cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;\r
+ cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;\r
+ cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;\r
+ cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;\r
+ }\r
+ }\r
+\r
+ float minimum = numeric_limits<float>::max();\r
+ int id = 0;\r
+\r
+ if (cost[0] < minimum)\r
+ {\r
+ minimum = cost[0];\r
+ id = 0;\r
+ }\r
+ if (cost[1] < minimum)\r
+ {\r
+ minimum = cost[1];\r
+ id = 1;\r
+ }\r
+ if (cost[2] < minimum)\r
+ {\r
+ minimum = cost[2];\r
+ id = 2;\r
+ }\r
+ if (cost[3] < minimum)\r
+ {\r
+ minimum = cost[3];\r
+ id = 3;\r
+ }\r
+ if (cost[4] < minimum)\r
+ {\r
+ minimum = cost[4];\r
+ id = 4;\r
+ }\r
+\r
+ *(disp + y * disp_step + x) = dp[id];\r
}\r
}\r
+ }\r
\r
- float minimum = numeric_limits<float>::max();\r
- int id = 0;\r
+ template <typename T> \r
+ void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
+ grid.x = divUp(disp.cols, threads.x << 1);\r
+ grid.y = divUp(disp.rows, threads.y);\r
\r
- if (cost[0] < minimum)\r
+ switch (channels)\r
{\r
- minimum = cost[0];\r
- id = 0;\r
- }\r
- if (cost[1] < minimum)\r
- {\r
- minimum = cost[1];\r
- id = 1;\r
- }\r
- if (cost[2] < minimum)\r
- {\r
- minimum = cost[2];\r
- id = 2;\r
- }\r
- if (cost[3] < minimum)\r
- {\r
- minimum = cost[3];\r
- id = 3;\r
- }\r
- if (cost[4] < minimum)\r
- {\r
- minimum = cost[4];\r
- id = 4;\r
+ case 1:\r
+ for (int i = 0; i < iters; ++i)\r
+ {\r
+ bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
+ }\r
+ break;\r
+ case 3:\r
+ for (int i = 0; i < iters; ++i)\r
+ {\r
+ bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
+ }\r
+ break;\r
+ default:\r
+ cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
}\r
\r
- *(disp + y * disp_step + x) = dp[id];\r
+ if (stream != 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
}\r
- }\r
-}\r
\r
-template <typename T> \r
-void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
- grid.x = divUp(disp.cols, threads.x << 1);\r
- grid.y = divUp(disp.rows, threads.y);\r
-\r
- switch (channels)\r
- {\r
- case 1:\r
- for (int i = 0; i < iters; ++i)\r
+ void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)\r
{\r
- bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
- cudaSafeCall( cudaGetLastError() );\r
+ bilateral_filter_caller(disp, img, channels, iters, stream);\r
}\r
- break;\r
- case 3:\r
- for (int i = 0; i < iters; ++i)\r
- {\r
- bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
- cudaSafeCall( cudaGetLastError() );\r
\r
- bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
- cudaSafeCall( cudaGetLastError() );\r
+ void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)\r
+ {\r
+ bilateral_filter_caller(disp, img, channels, iters, stream);\r
}\r
- break;\r
- default:\r
- cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
- }\r
-\r
- if (stream != 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
-void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)\r
-{\r
- bilateral_filter_caller(disp, img, channels, iters, stream);\r
-}\r
-\r
-void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)\r
-{\r
- bilateral_filter_caller(disp, img, channels, iters, stream);\r
-}\r
-\r
-} // namespace bilateral_filter\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ } // namespace bilateral_filter\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#include "internal_shared.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace blend {\r
-\r
-template <typename T>\r
-__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,\r
- const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)\r
+namespace cv { namespace gpu { namespace device \r
{\r
- int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- if (y < rows && x < cols)\r
+ namespace blend \r
{\r
- int x_ = x / cn;\r
- float w1 = weights1.ptr(y)[x_];\r
- float w2 = weights2.ptr(y)[x_];\r
- T p1 = img1.ptr(y)[x];\r
- T p2 = img2.ptr(y)[x];\r
- result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);\r
- }\r
-} \r
-\r
-template <typename T>\r
-void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)\r
-{\r
- dim3 threads(16, 16);\r
- dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));\r
- \r
- blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);\r
- cudaSafeCall( cudaGetLastError() );\r
+ template <typename T>\r
+ __global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,\r
+ const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)\r
+ {\r
+ int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- if (stream == 0)\r
- cudaSafeCall(cudaDeviceSynchronize());\r
-}\r
+ if (y < rows && x < cols)\r
+ {\r
+ int x_ = x / cn;\r
+ float w1 = weights1.ptr(y)[x_];\r
+ float w2 = weights2.ptr(y)[x_];\r
+ T p1 = img1.ptr(y)[x];\r
+ T p2 = img2.ptr(y)[x];\r
+ result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);\r
+ }\r
+ } \r
\r
-template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);\r
-template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);\r
+ template <typename T>\r
+ void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)\r
+ {\r
+ dim3 threads(16, 16);\r
+ dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));\r
+ \r
+ blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaDeviceSynchronize());\r
+ }\r
\r
-__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,\r
- const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)\r
-{\r
- int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);\r
+ template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);\r
\r
- if (y < rows && x < cols)\r
- {\r
- float w1 = weights1.ptr(y)[x];\r
- float w2 = weights2.ptr(y)[x];\r
- float sum_inv = 1.f / (w1 + w2 + 1e-5f);\r
- w1 *= sum_inv;\r
- w2 *= sum_inv;\r
- uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];\r
- uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];\r
- ((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,\r
- p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);\r
- }\r
-}\r
\r
-void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)\r
-{\r
- dim3 threads(16, 16);\r
- dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
- \r
- blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);\r
- cudaSafeCall( cudaGetLastError() );\r
+ __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,\r
+ const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)\r
+ {\r
+ int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- if (stream == 0)\r
- cudaSafeCall(cudaDeviceSynchronize());\r
-}\r
+ if (y < rows && x < cols)\r
+ {\r
+ float w1 = weights1.ptr(y)[x];\r
+ float w2 = weights2.ptr(y)[x];\r
+ float sum_inv = 1.f / (w1 + w2 + 1e-5f);\r
+ w1 *= sum_inv;\r
+ w2 *= sum_inv;\r
+ uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];\r
+ uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];\r
+ ((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,\r
+ p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);\r
+ }\r
+ }\r
\r
-} // namespace blend \r
+ void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)\r
+ {\r
+ dim3 threads(16, 16);\r
+ dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+ \r
+ blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaDeviceSynchronize());\r
+ }\r
+ } // namespace blend \r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
-
-namespace transform_points
+namespace cv { namespace gpu { namespace device
{
- __constant__ float3 crot0;
- __constant__ float3 crot1;
- __constant__ float3 crot2;
- __constant__ float3 ctransl;
+ #define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
- struct TransformOp : unary_function<float3, float3>
+ namespace transform_points
{
- __device__ __forceinline__ float3 operator()(const float3& p) const
+ __constant__ float3 crot0;
+ __constant__ float3 crot1;
+ __constant__ float3 crot2;
+ __constant__ float3 ctransl;
+
+ struct TransformOp : unary_function<float3, float3>
{
- return make_float3(
- crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
- crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
- crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
+ __device__ __forceinline__ float3 operator()(const float3& p) const
+ {
+ return make_float3(
+ crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
+ crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
+ crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
+ }
+ };
+
+ void call(const DevMem2D_<float3> src, const float* rot,
+ const float* transl, DevMem2D_<float3> dst,
+ cudaStream_t stream)
+ {
+ cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
+ cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
+ cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
+ cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
+ ::cv::gpu::device::transform(src, dst, TransformOp(), stream);
}
- };
+ } // namespace transform_points
- void call(const DevMem2D_<float3> src, const float* rot,
- const float* transl, DevMem2D_<float3> dst,
- cudaStream_t stream)
- {
- cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
- cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
- cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
- cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
- OPENCV_DEVICE_NAMESPACE_ transform(src, dst, TransformOp(), stream);
- }
-} // namespace transform_points
-
-namespace project_points
-{
- __constant__ float3 crot0;
- __constant__ float3 crot1;
- __constant__ float3 crot2;
- __constant__ float3 ctransl;
- __constant__ float3 cproj0;
- __constant__ float3 cproj1;
-
- struct ProjectOp : unary_function<float3, float3>
+ namespace project_points
{
- __device__ __forceinline__ float2 operator()(const float3& p) const
+ __constant__ float3 crot0;
+ __constant__ float3 crot1;
+ __constant__ float3 crot2;
+ __constant__ float3 ctransl;
+ __constant__ float3 cproj0;
+ __constant__ float3 cproj1;
+
+ struct ProjectOp : unary_function<float3, float3>
{
- // Rotate and translate in 3D
- float3 t = make_float3(
- crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
- crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
- crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
- // Project on 2D plane
- return make_float2(
- (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
- (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
+ __device__ __forceinline__ float2 operator()(const float3& p) const
+ {
+ // Rotate and translate in 3D
+ float3 t = make_float3(
+ crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
+ crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
+ crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
+ // Project on 2D plane
+ return make_float2(
+ (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
+ (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
+ }
+ };
+
+ void call(const DevMem2D_<float3> src, const float* rot,
+ const float* transl, const float* proj, DevMem2D_<float2> dst,
+ cudaStream_t stream)
+ {
+ cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
+ cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
+ cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
+ cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
+ cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
+ cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
+ ::cv::gpu::device::transform(src, dst, ProjectOp(), stream);
}
- };
+ } // namespace project_points
- void call(const DevMem2D_<float3> src, const float* rot,
- const float* transl, const float* proj, DevMem2D_<float2> dst,
- cudaStream_t stream)
+ namespace solve_pnp_ransac
{
- cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
- cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
- cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
- cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
- cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
- cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
- OPENCV_DEVICE_NAMESPACE_ transform(src, dst, ProjectOp(), stream);
- }
-} // namespace project_points
-
-namespace solve_pnp_ransac
-{
- __constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
- __constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
+ __constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
+ __constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
- int maxNumIters()
- {
- return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
- }
-
- __device__ __forceinline__ float sqr(float x)
- {
- return x * x;
- }
-
- __global__ void computeHypothesisScoresKernel(
- const int num_points, const float3* object, const float2* image,
- const float dist_threshold, int* g_num_inliers)
- {
- const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
- const float3 &transl_vec = ctransl_vectors[blockIdx.x];
- int num_inliers = 0;
-
- for (int i = threadIdx.x; i < num_points; i += blockDim.x)
+ int maxNumIters()
{
- float3 p = object[i];
- p = make_float3(
- rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
- rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
- rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
- p.x /= p.z;
- p.y /= p.z;
- float2 image_p = image[i];
- if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
- ++num_inliers;
+ return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
}
- extern __shared__ float s_num_inliers[];
- s_num_inliers[threadIdx.x] = num_inliers;
- __syncthreads();
+ __device__ __forceinline__ float sqr(float x)
+ {
+ return x * x;
+ }
- for (int step = blockDim.x / 2; step > 0; step >>= 1)
+ __global__ void computeHypothesisScoresKernel(
+ const int num_points, const float3* object, const float2* image,
+ const float dist_threshold, int* g_num_inliers)
{
- if (threadIdx.x < step)
- s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
+ const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
+ const float3 &transl_vec = ctransl_vectors[blockIdx.x];
+ int num_inliers = 0;
+
+ for (int i = threadIdx.x; i < num_points; i += blockDim.x)
+ {
+ float3 p = object[i];
+ p = make_float3(
+ rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
+ rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
+ rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
+ p.x /= p.z;
+ p.y /= p.z;
+ float2 image_p = image[i];
+ if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
+ ++num_inliers;
+ }
+
+ extern __shared__ float s_num_inliers[];
+ s_num_inliers[threadIdx.x] = num_inliers;
__syncthreads();
- }
- if (threadIdx.x == 0)
- g_num_inliers[blockIdx.x] = s_num_inliers[0];
- }
+ for (int step = blockDim.x / 2; step > 0; step >>= 1)
+ {
+ if (threadIdx.x < step)
+ s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
+ __syncthreads();
+ }
- void computeHypothesisScores(
- const int num_hypotheses, const int num_points, const float* rot_matrices,
- const float3* transl_vectors, const float3* object, const float2* image,
- const float dist_threshold, int* hypothesis_scores)
- {
- cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
- cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));
+ if (threadIdx.x == 0)
+ g_num_inliers[blockIdx.x] = s_num_inliers[0];
+ }
- dim3 threads(256);
- dim3 grid(num_hypotheses);
- int smem_size = threads.x * sizeof(float);
+ void computeHypothesisScores(
+ const int num_hypotheses, const int num_points, const float* rot_matrices,
+ const float3* transl_vectors, const float3* object, const float2* image,
+ const float dist_threshold, int* hypothesis_scores)
+ {
+ cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
+ cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));
- computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
- num_points, object, image, dist_threshold, hypothesis_scores);
- cudaSafeCall( cudaGetLastError() );
+ dim3 threads(256);
+ dim3 grid(num_hypotheses);
+ int smem_size = threads.x * sizeof(float);
- cudaSafeCall( cudaDeviceSynchronize() );
- }
-} // namespace solvepnp_ransac
+ computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
+ num_points, object, image, dist_threshold, hypothesis_scores);
+ cudaSafeCall( cudaGetLastError() );
-END_OPENCV_DEVICE_NAMESPACE
+ cudaSafeCall( cudaDeviceSynchronize() );
+ }
+ } // namespace solvepnp_ransac
+}}} // namespace cv { namespace gpu { namespace device
#include <algorithm>\r
#include "internal_shared.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace canny {\r
-\r
-__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)\r
+namespace cv { namespace gpu { namespace device \r
{\r
- __shared__ int smem[16][18];\r
-\r
- const int j = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int i = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- if (i < rows)\r
+ namespace canny \r
{\r
- smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];\r
- if (threadIdx.x == 0)\r
+ __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)\r
{\r
- smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];\r
- smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];\r
+ __shared__ int smem[16][18];\r
+\r
+ const int j = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int i = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if (i < rows)\r
+ {\r
+ smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];\r
+ if (threadIdx.x == 0)\r
+ {\r
+ smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];\r
+ smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];\r
+ }\r
+ __syncthreads();\r
+\r
+ if (j < cols)\r
+ {\r
+ dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];\r
+ dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];\r
+ }\r
+ }\r
}\r
- __syncthreads();\r
\r
- if (j < cols)\r
+ void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)\r
{\r
- dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];\r
- dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];\r
+ dim3 block(16, 16, 1);\r
+ dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
+\r
+ calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ cudaSafeCall(cudaThreadSynchronize());\r
}\r
- }\r
-}\r
\r
-void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)\r
-{\r
- dim3 block(16, 16, 1);\r
- dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
+ struct L1\r
+ {\r
+ static __device__ __forceinline__ float calc(int x, int y)\r
+ {\r
+ return ::abs(x) + ::abs(y);\r
+ }\r
+ };\r
+ struct L2\r
+ {\r
+ static __device__ __forceinline__ float calc(int x, int y)\r
+ {\r
+ return ::sqrtf(x * x + y * y);\r
+ }\r
+ };\r
\r
- calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);\r
- cudaSafeCall( cudaGetLastError() );\r
+ template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, \r
+ PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)\r
+ {\r
+ __shared__ int sdx[18][16];\r
+ __shared__ int sdy[18][16];\r
\r
- cudaSafeCall(cudaThreadSynchronize());\r
-}\r
+ const int j = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int i = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
-struct L1\r
-{\r
- static __device__ __forceinline__ float calc(int x, int y)\r
- {\r
- return ::abs(x) + ::abs(y);\r
- }\r
-};\r
-struct L2\r
-{\r
- static __device__ __forceinline__ float calc(int x, int y)\r
- {\r
- return ::sqrtf(x * x + y * y);\r
- }\r
-};\r
+ if (j < cols)\r
+ {\r
+ sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];\r
+ sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];\r
+ if (threadIdx.y == 0)\r
+ {\r
+ sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];\r
+ sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];\r
\r
-template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, \r
- PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)\r
-{\r
- __shared__ int sdx[18][16];\r
- __shared__ int sdy[18][16];\r
+ sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];\r
+ sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];\r
+ }\r
+ __syncthreads();\r
\r
- const int j = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int i = blockIdx.y * blockDim.y + threadIdx.y;\r
+ if (i < rows)\r
+ {\r
+ int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];\r
+ int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];\r
\r
- if (j < cols)\r
- {\r
- sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];\r
- sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];\r
- if (threadIdx.y == 0)\r
- {\r
- sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];\r
- sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];\r
+ dx.ptr(i)[j] = x;\r
+ dy.ptr(i)[j] = y;\r
\r
- sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];\r
- sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];\r
+ mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);\r
+ }\r
+ }\r
}\r
- __syncthreads();\r
\r
- if (i < rows)\r
+ void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)\r
{\r
- int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];\r
- int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];\r
+ dim3 block(16, 16, 1);\r
+ dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
\r
- dx.ptr(i)[j] = x;\r
- dy.ptr(i)[j] = y;\r
+ if (L2Grad)\r
+ calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);\r
+ else\r
+ calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);\r
\r
- mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ cudaSafeCall(cudaThreadSynchronize());\r
}\r
- }\r
-}\r
\r
-void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)\r
-{\r
- dim3 block(16, 16, 1);\r
- dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
+ template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)\r
+ {\r
+ const int j = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int i = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- if (L2Grad)\r
- calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);\r
- else\r
- calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);\r
+ if (i < rows && j < cols)\r
+ mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);\r
+ }\r
\r
- cudaSafeCall( cudaGetLastError() );\r
+ void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)\r
+ {\r
+ dim3 block(16, 16, 1);\r
+ dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
\r
- cudaSafeCall(cudaThreadSynchronize());\r
-}\r
+ if (L2Grad)\r
+ calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);\r
+ else\r
+ calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);\r
\r
-template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)\r
-{\r
- const int j = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int i = blockIdx.y * blockDim.y + threadIdx.y;\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (i < rows && j < cols)\r
- mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);\r
-}\r
+ cudaSafeCall(cudaThreadSynchronize());\r
+ }\r
\r
-void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)\r
-{\r
- dim3 block(16, 16, 1);\r
- dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
+ //////////////////////////////////////////////////////////////////////////////////////////\r
+ \r
+ #define CANNY_SHIFT 15\r
+ #define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)\r
\r
- if (L2Grad)\r
- calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);\r
- else\r
- calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);\r
+ __global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)\r
+ {\r
+ __shared__ float smem[18][18];\r
\r
- cudaSafeCall( cudaGetLastError() );\r
+ const int j = blockIdx.x * 16 + threadIdx.x;\r
+ const int i = blockIdx.y * 16 + threadIdx.y;\r
\r
- cudaSafeCall(cudaThreadSynchronize());\r
-}\r
+ const int tid = threadIdx.y * 16 + threadIdx.x;\r
+ const int lx = tid % 18;\r
+ const int ly = tid / 18;\r
\r
-//////////////////////////////////////////////////////////////////////////////////////////\r
- \r
-#define CANNY_SHIFT 15\r
-#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)\r
+ if (ly < 14)\r
+ smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];\r
\r
-__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)\r
-{\r
- __shared__ float smem[18][18];\r
+ if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)\r
+ smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];\r
\r
- const int j = blockIdx.x * 16 + threadIdx.x;\r
- const int i = blockIdx.y * 16 + threadIdx.y;\r
+ __syncthreads();\r
\r
- const int tid = threadIdx.y * 16 + threadIdx.x;\r
- const int lx = tid % 18;\r
- const int ly = tid / 18;\r
+ if (i < rows && j < cols)\r
+ {\r
+ int x = dx.ptr(i)[j];\r
+ int y = dy.ptr(i)[j];\r
+ const int s = (x ^ y) < 0 ? -1 : 1;\r
+ const float m = smem[threadIdx.y + 1][threadIdx.x + 1];\r
\r
- if (ly < 14)\r
- smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];\r
+ x = ::abs(x);\r
+ y = ::abs(y);\r
\r
- if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)\r
- smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];\r
+ // 0 - the pixel can not belong to an edge\r
+ // 1 - the pixel might belong to an edge\r
+ // 2 - the pixel does belong to an edge\r
+ int edge_type = 0;\r
\r
- __syncthreads();\r
+ if (m > low_thresh)\r
+ {\r
+ const int tg22x = x * TG22;\r
+ const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);\r
\r
- if (i < rows && j < cols)\r
- {\r
- int x = dx.ptr(i)[j];\r
- int y = dy.ptr(i)[j];\r
- const int s = (x ^ y) < 0 ? -1 : 1;\r
- const float m = smem[threadIdx.y + 1][threadIdx.x + 1];\r
+ y <<= CANNY_SHIFT;\r
\r
- x = ::abs(x);\r
- y = ::abs(y);\r
+ if (y < tg22x)\r
+ {\r
+ if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])\r
+ edge_type = 1 + (int)(m > high_thresh);\r
+ }\r
+ else if( y > tg67x )\r
+ {\r
+ if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])\r
+ edge_type = 1 + (int)(m > high_thresh);\r
+ }\r
+ else\r
+ {\r
+ if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])\r
+ edge_type = 1 + (int)(m > high_thresh);\r
+ }\r
+ }\r
+ \r
+ map.ptr(i + 1)[j + 1] = edge_type;\r
+ }\r
+ }\r
\r
- // 0 - the pixel can not belong to an edge\r
- // 1 - the pixel might belong to an edge\r
- // 2 - the pixel does belong to an edge\r
- int edge_type = 0;\r
+ #undef CANNY_SHIFT\r
+ #undef TG22\r
\r
- if (m > low_thresh)\r
+ void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)\r
{\r
- const int tg22x = x * TG22;\r
- const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);\r
+ dim3 block(16, 16, 1);\r
+ dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
\r
- y <<= CANNY_SHIFT;\r
+ calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (y < tg22x)\r
- {\r
- if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])\r
- edge_type = 1 + (int)(m > high_thresh);\r
- }\r
- else if( y > tg67x )\r
- {\r
- if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])\r
- edge_type = 1 + (int)(m > high_thresh);\r
- }\r
- else\r
- {\r
- if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])\r
- edge_type = 1 + (int)(m > high_thresh);\r
- }\r
+ cudaSafeCall(cudaThreadSynchronize());\r
}\r
- \r
- map.ptr(i + 1)[j + 1] = edge_type;\r
- }\r
-}\r
\r
-#undef CANNY_SHIFT\r
-#undef TG22\r
-\r
-void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)\r
-{\r
- dim3 block(16, 16, 1);\r
- dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
+ //////////////////////////////////////////////////////////////////////////////////////////\r
\r
- calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);\r
- cudaSafeCall( cudaGetLastError() );\r
+ __device__ unsigned int counter = 0;\r
\r
- cudaSafeCall(cudaThreadSynchronize());\r
-}\r
-\r
-//////////////////////////////////////////////////////////////////////////////////////////\r
+ __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)\r
+ {\r
+ #if __CUDA_ARCH__ >= 120\r
\r
-__device__ unsigned int counter = 0;\r
+ __shared__ int smem[18][18];\r
\r
-__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)\r
-{\r
- #if __CUDA_ARCH__ >= 120\r
+ const int j = blockIdx.x * 16 + threadIdx.x;\r
+ const int i = blockIdx.y * 16 + threadIdx.y;\r
\r
- __shared__ int smem[18][18];\r
+ const int tid = threadIdx.y * 16 + threadIdx.x;\r
+ const int lx = tid % 18;\r
+ const int ly = tid / 18; \r
\r
- const int j = blockIdx.x * 16 + threadIdx.x;\r
- const int i = blockIdx.y * 16 + threadIdx.y;\r
+ if (ly < 14)\r
+ smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];\r
\r
- const int tid = threadIdx.y * 16 + threadIdx.x;\r
- const int lx = tid % 18;\r
- const int ly = tid / 18; \r
+ if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)\r
+ smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];\r
\r
- if (ly < 14)\r
- smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];\r
+ __syncthreads();\r
\r
- if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)\r
- smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];\r
+ if (i < rows && j < cols)\r
+ {\r
+ int n;\r
\r
- __syncthreads();\r
+ #pragma unroll\r
+ for (int k = 0; k < 16; ++k)\r
+ {\r
+ n = 0;\r
\r
- if (i < rows && j < cols)\r
- {\r
- int n;\r
+ if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)\r
+ {\r
+ n += smem[threadIdx.y ][threadIdx.x ] == 2;\r
+ n += smem[threadIdx.y ][threadIdx.x + 1] == 2;\r
+ n += smem[threadIdx.y ][threadIdx.x + 2] == 2;\r
+ \r
+ n += smem[threadIdx.y + 1][threadIdx.x ] == 2;\r
+ n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;\r
+ \r
+ n += smem[threadIdx.y + 2][threadIdx.x ] == 2;\r
+ n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;\r
+ n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;\r
+ }\r
\r
- #pragma unroll\r
- for (int k = 0; k < 16; ++k)\r
- {\r
- n = 0;\r
+ if (n > 0)\r
+ smem[threadIdx.y + 1][threadIdx.x + 1] = 2;\r
+ }\r
\r
- if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)\r
- {\r
- n += smem[threadIdx.y ][threadIdx.x ] == 2;\r
- n += smem[threadIdx.y ][threadIdx.x + 1] == 2;\r
- n += smem[threadIdx.y ][threadIdx.x + 2] == 2;\r
- \r
- n += smem[threadIdx.y + 1][threadIdx.x ] == 2;\r
- n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;\r
- \r
- n += smem[threadIdx.y + 2][threadIdx.x ] == 2;\r
- n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;\r
- n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;\r
- }\r
+ const int e = smem[threadIdx.y + 1][threadIdx.x + 1];\r
\r
- if (n > 0)\r
- smem[threadIdx.y + 1][threadIdx.x + 1] = 2;\r
- }\r
+ map.ptr(i + 1)[j + 1] = e;\r
\r
- const int e = smem[threadIdx.y + 1][threadIdx.x + 1];\r
+ n = 0;\r
\r
- map.ptr(i + 1)[j + 1] = e;\r
+ if (e == 2)\r
+ {\r
+ n += smem[threadIdx.y ][threadIdx.x ] == 1;\r
+ n += smem[threadIdx.y ][threadIdx.x + 1] == 1;\r
+ n += smem[threadIdx.y ][threadIdx.x + 2] == 1;\r
+ \r
+ n += smem[threadIdx.y + 1][threadIdx.x ] == 1;\r
+ n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;\r
+ \r
+ n += smem[threadIdx.y + 2][threadIdx.x ] == 1;\r
+ n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;\r
+ n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;\r
+ }\r
\r
- n = 0;\r
+ if (n > 0)\r
+ {\r
+ const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));\r
+ st[ind] = make_ushort2(j + 1, i + 1);\r
+ }\r
+ }\r
\r
- if (e == 2)\r
- {\r
- n += smem[threadIdx.y ][threadIdx.x ] == 1;\r
- n += smem[threadIdx.y ][threadIdx.x + 1] == 1;\r
- n += smem[threadIdx.y ][threadIdx.x + 2] == 1;\r
- \r
- n += smem[threadIdx.y + 1][threadIdx.x ] == 1;\r
- n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;\r
- \r
- n += smem[threadIdx.y + 2][threadIdx.x ] == 1;\r
- n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;\r
- n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;\r
+ #endif\r
}\r
\r
- if (n > 0)\r
+ void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)\r
{\r
- const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));\r
- st[ind] = make_ushort2(j + 1, i + 1);\r
- }\r
- }\r
-\r
- #endif\r
-}\r
-\r
-void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)\r
-{\r
- dim3 block(16, 16, 1);\r
- dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
-\r
- edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- cudaSafeCall(cudaThreadSynchronize());\r
-}\r
-\r
-__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};\r
-__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};\r
-\r
-__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)\r
-{\r
- #if __CUDA_ARCH__ >= 120\r
+ dim3 block(16, 16, 1);\r
+ dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
\r
- const int stack_size = 512;\r
- \r
- __shared__ unsigned int s_counter;\r
- __shared__ unsigned int s_ind;\r
- __shared__ ushort2 s_st[stack_size];\r
+ edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (threadIdx.x == 0)\r
- s_counter = 0;\r
- __syncthreads();\r
+ cudaSafeCall(cudaThreadSynchronize());\r
+ }\r
\r
- int ind = blockIdx.y * gridDim.x + blockIdx.x;\r
+ __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};\r
+ __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};\r
\r
- if (ind < count)\r
- {\r
- ushort2 pos = st1[ind];\r
-\r
- if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)\r
+ __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)\r
{\r
- if (threadIdx.x < 8)\r
- {\r
- pos.x += c_dx[threadIdx.x];\r
- pos.y += c_dy[threadIdx.x];\r
+ #if __CUDA_ARCH__ >= 120\r
\r
- if (map.ptr(pos.y)[pos.x] == 1)\r
- {\r
- map.ptr(pos.y)[pos.x] = 2;\r
-\r
- ind = atomicInc(&s_counter, (unsigned int)(-1));\r
+ const int stack_size = 512;\r
+ \r
+ __shared__ unsigned int s_counter;\r
+ __shared__ unsigned int s_ind;\r
+ __shared__ ushort2 s_st[stack_size];\r
\r
- s_st[ind] = pos;\r
- }\r
- }\r
+ if (threadIdx.x == 0)\r
+ s_counter = 0;\r
__syncthreads();\r
\r
- while (s_counter > 0 && s_counter <= stack_size - blockDim.x)\r
- {\r
- const int subTaskIdx = threadIdx.x >> 3;\r
- const int portion = ::min(s_counter, blockDim.x >> 3);\r
+ int ind = blockIdx.y * gridDim.x + blockIdx.x;\r
\r
- pos.x = pos.y = 0;\r
+ if (ind < count)\r
+ {\r
+ ushort2 pos = st1[ind];\r
\r
- if (subTaskIdx < portion)\r
- pos = s_st[s_counter - 1 - subTaskIdx];\r
- __syncthreads();\r
- \r
- if (threadIdx.x == 0)\r
- s_counter -= portion;\r
- __syncthreads();\r
- \r
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)\r
{\r
- pos.x += c_dx[threadIdx.x & 7];\r
- pos.y += c_dy[threadIdx.x & 7];\r
-\r
- if (map.ptr(pos.y)[pos.x] == 1)\r
+ if (threadIdx.x < 8)\r
{\r
- map.ptr(pos.y)[pos.x] = 2;\r
+ pos.x += c_dx[threadIdx.x];\r
+ pos.y += c_dy[threadIdx.x];\r
\r
- ind = atomicInc(&s_counter, (unsigned int)(-1));\r
+ if (map.ptr(pos.y)[pos.x] == 1)\r
+ {\r
+ map.ptr(pos.y)[pos.x] = 2;\r
\r
- s_st[ind] = pos;\r
- }\r
- }\r
- __syncthreads();\r
- }\r
+ ind = atomicInc(&s_counter, (unsigned int)(-1));\r
\r
- if (s_counter > 0)\r
- {\r
- if (threadIdx.x == 0)\r
- {\r
- ind = atomicAdd(&counter, s_counter);\r
- s_ind = ind - s_counter;\r
- }\r
- __syncthreads();\r
+ s_st[ind] = pos;\r
+ }\r
+ }\r
+ __syncthreads();\r
\r
- ind = s_ind;\r
+ while (s_counter > 0 && s_counter <= stack_size - blockDim.x)\r
+ {\r
+ const int subTaskIdx = threadIdx.x >> 3;\r
+ const int portion = ::min(s_counter, blockDim.x >> 3);\r
+\r
+ pos.x = pos.y = 0;\r
+\r
+ if (subTaskIdx < portion)\r
+ pos = s_st[s_counter - 1 - subTaskIdx];\r
+ __syncthreads();\r
+ \r
+ if (threadIdx.x == 0)\r
+ s_counter -= portion;\r
+ __syncthreads();\r
+ \r
+ if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)\r
+ {\r
+ pos.x += c_dx[threadIdx.x & 7];\r
+ pos.y += c_dy[threadIdx.x & 7];\r
+\r
+ if (map.ptr(pos.y)[pos.x] == 1)\r
+ {\r
+ map.ptr(pos.y)[pos.x] = 2;\r
+\r
+ ind = atomicInc(&s_counter, (unsigned int)(-1));\r
+\r
+ s_st[ind] = pos;\r
+ }\r
+ }\r
+ __syncthreads();\r
+ }\r
\r
- for (int i = threadIdx.x; i < s_counter; i += blockDim.x)\r
- {\r
- st2[ind + i] = s_st[i];\r
+ if (s_counter > 0)\r
+ {\r
+ if (threadIdx.x == 0)\r
+ {\r
+ ind = atomicAdd(&counter, s_counter);\r
+ s_ind = ind - s_counter;\r
+ }\r
+ __syncthreads();\r
+\r
+ ind = s_ind;\r
+\r
+ for (int i = threadIdx.x; i < s_counter; i += blockDim.x)\r
+ {\r
+ st2[ind + i] = s_st[i];\r
+ }\r
+ }\r
}\r
}\r
- }\r
- }\r
-\r
- #endif\r
-}\r
-\r
-void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)\r
-{\r
- void* counter_ptr;\r
- cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );\r
- \r
- unsigned int count;\r
- cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );\r
\r
- while (count > 0)\r
- {\r
- cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );\r
+ #endif\r
+ }\r
\r
- dim3 block(128, 1, 1);\r
- dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);\r
- edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);\r
- cudaSafeCall( cudaGetLastError() );\r
+ void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)\r
+ {\r
+ void* counter_ptr;\r
+ cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );\r
+ \r
+ unsigned int count;\r
+ cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );\r
\r
- cudaSafeCall(cudaThreadSynchronize());\r
+ while (count > 0)\r
+ {\r
+ cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );\r
\r
- cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );\r
+ dim3 block(128, 1, 1);\r
+ dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);\r
+ edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- std::swap(st1, st2);\r
- }\r
-}\r
+ cudaSafeCall(cudaThreadSynchronize());\r
\r
-__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)\r
-{\r
- const int j = blockIdx.x * 16 + threadIdx.x;\r
- const int i = blockIdx.y * 16 + threadIdx.y;\r
+ cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );\r
\r
- if (i < rows && j < cols)\r
- dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));\r
-}\r
+ std::swap(st1, st2);\r
+ }\r
+ }\r
\r
-void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)\r
-{\r
- dim3 block(16, 16, 1);\r
- dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
+ __global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)\r
+ {\r
+ const int j = blockIdx.x * 16 + threadIdx.x;\r
+ const int i = blockIdx.y * 16 + threadIdx.y;\r
\r
- getEdges<<<grid, block>>>(map, dst, rows, cols);\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (i < rows && j < cols)\r
+ dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));\r
+ }\r
\r
- cudaSafeCall(cudaThreadSynchronize());\r
-}\r
+ void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)\r
+ {\r
+ dim3 block(16, 16, 1);\r
+ dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
\r
-} // namespace canny\r
+ getEdges<<<grid, block>>>(map, dst, rows, cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ cudaSafeCall(cudaThreadSynchronize());\r
+ }\r
+ } // namespace canny\r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "opencv2/gpu/device/transform.hpp"\r
#include "opencv2/gpu/device/color.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_x = 8 };\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)\r
-{\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)\r
-{\r
- enum { smart_shift = 4 };\r
-};\r
-\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-}; \r
-\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_x = 8 };\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)\r
+ {\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)\r
+ {\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ }; \r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
\r
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \\r
void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream) \\r
traits::functor_type functor = traits::create_functor(); \\r
typedef typename traits::functor_type::argument_type src_t; \\r
typedef typename traits::functor_type::result_type dst_t; \\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, stream); \\r
+ ::cv::gpu::device::transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, stream); \\r
}\r
\r
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \\r
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \\r
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)\r
\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)\r
-\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)\r
-\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)\r
-\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)\r
-\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)\r
-\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)\r
-\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)\r
-\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)\r
-\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)\r
-\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)\r
-\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)\r
-\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)\r
-\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)\r
-\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)\r
-\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)\r
-\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)\r
-\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)\r
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR\r
-#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE\r
-#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL\r
-#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)\r
+\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)\r
+\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)\r
+\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)\r
+\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)\r
+\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)\r
+\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)\r
+\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)\r
+\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)\r
+\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)\r
+\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)\r
+\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)\r
+\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)\r
+\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)\r
+\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)\r
+\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)\r
+\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)\r
+ OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR\r
+ #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE\r
+ #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL\r
+ #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F\r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "opencv2/gpu/device/limits.hpp"\r
#include "opencv2/gpu/device/border_interpolate.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-#define MAX_KERNEL_SIZE 16\r
-#define BLOCK_DIM_X 16\r
-#define BLOCK_DIM_Y 4\r
-#define RESULT_STEPS 8\r
-#define HALO_STEPS 1\r
-\r
-namespace column_filter {\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ #define MAX_KERNEL_SIZE 16\r
+ #define BLOCK_DIM_X 16\r
+ #define BLOCK_DIM_Y 4\r
+ #define RESULT_STEPS 8\r
+ #define HALO_STEPS 1\r
\r
-__constant__ float c_kernel[MAX_KERNEL_SIZE];\r
+ namespace column_filter \r
+ {\r
+ __constant__ float c_kernel[MAX_KERNEL_SIZE];\r
\r
-void loadKernel(const float kernel[], int ksize)\r
-{\r
- cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );\r
-}\r
+ void loadKernel(const float kernel[], int ksize)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );\r
+ }\r
\r
-template <int KERNEL_SIZE, typename T, typename D, typename B>\r
-__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)\r
-{\r
- typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;\r
+ template <int KERNEL_SIZE, typename T, typename D, typename B>\r
+ __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)\r
+ {\r
+ typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;\r
\r
- __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1];\r
+ __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1];\r
\r
- //Offset to the upper halo edge\r
- const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;\r
- const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y;\r
+ //Offset to the upper halo edge\r
+ const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;\r
+ const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y;\r
\r
- if (x < src.cols)\r
- {\r
- const T* src_col = src.ptr() + x;\r
+ if (x < src.cols)\r
+ {\r
+ const T* src_col = src.ptr() + x;\r
\r
- //Main data\r
- #pragma unroll\r
- for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)\r
- smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);\r
+ //Main data\r
+ #pragma unroll\r
+ for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)\r
+ smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);\r
\r
- //Upper halo\r
- #pragma unroll\r
- for(int i = 0; i < HALO_STEPS; ++i)\r
- smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step);\r
+ //Upper halo\r
+ #pragma unroll\r
+ for(int i = 0; i < HALO_STEPS; ++i)\r
+ smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step);\r
\r
- //Lower halo\r
- #pragma unroll\r
- for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)\r
- smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]= b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);\r
+ //Lower halo\r
+ #pragma unroll\r
+ for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)\r
+ smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]= b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);\r
\r
- __syncthreads();\r
+ __syncthreads();\r
\r
- #pragma unroll\r
- for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)\r
- {\r
- sum_t sum = VecTraits<sum_t>::all(0);\r
+ #pragma unroll\r
+ for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)\r
+ {\r
+ sum_t sum = VecTraits<sum_t>::all(0);\r
\r
- #pragma unroll\r
- for(int j = 0; j < KERNEL_SIZE; ++j)\r
- sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j];\r
+ #pragma unroll\r
+ for(int j = 0; j < KERNEL_SIZE; ++j)\r
+ sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j];\r
\r
- int dstY = y + i * BLOCK_DIM_Y;\r
+ int dstY = y + i * BLOCK_DIM_Y;\r
\r
- if (dstY < src.rows)\r
- dst.ptr(dstY)[x] = saturate_cast<D>(sum);\r
+ if (dstY < src.rows)\r
+ dst.ptr(dstY)[x] = saturate_cast<D>(sum);\r
+ }\r
+ }\r
}\r
- }\r
-}\r
\r
-template <int ksize, typename T, typename D, template<typename> class B>\r
-void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)\r
-{ \r
- const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);\r
- const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y));\r
+ template <int ksize, typename T, typename D, template<typename> class B>\r
+ void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)\r
+ { \r
+ const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);\r
+ const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y));\r
\r
- B<T> b(src.rows);\r
+ B<T> b(src.rows);\r
\r
- linearColumnFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);\r
- cudaSafeCall( cudaGetLastError() );\r
+ linearColumnFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-template <typename T, typename D>\r
-void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)\r
-{\r
- typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);\r
- static const caller_t callers[5][17] = \r
- {\r
- {\r
- 0, \r
- linearColumnFilter_caller<1 , T, D, BrdColReflect101>, \r
- linearColumnFilter_caller<2 , T, D, BrdColReflect101>,\r
- linearColumnFilter_caller<3 , T, D, BrdColReflect101>, \r
- linearColumnFilter_caller<4 , T, D, BrdColReflect101>, \r
- linearColumnFilter_caller<5 , T, D, BrdColReflect101>, \r
- linearColumnFilter_caller<6 , T, D, BrdColReflect101>, \r
- linearColumnFilter_caller<7 , T, D, BrdColReflect101>, \r
- linearColumnFilter_caller<8 , T, D, BrdColReflect101>, \r
- linearColumnFilter_caller<9 , T, D, BrdColReflect101>, \r
- linearColumnFilter_caller<10, T, D, BrdColReflect101>, \r
- linearColumnFilter_caller<11, T, D, BrdColReflect101>, \r
- linearColumnFilter_caller<12, T, D, BrdColReflect101>, \r
- linearColumnFilter_caller<13, T, D, BrdColReflect101>, \r
- linearColumnFilter_caller<14, T, D, BrdColReflect101>, \r
- linearColumnFilter_caller<15, T, D, BrdColReflect101>, \r
- linearColumnFilter_caller<16, T, D, BrdColReflect101> \r
- },\r
- {\r
- 0, \r
- linearColumnFilter_caller<1 , T, D, BrdColReplicate>, \r
- linearColumnFilter_caller<2 , T, D, BrdColReplicate>,\r
- linearColumnFilter_caller<3 , T, D, BrdColReplicate>, \r
- linearColumnFilter_caller<4 , T, D, BrdColReplicate>, \r
- linearColumnFilter_caller<5 , T, D, BrdColReplicate>, \r
- linearColumnFilter_caller<6 , T, D, BrdColReplicate>, \r
- linearColumnFilter_caller<7 , T, D, BrdColReplicate>, \r
- linearColumnFilter_caller<8 , T, D, BrdColReplicate>, \r
- linearColumnFilter_caller<9 , T, D, BrdColReplicate>, \r
- linearColumnFilter_caller<10, T, D, BrdColReplicate>, \r
- linearColumnFilter_caller<11, T, D, BrdColReplicate>, \r
- linearColumnFilter_caller<12, T, D, BrdColReplicate>, \r
- linearColumnFilter_caller<13, T, D, BrdColReplicate>, \r
- linearColumnFilter_caller<14, T, D, BrdColReplicate>, \r
- linearColumnFilter_caller<15, T, D, BrdColReplicate>, \r
- linearColumnFilter_caller<16, T, D, BrdColReplicate>\r
- },\r
+ template <typename T, typename D>\r
+ void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)\r
{\r
- 0, \r
- linearColumnFilter_caller<1 , T, D, BrdColConstant>, \r
- linearColumnFilter_caller<2 , T, D, BrdColConstant>,\r
- linearColumnFilter_caller<3 , T, D, BrdColConstant>, \r
- linearColumnFilter_caller<4 , T, D, BrdColConstant>, \r
- linearColumnFilter_caller<5 , T, D, BrdColConstant>, \r
- linearColumnFilter_caller<6 , T, D, BrdColConstant>, \r
- linearColumnFilter_caller<7 , T, D, BrdColConstant>, \r
- linearColumnFilter_caller<8 , T, D, BrdColConstant>, \r
- linearColumnFilter_caller<9 , T, D, BrdColConstant>, \r
- linearColumnFilter_caller<10, T, D, BrdColConstant>, \r
- linearColumnFilter_caller<11, T, D, BrdColConstant>, \r
- linearColumnFilter_caller<12, T, D, BrdColConstant>, \r
- linearColumnFilter_caller<13, T, D, BrdColConstant>, \r
- linearColumnFilter_caller<14, T, D, BrdColConstant>, \r
- linearColumnFilter_caller<15, T, D, BrdColConstant>, \r
- linearColumnFilter_caller<16, T, D, BrdColConstant> \r
- },\r
- {\r
- 0, \r
- linearColumnFilter_caller<1 , T, D, BrdColReflect>, \r
- linearColumnFilter_caller<2 , T, D, BrdColReflect>,\r
- linearColumnFilter_caller<3 , T, D, BrdColReflect>, \r
- linearColumnFilter_caller<4 , T, D, BrdColReflect>, \r
- linearColumnFilter_caller<5 , T, D, BrdColReflect>, \r
- linearColumnFilter_caller<6 , T, D, BrdColReflect>, \r
- linearColumnFilter_caller<7 , T, D, BrdColReflect>, \r
- linearColumnFilter_caller<8 , T, D, BrdColReflect>, \r
- linearColumnFilter_caller<9 , T, D, BrdColReflect>, \r
- linearColumnFilter_caller<10, T, D, BrdColReflect>, \r
- linearColumnFilter_caller<11, T, D, BrdColReflect>, \r
- linearColumnFilter_caller<12, T, D, BrdColReflect>, \r
- linearColumnFilter_caller<13, T, D, BrdColReflect>, \r
- linearColumnFilter_caller<14, T, D, BrdColReflect>, \r
- linearColumnFilter_caller<15, T, D, BrdColReflect>, \r
- linearColumnFilter_caller<16, T, D, BrdColReflect>\r
- },\r
- {\r
- 0, \r
- linearColumnFilter_caller<1 , T, D, BrdColWrap>, \r
- linearColumnFilter_caller<2 , T, D, BrdColWrap>,\r
- linearColumnFilter_caller<3 , T, D, BrdColWrap>, \r
- linearColumnFilter_caller<4 , T, D, BrdColWrap>, \r
- linearColumnFilter_caller<5 , T, D, BrdColWrap>, \r
- linearColumnFilter_caller<6 , T, D, BrdColWrap>, \r
- linearColumnFilter_caller<7 , T, D, BrdColWrap>, \r
- linearColumnFilter_caller<8 , T, D, BrdColWrap>, \r
- linearColumnFilter_caller<9 , T, D, BrdColWrap>, \r
- linearColumnFilter_caller<10, T, D, BrdColWrap>, \r
- linearColumnFilter_caller<11, T, D, BrdColWrap>, \r
- linearColumnFilter_caller<12, T, D, BrdColWrap>, \r
- linearColumnFilter_caller<13, T, D, BrdColWrap>, \r
- linearColumnFilter_caller<14, T, D, BrdColWrap>, \r
- linearColumnFilter_caller<15, T, D, BrdColWrap>, \r
- linearColumnFilter_caller<16, T, D, BrdColWrap>,\r
+ typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);\r
+ static const caller_t callers[5][17] = \r
+ {\r
+ {\r
+ 0, \r
+ linearColumnFilter_caller<1 , T, D, BrdColReflect101>, \r
+ linearColumnFilter_caller<2 , T, D, BrdColReflect101>,\r
+ linearColumnFilter_caller<3 , T, D, BrdColReflect101>, \r
+ linearColumnFilter_caller<4 , T, D, BrdColReflect101>, \r
+ linearColumnFilter_caller<5 , T, D, BrdColReflect101>, \r
+ linearColumnFilter_caller<6 , T, D, BrdColReflect101>, \r
+ linearColumnFilter_caller<7 , T, D, BrdColReflect101>, \r
+ linearColumnFilter_caller<8 , T, D, BrdColReflect101>, \r
+ linearColumnFilter_caller<9 , T, D, BrdColReflect101>, \r
+ linearColumnFilter_caller<10, T, D, BrdColReflect101>, \r
+ linearColumnFilter_caller<11, T, D, BrdColReflect101>, \r
+ linearColumnFilter_caller<12, T, D, BrdColReflect101>, \r
+ linearColumnFilter_caller<13, T, D, BrdColReflect101>, \r
+ linearColumnFilter_caller<14, T, D, BrdColReflect101>, \r
+ linearColumnFilter_caller<15, T, D, BrdColReflect101>, \r
+ linearColumnFilter_caller<16, T, D, BrdColReflect101> \r
+ },\r
+ {\r
+ 0, \r
+ linearColumnFilter_caller<1 , T, D, BrdColReplicate>, \r
+ linearColumnFilter_caller<2 , T, D, BrdColReplicate>,\r
+ linearColumnFilter_caller<3 , T, D, BrdColReplicate>, \r
+ linearColumnFilter_caller<4 , T, D, BrdColReplicate>, \r
+ linearColumnFilter_caller<5 , T, D, BrdColReplicate>, \r
+ linearColumnFilter_caller<6 , T, D, BrdColReplicate>, \r
+ linearColumnFilter_caller<7 , T, D, BrdColReplicate>, \r
+ linearColumnFilter_caller<8 , T, D, BrdColReplicate>, \r
+ linearColumnFilter_caller<9 , T, D, BrdColReplicate>, \r
+ linearColumnFilter_caller<10, T, D, BrdColReplicate>, \r
+ linearColumnFilter_caller<11, T, D, BrdColReplicate>, \r
+ linearColumnFilter_caller<12, T, D, BrdColReplicate>, \r
+ linearColumnFilter_caller<13, T, D, BrdColReplicate>, \r
+ linearColumnFilter_caller<14, T, D, BrdColReplicate>, \r
+ linearColumnFilter_caller<15, T, D, BrdColReplicate>, \r
+ linearColumnFilter_caller<16, T, D, BrdColReplicate>\r
+ },\r
+ {\r
+ 0, \r
+ linearColumnFilter_caller<1 , T, D, BrdColConstant>, \r
+ linearColumnFilter_caller<2 , T, D, BrdColConstant>,\r
+ linearColumnFilter_caller<3 , T, D, BrdColConstant>, \r
+ linearColumnFilter_caller<4 , T, D, BrdColConstant>, \r
+ linearColumnFilter_caller<5 , T, D, BrdColConstant>, \r
+ linearColumnFilter_caller<6 , T, D, BrdColConstant>, \r
+ linearColumnFilter_caller<7 , T, D, BrdColConstant>, \r
+ linearColumnFilter_caller<8 , T, D, BrdColConstant>, \r
+ linearColumnFilter_caller<9 , T, D, BrdColConstant>, \r
+ linearColumnFilter_caller<10, T, D, BrdColConstant>, \r
+ linearColumnFilter_caller<11, T, D, BrdColConstant>, \r
+ linearColumnFilter_caller<12, T, D, BrdColConstant>, \r
+ linearColumnFilter_caller<13, T, D, BrdColConstant>, \r
+ linearColumnFilter_caller<14, T, D, BrdColConstant>, \r
+ linearColumnFilter_caller<15, T, D, BrdColConstant>, \r
+ linearColumnFilter_caller<16, T, D, BrdColConstant> \r
+ },\r
+ {\r
+ 0, \r
+ linearColumnFilter_caller<1 , T, D, BrdColReflect>, \r
+ linearColumnFilter_caller<2 , T, D, BrdColReflect>,\r
+ linearColumnFilter_caller<3 , T, D, BrdColReflect>, \r
+ linearColumnFilter_caller<4 , T, D, BrdColReflect>, \r
+ linearColumnFilter_caller<5 , T, D, BrdColReflect>, \r
+ linearColumnFilter_caller<6 , T, D, BrdColReflect>, \r
+ linearColumnFilter_caller<7 , T, D, BrdColReflect>, \r
+ linearColumnFilter_caller<8 , T, D, BrdColReflect>, \r
+ linearColumnFilter_caller<9 , T, D, BrdColReflect>, \r
+ linearColumnFilter_caller<10, T, D, BrdColReflect>, \r
+ linearColumnFilter_caller<11, T, D, BrdColReflect>, \r
+ linearColumnFilter_caller<12, T, D, BrdColReflect>, \r
+ linearColumnFilter_caller<13, T, D, BrdColReflect>, \r
+ linearColumnFilter_caller<14, T, D, BrdColReflect>, \r
+ linearColumnFilter_caller<15, T, D, BrdColReflect>, \r
+ linearColumnFilter_caller<16, T, D, BrdColReflect>\r
+ },\r
+ {\r
+ 0, \r
+ linearColumnFilter_caller<1 , T, D, BrdColWrap>, \r
+ linearColumnFilter_caller<2 , T, D, BrdColWrap>,\r
+ linearColumnFilter_caller<3 , T, D, BrdColWrap>, \r
+ linearColumnFilter_caller<4 , T, D, BrdColWrap>, \r
+ linearColumnFilter_caller<5 , T, D, BrdColWrap>, \r
+ linearColumnFilter_caller<6 , T, D, BrdColWrap>, \r
+ linearColumnFilter_caller<7 , T, D, BrdColWrap>, \r
+ linearColumnFilter_caller<8 , T, D, BrdColWrap>, \r
+ linearColumnFilter_caller<9 , T, D, BrdColWrap>, \r
+ linearColumnFilter_caller<10, T, D, BrdColWrap>, \r
+ linearColumnFilter_caller<11, T, D, BrdColWrap>, \r
+ linearColumnFilter_caller<12, T, D, BrdColWrap>, \r
+ linearColumnFilter_caller<13, T, D, BrdColWrap>, \r
+ linearColumnFilter_caller<14, T, D, BrdColWrap>, \r
+ linearColumnFilter_caller<15, T, D, BrdColWrap>, \r
+ linearColumnFilter_caller<16, T, D, BrdColWrap>,\r
+ }\r
+ };\r
+ \r
+ loadKernel(kernel, ksize);\r
+\r
+ callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);\r
}\r
- };\r
- \r
- loadKernel(kernel, ksize);\r
-\r
- callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);\r
-}\r
-\r
-template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-//template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-//template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-template void linearColumnFilter_gpu<float , int >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-\r
-} // namespace column_filter\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+ template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+ //template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+ //template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+ template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+ template void linearColumnFilter_gpu<float , int >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+ template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+ } // namespace column_filter\r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "internal_shared.hpp"\r
#include "opencv2/gpu/device/border_interpolate.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc {\r
-\r
-template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)\r
-{\r
- const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
- const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
-\r
- if (x < dst.cols && y < dst.rows)\r
- dst.ptr(y)[x] = src(y - top, x - left);\r
-}\r
-\r
-template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher\r
-{\r
- static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left, \r
- const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)\r
- { \r
- dim3 block(32, 8);\r
- dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
-\r
- B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));\r
- BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);\r
-\r
- copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
- }\r
-};\r
-\r
-template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, \r
- const T* borderValue, cudaStream_t stream)\r
+namespace cv { namespace gpu { namespace device \r
{\r
- typedef typename TypeVec<T, cn>::vec_type vec_type;\r
-\r
- typedef void (*caller_t)(const DevMem2D_<vec_type>& src, const DevMem2D_<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);\r
-\r
- static const caller_t callers[5] = \r
+ namespace imgproc \r
{\r
- CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call, \r
- CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call, \r
- CopyMakeBorderDispatcher<BrdConstant, vec_type>::call, \r
- CopyMakeBorderDispatcher<BrdReflect, vec_type>::call, \r
- CopyMakeBorderDispatcher<BrdWrap, vec_type>::call \r
- };\r
-\r
- callers[borderMode](DevMem2D_<vec_type>(src), DevMem2D_<vec_type>(dst), top, left, borderValue, stream);\r
-}\r
-\r
-template void copyMakeBorder_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);\r
-//template void copyMakeBorder_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);\r
-template void copyMakeBorder_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);\r
-template void copyMakeBorder_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);\r
-\r
-//template void copyMakeBorder_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);\r
-//template void copyMakeBorder_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);\r
-//template void copyMakeBorder_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);\r
-//template void copyMakeBorder_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);\r
-\r
-template void copyMakeBorder_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);\r
-//template void copyMakeBorder_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);\r
-template void copyMakeBorder_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);\r
-template void copyMakeBorder_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);\r
-\r
-template void copyMakeBorder_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);\r
-//template void copyMakeBorder_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);\r
-template void copyMakeBorder_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);\r
-template void copyMakeBorder_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);\r
-\r
-//template void copyMakeBorder_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);\r
-//template void copyMakeBorder_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);\r
-//template void copyMakeBorder_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);\r
-//template void copyMakeBorder_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);\r
-\r
-template void copyMakeBorder_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);\r
-//template void copyMakeBorder_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);\r
-template void copyMakeBorder_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);\r
-template void copyMakeBorder_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);\r
-\r
-} // namespace imgproc\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)\r
+ {\r
+ const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+ const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+ if (x < dst.cols && y < dst.rows)\r
+ dst.ptr(y)[x] = src(y - top, x - left);\r
+ }\r
+\r
+ template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher\r
+ {\r
+ static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left, \r
+ const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)\r
+ { \r
+ dim3 block(32, 8);\r
+ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+\r
+ B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));\r
+ BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);\r
+\r
+ copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+ };\r
+\r
+ template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, \r
+ const T* borderValue, cudaStream_t stream)\r
+ {\r
+ typedef typename TypeVec<T, cn>::vec_type vec_type;\r
+\r
+ typedef void (*caller_t)(const DevMem2D_<vec_type>& src, const DevMem2D_<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);\r
+\r
+ static const caller_t callers[5] = \r
+ {\r
+ CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call, \r
+ CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call, \r
+ CopyMakeBorderDispatcher<BrdConstant, vec_type>::call, \r
+ CopyMakeBorderDispatcher<BrdReflect, vec_type>::call, \r
+ CopyMakeBorderDispatcher<BrdWrap, vec_type>::call \r
+ };\r
+\r
+ callers[borderMode](DevMem2D_<vec_type>(src), DevMem2D_<vec_type>(dst), top, left, borderValue, stream);\r
+ }\r
+\r
+ template void copyMakeBorder_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);\r
+ //template void copyMakeBorder_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);\r
+ template void copyMakeBorder_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);\r
+ template void copyMakeBorder_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);\r
+\r
+ //template void copyMakeBorder_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);\r
+ //template void copyMakeBorder_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);\r
+ //template void copyMakeBorder_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);\r
+ //template void copyMakeBorder_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);\r
+\r
+ template void copyMakeBorder_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);\r
+ //template void copyMakeBorder_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);\r
+ template void copyMakeBorder_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);\r
+ template void copyMakeBorder_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);\r
+\r
+ template void copyMakeBorder_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);\r
+ //template void copyMakeBorder_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);\r
+ template void copyMakeBorder_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);\r
+ template void copyMakeBorder_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);\r
+\r
+ //template void copyMakeBorder_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);\r
+ //template void copyMakeBorder_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);\r
+ //template void copyMakeBorder_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);\r
+ //template void copyMakeBorder_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);\r
+\r
+ template void copyMakeBorder_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);\r
+ //template void copyMakeBorder_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);\r
+ template void copyMakeBorder_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);\r
+ template void copyMakeBorder_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);\r
+ } // namespace imgproc\r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "opencv2/gpu/device/limits.hpp"\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // add\r
\r
-//////////////////////////////////////////////////////////////////////////\r
-// add\r
+ template <typename T, typename D> struct Add : binary_function<T, T, D>\r
+ {\r
+ __device__ __forceinline__ D operator ()(T a, T b) const\r
+ {\r
+ return saturate_cast<D>(a + b);\r
+ }\r
+ };\r
\r
-template <typename T, typename D> struct Add : binary_function<T, T, D>\r
-{\r
- __device__ __forceinline__ D operator ()(T a, T b) const\r
+ template <> struct TransformFunctorTraits< Add<ushort, ushort> > : DefaultTransformFunctorTraits< Add<ushort, ushort> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Add<short, short> > : DefaultTransformFunctorTraits< Add<short, short> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Add<int, int> > : DefaultTransformFunctorTraits< Add<int, int> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Add<float, float> > : DefaultTransformFunctorTraits< Add<float, float> >\r
{\r
- return saturate_cast<D>(a + b);\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ template <typename T, typename D> void add_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
+ {\r
+ if (mask.data)\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, mask, Add<T, D>(), stream);\r
+ else\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, Add<T, D>(), stream);\r
}\r
-};\r
\r
-template <> struct TransformFunctorTraits< Add<ushort, ushort> > : DefaultTransformFunctorTraits< Add<ushort, ushort> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Add<short, short> > : DefaultTransformFunctorTraits< Add<short, short> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Add<int, int> > : DefaultTransformFunctorTraits< Add<int, int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Add<float, float> > : DefaultTransformFunctorTraits< Add<float, float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
+ template void add_gpu<uchar, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<uchar, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<uchar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<uchar, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<uchar, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<uchar, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<uchar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void add_gpu<schar, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<schar, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<schar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<schar, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<schar, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<schar, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<schar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void add_gpu<ushort, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<ushort, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<ushort, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<ushort, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<ushort, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<ushort, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<ushort, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void add_gpu<short, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<short, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<short, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<short, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<short, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<short, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<short, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void add_gpu<int, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<int, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<int, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<int, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<int, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<int, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<int, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void add_gpu<float, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<float, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<float, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<float, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<float, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<float, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<float, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void add_gpu<double, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<double, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<double, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<double, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<double, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<double, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<double, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ template <typename T, typename D> struct AddScalar : unary_function<T, D>\r
+ {\r
+ AddScalar(double val_) : val(val_) {}\r
+ __device__ __forceinline__ D operator ()(T a) const\r
+ {\r
+ return saturate_cast<D>(a + val);\r
+ }\r
+ const double val;\r
+ };\r
\r
-template <typename T, typename D> void add_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
-{\r
- if (mask.data)\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, mask, Add<T, D>(), stream);\r
- else\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, Add<T, D>(), stream);\r
-}\r
-\r
-template void add_gpu<uchar, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<uchar, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<uchar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<uchar, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<uchar, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<uchar, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<uchar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void add_gpu<schar, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<schar, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<schar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<schar, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<schar, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<schar, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<schar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void add_gpu<ushort, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<ushort, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<ushort, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<ushort, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<ushort, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<ushort, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<ushort, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void add_gpu<short, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<short, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<short, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<short, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<short, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<short, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<short, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void add_gpu<int, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<int, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<int, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<int, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<int, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<int, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<int, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void add_gpu<float, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<float, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<float, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<float, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<float, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<float, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<float, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void add_gpu<double, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<double, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<double, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<double, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<double, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<double, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<double, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-template <typename T, typename D> struct AddScalar : unary_function<T, D>\r
-{\r
- AddScalar(double val_) : val(val_) {}\r
- __device__ __forceinline__ D operator ()(T a) const\r
+ template <> struct TransformFunctorTraits< AddScalar<ushort, ushort> > : DefaultTransformFunctorTraits< AddScalar<ushort, ushort> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< AddScalar<short, short> > : DefaultTransformFunctorTraits< AddScalar<short, short> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< AddScalar<int, int> > : DefaultTransformFunctorTraits< AddScalar<int, int> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< AddScalar<float, float> > : DefaultTransformFunctorTraits< AddScalar<float, float> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ template <typename T, typename D> void add_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
{\r
- return saturate_cast<D>(a + val);\r
+ cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
+ AddScalar<T, D> op(val);\r
+ if (mask.data)\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, mask, op, stream);\r
+ else\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, op, stream);\r
}\r
- const double val;\r
-};\r
\r
-template <> struct TransformFunctorTraits< AddScalar<ushort, ushort> > : DefaultTransformFunctorTraits< AddScalar<ushort, ushort> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< AddScalar<short, short> > : DefaultTransformFunctorTraits< AddScalar<short, short> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< AddScalar<int, int> > : DefaultTransformFunctorTraits< AddScalar<int, int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< AddScalar<float, float> > : DefaultTransformFunctorTraits< AddScalar<float, float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
+ template void add_gpu<uchar, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<uchar, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<uchar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<uchar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<uchar, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<uchar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<uchar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void add_gpu<schar, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<schar, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<schar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<schar, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<schar, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<schar, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<schar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void add_gpu<ushort, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<ushort, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<ushort, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<ushort, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<ushort, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<ushort, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<ushort, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void add_gpu<short, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<short, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<short, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<short, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<short, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<short, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<short, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void add_gpu<int, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<int, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<int, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<int, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<int, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<int, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<int, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void add_gpu<float, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<float, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<float, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<float, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<float, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<float, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<float, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void add_gpu<double, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<double, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<double, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<double, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<double, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void add_gpu<double, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void add_gpu<double, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // subtract\r
+\r
+ template <typename T, typename D> struct Subtract : binary_function<T, T, D>\r
+ {\r
+ __device__ __forceinline__ D operator ()(T a, T b) const\r
+ {\r
+ return saturate_cast<D>(a - b);\r
+ }\r
+ };\r
\r
-template <typename T, typename D> void add_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
-{\r
- cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
- AddScalar<T, D> op(val);\r
- if (mask.data)\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, mask, op, stream);\r
- else\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, op, stream);\r
-}\r
-\r
-template void add_gpu<uchar, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<uchar, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<uchar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<uchar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<uchar, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<uchar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<uchar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void add_gpu<schar, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<schar, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<schar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<schar, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<schar, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<schar, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<schar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void add_gpu<ushort, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<ushort, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<ushort, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<ushort, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<ushort, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<ushort, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<ushort, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void add_gpu<short, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<short, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<short, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<short, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<short, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<short, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<short, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void add_gpu<int, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<int, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<int, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<int, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<int, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<int, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<int, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void add_gpu<float, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<float, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<float, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<float, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<float, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<float, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<float, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void add_gpu<double, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<double, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<double, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<double, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<double, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void add_gpu<double, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void add_gpu<double, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//////////////////////////////////////////////////////////////////////////\r
-// subtract\r
-\r
-template <typename T, typename D> struct Subtract : binary_function<T, T, D>\r
-{\r
- __device__ __forceinline__ D operator ()(T a, T b) const\r
+ template <> struct TransformFunctorTraits< Subtract<ushort, ushort> > : DefaultTransformFunctorTraits< Subtract<ushort, ushort> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Subtract<short, short> > : DefaultTransformFunctorTraits< Subtract<short, short> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Subtract<int, int> > : DefaultTransformFunctorTraits< Subtract<int, int> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Subtract<float, float> > : DefaultTransformFunctorTraits< Subtract<float, float> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ template <typename T, typename D> void subtract_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
{\r
- return saturate_cast<D>(a - b);\r
+ if (mask.data)\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, mask, Subtract<T, D>(), stream);\r
+ else\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, Subtract<T, D>(), stream);\r
}\r
-};\r
\r
-template <> struct TransformFunctorTraits< Subtract<ushort, ushort> > : DefaultTransformFunctorTraits< Subtract<ushort, ushort> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Subtract<short, short> > : DefaultTransformFunctorTraits< Subtract<short, short> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Subtract<int, int> > : DefaultTransformFunctorTraits< Subtract<int, int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Subtract<float, float> > : DefaultTransformFunctorTraits< Subtract<float, float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
+ template void subtract_gpu<uchar, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<uchar, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<uchar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<uchar, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<uchar, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<uchar, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<uchar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void subtract_gpu<schar, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<schar, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<schar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<schar, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<schar, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<schar, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<schar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void subtract_gpu<ushort, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<ushort, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<ushort, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<ushort, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<ushort, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<ushort, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<ushort, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void subtract_gpu<short, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<short, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<short, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<short, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<short, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<short, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<short, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void subtract_gpu<int, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<int, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<int, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<int, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<int, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<int, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<int, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void subtract_gpu<float, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<float, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<float, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<float, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<float, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<float, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<float, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void subtract_gpu<double, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<double, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<double, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<double, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<double, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<double, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<double, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ template <typename T, typename D> struct SubtractScalar : unary_function<T, D>\r
+ {\r
+ SubtractScalar(double val_) : val(val_) {}\r
+ __device__ __forceinline__ D operator ()(T a) const\r
+ {\r
+ return saturate_cast<D>(a - val);\r
+ }\r
+ const double val;\r
+ };\r
\r
-template <typename T, typename D> void subtract_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
-{\r
- if (mask.data)\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, mask, Subtract<T, D>(), stream);\r
- else\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, Subtract<T, D>(), stream);\r
-}\r
-\r
-template void subtract_gpu<uchar, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<uchar, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<uchar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<uchar, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<uchar, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<uchar, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<uchar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void subtract_gpu<schar, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<schar, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<schar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<schar, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<schar, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<schar, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<schar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void subtract_gpu<ushort, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<ushort, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<ushort, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<ushort, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<ushort, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<ushort, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<ushort, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void subtract_gpu<short, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<short, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<short, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<short, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<short, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<short, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<short, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void subtract_gpu<int, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<int, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<int, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<int, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<int, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<int, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<int, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void subtract_gpu<float, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<float, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<float, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<float, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<float, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<float, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<float, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void subtract_gpu<double, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<double, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<double, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<double, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<double, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<double, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<double, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-template <typename T, typename D> struct SubtractScalar : unary_function<T, D>\r
-{\r
- SubtractScalar(double val_) : val(val_) {}\r
- __device__ __forceinline__ D operator ()(T a) const\r
+ template <> struct TransformFunctorTraits< SubtractScalar<ushort, ushort> > : DefaultTransformFunctorTraits< SubtractScalar<ushort, ushort> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< SubtractScalar<short, short> > : DefaultTransformFunctorTraits< SubtractScalar<short, short> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< SubtractScalar<int, int> > : DefaultTransformFunctorTraits< SubtractScalar<int, int> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< SubtractScalar<float, float> > : DefaultTransformFunctorTraits< SubtractScalar<float, float> >\r
{\r
- return saturate_cast<D>(a - val);\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ template <typename T, typename D> void subtract_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
+ {\r
+ cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
+ SubtractScalar<T, D> op(val);\r
+ if (mask.data)\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, mask, op, stream);\r
+ else\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, op, stream);\r
}\r
- const double val;\r
-};\r
\r
-template <> struct TransformFunctorTraits< SubtractScalar<ushort, ushort> > : DefaultTransformFunctorTraits< SubtractScalar<ushort, ushort> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< SubtractScalar<short, short> > : DefaultTransformFunctorTraits< SubtractScalar<short, short> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< SubtractScalar<int, int> > : DefaultTransformFunctorTraits< SubtractScalar<int, int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< SubtractScalar<float, float> > : DefaultTransformFunctorTraits< SubtractScalar<float, float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
+ template void subtract_gpu<uchar, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<uchar, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<uchar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<uchar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<uchar, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<uchar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<uchar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void subtract_gpu<schar, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<schar, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<schar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<schar, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<schar, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<schar, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<schar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void subtract_gpu<ushort, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<ushort, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<ushort, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<ushort, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<ushort, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<ushort, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<ushort, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void subtract_gpu<short, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<short, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<short, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<short, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<short, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<short, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<short, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void subtract_gpu<int, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<int, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<int, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<int, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<int, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<int, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<int, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void subtract_gpu<float, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<float, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<float, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<float, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<float, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<float, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<float, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //template void subtract_gpu<double, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<double, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<double, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<double, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<double, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ //template void subtract_gpu<double, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+ template void subtract_gpu<double, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // multiply\r
+\r
+ struct multiply_8uc4_32f : binary_function<uint, float, uint>\r
+ {\r
+ __device__ __forceinline__ uint operator ()(uint a, float b) const\r
+ {\r
+ uint res = 0;\r
\r
-template <typename T, typename D> void subtract_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
-{\r
- cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
- SubtractScalar<T, D> op(val);\r
- if (mask.data)\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, mask, op, stream);\r
- else\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, op, stream);\r
-}\r
-\r
-template void subtract_gpu<uchar, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<uchar, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<uchar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<uchar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<uchar, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<uchar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<uchar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void subtract_gpu<schar, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<schar, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<schar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<schar, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<schar, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<schar, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<schar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void subtract_gpu<ushort, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<ushort, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<ushort, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<ushort, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<ushort, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<ushort, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<ushort, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void subtract_gpu<short, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<short, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<short, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<short, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<short, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<short, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<short, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void subtract_gpu<int, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<int, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<int, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<int, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<int, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<int, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<int, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void subtract_gpu<float, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<float, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<float, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<float, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<float, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<float, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<float, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//template void subtract_gpu<double, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<double, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<double, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<double, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<double, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-//template void subtract_gpu<double, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-template void subtract_gpu<double, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-//////////////////////////////////////////////////////////////////////////\r
-// multiply\r
-\r
-struct multiply_8uc4_32f : binary_function<uint, float, uint>\r
-{\r
- __device__ __forceinline__ uint operator ()(uint a, float b) const\r
+ res |= (saturate_cast<uchar>((0xffu & (a )) * b) );\r
+ res |= (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8);\r
+ res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);\r
+ res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);\r
+\r
+ return res;\r
+ }\r
+ };\r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_8uc4_32f)\r
{\r
- uint res = 0;\r
+ enum { smart_block_dim_x = 8 };\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 8 };\r
+ };\r
+\r
+ void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream)\r
+ {\r
+ ::cv::gpu::device::transform(static_cast< DevMem2D_<uint> >(src1), src2, static_cast< DevMem2D_<uint> >(dst), multiply_8uc4_32f(), stream);\r
+ }\r
\r
- res |= (saturate_cast<uchar>((0xffu & (a )) * b) );\r
- res |= (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8);\r
- res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);\r
- res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);\r
+ struct multiply_16sc4_32f : binary_function<short4, float, short4>\r
+ {\r
+ __device__ __forceinline__ short4 operator ()(short4 a, float b) const\r
+ {\r
+ return make_short4(saturate_cast<short>(a.x * b), saturate_cast<short>(a.y * b),\r
+ saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));\r
+ }\r
+ };\r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_16sc4_32f)\r
+ {\r
+ enum { smart_block_dim_x = 8 };\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 8 };\r
+ };\r
\r
- return res;\r
+ void multiply_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream)\r
+ {\r
+ ::cv::gpu::device::transform(static_cast< DevMem2D_<short4> >(src1), src2, static_cast< DevMem2D_<short4> >(dst), multiply_16sc4_32f(), stream);\r
}\r
-};\r
\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(multiply_8uc4_32f)\r
-{\r
- enum { smart_block_dim_x = 8 };\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 8 };\r
-};\r
+ template <typename T, typename D> struct Multiply : binary_function<T, T, D>\r
+ {\r
+ Multiply(double scale_) : scale(scale_) {}\r
+ __device__ __forceinline__ D operator ()(T a, T b) const\r
+ {\r
+ return saturate_cast<D>(scale * a * b);\r
+ }\r
+ const double scale;\r
+ };\r
\r
-void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream)\r
-{\r
- transform(static_cast< DevMem2D_<uint> >(src1), src2, static_cast< DevMem2D_<uint> >(dst), multiply_8uc4_32f(), stream);\r
-}\r
+ template <> struct TransformFunctorTraits< Multiply<ushort, ushort> > : DefaultTransformFunctorTraits< Multiply<ushort, ushort> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Multiply<short, short> > : DefaultTransformFunctorTraits< Multiply<short, short> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Multiply<int, int> > : DefaultTransformFunctorTraits< Multiply<int, int> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Multiply<float, float> > : DefaultTransformFunctorTraits< Multiply<float, float> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
\r
-struct multiply_16sc4_32f : binary_function<short4, float, short4>\r
-{\r
- __device__ __forceinline__ short4 operator ()(short4 a, float b) const\r
+ template <typename T, typename D> void multiply_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream)\r
{\r
- return make_short4(saturate_cast<short>(a.x * b), saturate_cast<short>(a.y * b),\r
- saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));\r
+ cudaSafeCall( cudaSetDoubleForDevice(&scale) );\r
+ Multiply<T, D> op(scale);\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, op, stream);\r
}\r
-};\r
\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(multiply_16sc4_32f)\r
-{\r
- enum { smart_block_dim_x = 8 };\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 8 };\r
-};\r
+ template void multiply_gpu<uchar, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<uchar, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<uchar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<uchar, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<uchar, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<uchar, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<uchar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void multiply_gpu<schar, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<schar, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<schar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<schar, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<schar, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<schar, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<schar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void multiply_gpu<ushort, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<ushort, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<ushort, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<ushort, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<ushort, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<ushort, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<ushort, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void multiply_gpu<short, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<short, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<short, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<short, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<short, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<short, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<short, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void multiply_gpu<int, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<int, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<int, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<int, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<int, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<int, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<int, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void multiply_gpu<float, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<float, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<float, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<float, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<float, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<float, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<float, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void multiply_gpu<double, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<double, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<double, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<double, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<double, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<double, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<double, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ template <typename T, typename D> struct MultiplyScalar : unary_function<T, D>\r
+ {\r
+ MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {}\r
+ __device__ __forceinline__ D operator ()(T a) const\r
+ {\r
+ return saturate_cast<D>(scale * a * val);\r
+ }\r
+ const double val;\r
+ const double scale;\r
+ };\r
\r
-void multiply_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream)\r
-{\r
- transform(static_cast< DevMem2D_<short4> >(src1), src2, \r
- static_cast< DevMem2D_<short4> >(dst), multiply_16sc4_32f(), stream);\r
-}\r
+ template <> struct TransformFunctorTraits< MultiplyScalar<ushort, ushort> > : DefaultTransformFunctorTraits< MultiplyScalar<ushort, ushort> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< MultiplyScalar<short, short> > : DefaultTransformFunctorTraits< MultiplyScalar<short, short> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< MultiplyScalar<int, int> > : DefaultTransformFunctorTraits< MultiplyScalar<int, int> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< MultiplyScalar<float, float> > : DefaultTransformFunctorTraits< MultiplyScalar<float, float> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
\r
-template <typename T, typename D> struct Multiply : binary_function<T, T, D>\r
-{\r
- Multiply(double scale_) : scale(scale_) {}\r
- __device__ __forceinline__ D operator ()(T a, T b) const\r
+ template <typename T, typename D> void multiply_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream)\r
{\r
- return saturate_cast<D>(scale * a * b);\r
+ cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
+ cudaSafeCall( cudaSetDoubleForDevice(&scale) );\r
+ MultiplyScalar<T, D> op(val, scale);\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, op, stream);\r
}\r
- const double scale;\r
-};\r
\r
-template <> struct TransformFunctorTraits< Multiply<ushort, ushort> > : DefaultTransformFunctorTraits< Multiply<ushort, ushort> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Multiply<short, short> > : DefaultTransformFunctorTraits< Multiply<short, short> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Multiply<int, int> > : DefaultTransformFunctorTraits< Multiply<int, int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Multiply<float, float> > : DefaultTransformFunctorTraits< Multiply<float, float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
+ template void multiply_gpu<uchar, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<uchar, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<uchar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<uchar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<uchar, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<uchar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<uchar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void multiply_gpu<schar, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<schar, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<schar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<schar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<schar, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<schar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<schar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void multiply_gpu<ushort, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<ushort, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<ushort, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<ushort, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<ushort, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<ushort, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<ushort, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void multiply_gpu<short, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<short, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<short, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<short, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<short, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<short, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<short, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void multiply_gpu<int, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<int, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<int, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<int, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<int, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<int, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<int, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void multiply_gpu<float, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<float, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<float, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<float, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<float, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<float, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<float, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void multiply_gpu<double, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<double, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<double, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<double, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<double, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void multiply_gpu<double, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void multiply_gpu<double, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // divide\r
+\r
+ struct divide_8uc4_32f : binary_function<uchar4, float, uchar4>\r
+ {\r
+ __device__ __forceinline__ uchar4 operator ()(uchar4 a, float b) const\r
+ {\r
+ return b != 0 ? make_uchar4(saturate_cast<uchar>(a.x / b), saturate_cast<uchar>(a.y / b),\r
+ saturate_cast<uchar>(a.z / b), saturate_cast<uchar>(a.w / b)) \r
+ : make_uchar4(0,0,0,0);\r
+ }\r
+ };\r
\r
-template <typename T, typename D> void multiply_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream)\r
-{\r
- cudaSafeCall( cudaSetDoubleForDevice(&scale) );\r
- Multiply<T, D> op(scale);\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, op, stream);\r
-}\r
-\r
-template void multiply_gpu<uchar, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<uchar, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<uchar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<uchar, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<uchar, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<uchar, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<uchar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void multiply_gpu<schar, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<schar, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<schar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<schar, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<schar, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<schar, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<schar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void multiply_gpu<ushort, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<ushort, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<ushort, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<ushort, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<ushort, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<ushort, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<ushort, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void multiply_gpu<short, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<short, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<short, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<short, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<short, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<short, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<short, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void multiply_gpu<int, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<int, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<int, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<int, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<int, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<int, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<int, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void multiply_gpu<float, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<float, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<float, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<float, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<float, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<float, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<float, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void multiply_gpu<double, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<double, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<double, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<double, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<double, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<double, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<double, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-template <typename T, typename D> struct MultiplyScalar : unary_function<T, D>\r
-{\r
- MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {}\r
- __device__ __forceinline__ D operator ()(T a) const\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_8uc4_32f)\r
{\r
- return saturate_cast<D>(scale * a * val);\r
+ enum { smart_block_dim_x = 8 };\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 8 };\r
+ };\r
+\r
+ void divide_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream)\r
+ {\r
+ transform(static_cast< DevMem2D_<uchar4> >(src1), src2, static_cast< DevMem2D_<uchar4> >(dst), divide_8uc4_32f(), stream);\r
}\r
- const double val;\r
- const double scale;\r
-};\r
\r
-template <> struct TransformFunctorTraits< MultiplyScalar<ushort, ushort> > : DefaultTransformFunctorTraits< MultiplyScalar<ushort, ushort> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< MultiplyScalar<short, short> > : DefaultTransformFunctorTraits< MultiplyScalar<short, short> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< MultiplyScalar<int, int> > : DefaultTransformFunctorTraits< MultiplyScalar<int, int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< MultiplyScalar<float, float> > : DefaultTransformFunctorTraits< MultiplyScalar<float, float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
\r
-template <typename T, typename D> void multiply_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream)\r
-{\r
- cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
- cudaSafeCall( cudaSetDoubleForDevice(&scale) );\r
- MultiplyScalar<T, D> op(val, scale);\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, op, stream);\r
-}\r
-\r
-template void multiply_gpu<uchar, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<uchar, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<uchar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<uchar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<uchar, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<uchar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<uchar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void multiply_gpu<schar, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<schar, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<schar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<schar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<schar, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<schar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<schar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void multiply_gpu<ushort, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<ushort, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<ushort, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<ushort, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<ushort, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<ushort, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<ushort, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void multiply_gpu<short, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<short, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<short, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<short, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<short, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<short, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<short, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void multiply_gpu<int, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<int, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<int, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<int, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<int, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<int, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<int, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void multiply_gpu<float, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<float, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<float, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<float, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<float, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<float, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<float, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void multiply_gpu<double, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<double, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<double, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<double, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<double, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void multiply_gpu<double, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void multiply_gpu<double, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//////////////////////////////////////////////////////////////////////////\r
-// divide\r
-\r
-struct divide_8uc4_32f : binary_function<uchar4, float, uchar4>\r
-{\r
- __device__ __forceinline__ uchar4 operator ()(uchar4 a, float b) const\r
+ struct divide_16sc4_32f : binary_function<short4, float, short4>\r
+ {\r
+ __device__ __forceinline__ short4 operator ()(short4 a, float b) const\r
+ {\r
+ return b != 0 ? make_short4(saturate_cast<short>(a.x / b), saturate_cast<uchar>(a.y / b),\r
+ saturate_cast<short>(a.z / b), saturate_cast<uchar>(a.w / b))\r
+ : make_short4(0,0,0,0);\r
+ }\r
+ };\r
+\r
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_16sc4_32f)\r
+ {\r
+ enum { smart_block_dim_x = 8 };\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 8 };\r
+ };\r
+\r
+ void divide_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream)\r
{\r
- return b != 0 ? make_uchar4(saturate_cast<uchar>(a.x / b), saturate_cast<uchar>(a.y / b),\r
- saturate_cast<uchar>(a.z / b), saturate_cast<uchar>(a.w / b)) \r
- : make_uchar4(0,0,0,0);\r
+ transform(static_cast< DevMem2D_<short4> >(src1), src2, static_cast< DevMem2D_<short4> >(dst), divide_16sc4_32f(), stream);\r
}\r
-};\r
\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(divide_8uc4_32f)\r
-{\r
- enum { smart_block_dim_x = 8 };\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 8 };\r
-};\r
+ template <typename T, typename D> struct Divide : binary_function<T, T, D>\r
+ {\r
+ Divide(double scale_) : scale(scale_) {}\r
+ __device__ __forceinline__ D operator ()(T a, T b) const\r
+ {\r
+ return b != 0 ? saturate_cast<D>(scale * a / b) : 0;\r
+ }\r
+ const double scale;\r
+ };\r
\r
-void divide_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream)\r
-{\r
- transform(static_cast< DevMem2D_<uchar4> >(src1), src2, static_cast< DevMem2D_<uchar4> >(dst), divide_8uc4_32f(), stream);\r
-}\r
+ template <> struct TransformFunctorTraits< Divide<ushort, ushort> > : DefaultTransformFunctorTraits< Divide<ushort, ushort> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Divide<short, short> > : DefaultTransformFunctorTraits< Divide<short, short> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Divide<int, int> > : DefaultTransformFunctorTraits< Divide<int, int> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Divide<float, float> > : DefaultTransformFunctorTraits< Divide<float, float> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
\r
+ template <typename T, typename D> void divide_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream)\r
+ {\r
+ cudaSafeCall( cudaSetDoubleForDevice(&scale) );\r
+ Divide<T, D> op(scale);\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, op, stream);\r
+ }\r
\r
-struct divide_16sc4_32f : binary_function<short4, float, short4>\r
-{\r
- __device__ __forceinline__ short4 operator ()(short4 a, float b) const\r
+ template void divide_gpu<uchar, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<uchar, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<uchar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<uchar, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<uchar, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<uchar, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<uchar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<schar, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<schar, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<schar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<schar, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<schar, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<schar, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<schar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<ushort, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<ushort, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<ushort, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<ushort, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<ushort, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<ushort, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<ushort, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<short, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<short, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<short, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<short, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<short, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<short, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<short, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<int, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<int, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<int, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<int, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<int, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<int, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<int, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<float, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<float, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<float, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<float, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<float, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<float, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<float, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<double, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<double, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<double, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<double, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<double, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<double, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<double, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ template <typename T, typename D> struct DivideScalar : unary_function<T, D>\r
+ {\r
+ DivideScalar(double val_, double scale_) : val(val_), scale(scale_) {}\r
+ __device__ __forceinline__ D operator ()(T a) const\r
+ {\r
+ return saturate_cast<D>(scale * a / val);\r
+ }\r
+ const double val;\r
+ const double scale;\r
+ };\r
+\r
+ template <> struct TransformFunctorTraits< DivideScalar<ushort, ushort> > : DefaultTransformFunctorTraits< DivideScalar<ushort, ushort> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< DivideScalar<short, short> > : DefaultTransformFunctorTraits< DivideScalar<short, short> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< DivideScalar<int, int> > : DefaultTransformFunctorTraits< DivideScalar<int, int> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< DivideScalar<float, float> > : DefaultTransformFunctorTraits< DivideScalar<float, float> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ template <typename T, typename D> void divide_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream)\r
{\r
- return b != 0 ? make_short4(saturate_cast<short>(a.x / b), saturate_cast<uchar>(a.y / b),\r
- saturate_cast<short>(a.z / b), saturate_cast<uchar>(a.w / b))\r
- : make_short4(0,0,0,0);\r
+ cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
+ cudaSafeCall( cudaSetDoubleForDevice(&scale) );\r
+ DivideScalar<T, D> op(val, scale);\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, op, stream);\r
}\r
-};\r
\r
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(divide_16sc4_32f)\r
-{\r
- enum { smart_block_dim_x = 8 };\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 8 };\r
-};\r
+ template void divide_gpu<uchar, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<uchar, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<uchar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<uchar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<uchar, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<uchar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<uchar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<schar, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<schar, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<schar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<schar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<schar, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<schar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<schar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<ushort, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<ushort, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<ushort, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<ushort, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<ushort, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<ushort, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<ushort, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<short, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<short, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<short, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<short, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<short, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<short, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<short, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<int, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<int, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<int, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<int, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<int, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<int, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<int, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<float, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<float, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<float, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<float, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<float, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<float, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<float, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<double, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<double, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<double, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<double, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<double, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ //template void divide_gpu<double, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template void divide_gpu<double, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+ template <typename T, typename D> struct Reciprocal : unary_function<T, D>\r
+ {\r
+ Reciprocal(double scale_) : scale(scale_) {}\r
+ __device__ __forceinline__ D operator ()(T a) const\r
+ {\r
+ return a != 0 ? saturate_cast<D>(scale / a) : 0;\r
+ }\r
+ const double scale;\r
+ };\r
\r
-void divide_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream)\r
-{\r
- transform(static_cast< DevMem2D_<short4> >(src1), src2, static_cast< DevMem2D_<short4> >(dst), divide_16sc4_32f(), stream);\r
-}\r
+ template <> struct TransformFunctorTraits< Reciprocal<ushort, ushort> > : DefaultTransformFunctorTraits< Reciprocal<ushort, ushort> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Reciprocal<short, short> > : DefaultTransformFunctorTraits< Reciprocal<short, short> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Reciprocal<int, int> > : DefaultTransformFunctorTraits< Reciprocal<int, int> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Reciprocal<float, float> > : DefaultTransformFunctorTraits< Reciprocal<float, float> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
\r
-template <typename T, typename D> struct Divide : binary_function<T, T, D>\r
-{\r
- Divide(double scale_) : scale(scale_) {}\r
- __device__ __forceinline__ D operator ()(T a, T b) const\r
+ template <typename T, typename D> void divide_gpu(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
{\r
- return b != 0 ? saturate_cast<D>(scale * a / b) : 0;\r
+ cudaSafeCall( cudaSetDoubleForDevice(&scalar) );\r
+ Reciprocal<T, D> op(scalar);\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src2, (DevMem2D_<D>)dst, op, stream);\r
}\r
- const double scale;\r
-};\r
\r
-template <> struct TransformFunctorTraits< Divide<ushort, ushort> > : DefaultTransformFunctorTraits< Divide<ushort, ushort> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Divide<short, short> > : DefaultTransformFunctorTraits< Divide<short, short> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Divide<int, int> > : DefaultTransformFunctorTraits< Divide<int, int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Divide<float, float> > : DefaultTransformFunctorTraits< Divide<float, float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
+ template void divide_gpu<uchar, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<uchar, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<uchar, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<uchar, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<uchar, int >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<uchar, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<uchar, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<schar, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<schar, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<schar, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<schar, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<schar, int >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<schar, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<schar, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<ushort, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<ushort, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<ushort, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<ushort, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<ushort, int >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<ushort, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<ushort, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<short, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<short, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<short, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<short, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<short, int >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<short, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<short, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<int, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<int, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<int, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<int, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<int, int >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<int, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<int, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<float, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<float, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<float, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<float, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<float, int >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<float, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<float, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ //template void divide_gpu<double, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<double, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<double, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<double, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<double, int >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void divide_gpu<double, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void divide_gpu<double, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // absdiff\r
+\r
+ template <typename T> struct Absdiff : binary_function<T, T, T>\r
+ {\r
+ static __device__ __forceinline__ int abs(int a)\r
+ {\r
+ return ::abs(a);\r
+ }\r
+ static __device__ __forceinline__ float abs(float a)\r
+ {\r
+ return ::fabsf(a);\r
+ }\r
+ static __device__ __forceinline__ double abs(double a)\r
+ {\r
+ return ::fabs(a);\r
+ }\r
\r
-template <typename T, typename D> void divide_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream)\r
-{\r
- cudaSafeCall( cudaSetDoubleForDevice(&scale) );\r
- Divide<T, D> op(scale);\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, op, stream);\r
-}\r
-\r
-template void divide_gpu<uchar, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<uchar, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<uchar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<uchar, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<uchar, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<uchar, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<uchar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void divide_gpu<schar, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<schar, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<schar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<schar, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<schar, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<schar, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<schar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void divide_gpu<ushort, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<ushort, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<ushort, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<ushort, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<ushort, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<ushort, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<ushort, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void divide_gpu<short, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<short, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<short, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<short, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<short, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<short, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<short, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void divide_gpu<int, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<int, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<int, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<int, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<int, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<int, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<int, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void divide_gpu<float, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<float, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<float, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<float, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<float, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<float, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<float, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void divide_gpu<double, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<double, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<double, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<double, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<double, int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<double, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<double, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-template <typename T, typename D> struct DivideScalar : unary_function<T, D>\r
-{\r
- DivideScalar(double val_, double scale_) : val(val_), scale(scale_) {}\r
- __device__ __forceinline__ D operator ()(T a) const\r
+ __device__ __forceinline__ T operator ()(T a, T b) const\r
+ {\r
+ return saturate_cast<T>(::abs(a - b));\r
+ }\r
+ };\r
+\r
+ template <> struct TransformFunctorTraits< Absdiff<ushort> > : DefaultTransformFunctorTraits< Absdiff<ushort> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Absdiff<short> > : DefaultTransformFunctorTraits< Absdiff<short> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Absdiff<int> > : DefaultTransformFunctorTraits< Absdiff<int> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Absdiff<float> > : DefaultTransformFunctorTraits< Absdiff<float> >\r
{\r
- return saturate_cast<D>(scale * a / val);\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ template <typename T> void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
+ {\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, Absdiff<T>(), stream);\r
}\r
- const double val;\r
- const double scale;\r
-};\r
\r
-template <> struct TransformFunctorTraits< DivideScalar<ushort, ushort> > : DefaultTransformFunctorTraits< DivideScalar<ushort, ushort> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< DivideScalar<short, short> > : DefaultTransformFunctorTraits< DivideScalar<short, short> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< DivideScalar<int, int> > : DefaultTransformFunctorTraits< DivideScalar<int, int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< DivideScalar<float, float> > : DefaultTransformFunctorTraits< DivideScalar<float, float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
+ //template void absdiff_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void absdiff_gpu<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void absdiff_gpu<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void absdiff_gpu<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void absdiff_gpu<int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ //template void absdiff_gpu<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void absdiff_gpu<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
\r
-template <typename T, typename D> void divide_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream)\r
-{\r
- cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
- cudaSafeCall( cudaSetDoubleForDevice(&scale) );\r
- DivideScalar<T, D> op(val, scale);\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, op, stream);\r
-}\r
-\r
-template void divide_gpu<uchar, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<uchar, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<uchar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<uchar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<uchar, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<uchar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<uchar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void divide_gpu<schar, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<schar, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<schar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<schar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<schar, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<schar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<schar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void divide_gpu<ushort, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<ushort, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<ushort, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<ushort, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<ushort, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<ushort, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<ushort, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void divide_gpu<short, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<short, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<short, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<short, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<short, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<short, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<short, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void divide_gpu<int, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<int, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<int, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<int, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<int, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<int, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<int, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void divide_gpu<float, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<float, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<float, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<float, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<float, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<float, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<float, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-//template void divide_gpu<double, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<double, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<double, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<double, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<double, int >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-//template void divide_gpu<double, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-template void divide_gpu<double, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-template <typename T, typename D> struct Reciprocal : unary_function<T, D>\r
-{\r
- Reciprocal(double scale_) : scale(scale_) {}\r
- __device__ __forceinline__ D operator ()(T a) const\r
+ template <typename T> struct AbsdiffScalar : unary_function<T, T>\r
+ {\r
+ AbsdiffScalar(double val_) : val(val_) {}\r
+ __device__ __forceinline__ T operator ()(T a) const\r
+ {\r
+ return saturate_cast<T>(::fabs(a - val));\r
+ }\r
+ double val;\r
+ };\r
+\r
+ template <> struct TransformFunctorTraits< AbsdiffScalar<ushort> > : DefaultTransformFunctorTraits< AbsdiffScalar<ushort> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< AbsdiffScalar<short> > : DefaultTransformFunctorTraits< AbsdiffScalar<short> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< AbsdiffScalar<int> > : DefaultTransformFunctorTraits< AbsdiffScalar<int> >\r
{\r
- return a != 0 ? saturate_cast<D>(scale / a) : 0;\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< AbsdiffScalar<float> > : DefaultTransformFunctorTraits< AbsdiffScalar<float> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+\r
+ template <typename T> void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream)\r
+ {\r
+ cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
+ AbsdiffScalar<T> op(val);\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)dst, op, stream);\r
}\r
- const double scale;\r
-};\r
\r
-template <> struct TransformFunctorTraits< Reciprocal<ushort, ushort> > : DefaultTransformFunctorTraits< Reciprocal<ushort, ushort> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Reciprocal<short, short> > : DefaultTransformFunctorTraits< Reciprocal<short, short> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Reciprocal<int, int> > : DefaultTransformFunctorTraits< Reciprocal<int, int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Reciprocal<float, float> > : DefaultTransformFunctorTraits< Reciprocal<float, float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
+ template void absdiff_gpu<uchar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void absdiff_gpu<schar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void absdiff_gpu<ushort>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); \r
+ template void absdiff_gpu<short >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void absdiff_gpu<int >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); \r
+ //template void absdiff_gpu<float >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); \r
+ template void absdiff_gpu<double>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);\r
\r
-template <typename T, typename D> void divide_gpu(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
-{\r
- cudaSafeCall( cudaSetDoubleForDevice(&scalar) );\r
- Reciprocal<T, D> op(scalar);\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src2, (DevMem2D_<D>)dst, op, stream);\r
-}\r
-\r
-template void divide_gpu<uchar, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<uchar, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<uchar, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<uchar, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<uchar, int >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<uchar, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<uchar, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-//template void divide_gpu<schar, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<schar, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<schar, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<schar, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<schar, int >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<schar, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<schar, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-//template void divide_gpu<ushort, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<ushort, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<ushort, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<ushort, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<ushort, int >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<ushort, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<ushort, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-//template void divide_gpu<short, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<short, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<short, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<short, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<short, int >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<short, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<short, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-//template void divide_gpu<int, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<int, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<int, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<int, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<int, int >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<int, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<int, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-//template void divide_gpu<float, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<float, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<float, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<float, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<float, int >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<float, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<float, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-//template void divide_gpu<double, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<double, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<double, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<double, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<double, int >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void divide_gpu<double, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void divide_gpu<double, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-//////////////////////////////////////////////////////////////////////////\r
-// absdiff\r
-\r
-template <typename T> struct Absdiff : binary_function<T, T, T>\r
-{\r
- static __device__ __forceinline__ int abs(int a)\r
+ //////////////////////////////////////////////////////////////////////////////////////\r
+ // Compare\r
+\r
+ template <typename T> struct Equal : binary_function<T, T, uchar>\r
+ {\r
+ __device__ __forceinline__ uchar operator()(T src1, T src2) const\r
+ {\r
+ return static_cast<uchar>((src1 == src2) * 255);\r
+ }\r
+ };\r
+ template <typename T> struct NotEqual : binary_function<T, T, uchar>\r
+ {\r
+ __device__ __forceinline__ uchar operator()(T src1, T src2) const\r
+ {\r
+ return static_cast<uchar>((src1 != src2) * 255);\r
+ }\r
+ };\r
+ template <typename T> struct Less : binary_function<T, T, uchar>\r
+ {\r
+ __device__ __forceinline__ uchar operator()(T src1, T src2) const\r
+ {\r
+ return static_cast<uchar>((src1 < src2) * 255);\r
+ }\r
+ };\r
+ template <typename T> struct LessEqual : binary_function<T, T, uchar>\r
+ {\r
+ __device__ __forceinline__ uchar operator()(T src1, T src2) const\r
+ {\r
+ return static_cast<uchar>((src1 <= src2) * 255);\r
+ }\r
+ };\r
+\r
+ template <> struct TransformFunctorTraits< Equal<int> > : DefaultTransformFunctorTraits< Equal<int> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Equal<float> > : DefaultTransformFunctorTraits< Equal<float> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< NotEqual<int> > : DefaultTransformFunctorTraits< NotEqual<int> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< NotEqual<float> > : DefaultTransformFunctorTraits< NotEqual<float> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Less<int> > : DefaultTransformFunctorTraits< Less<int> >\r
{\r
- return ::abs(a);\r
- }\r
- static __device__ __forceinline__ float abs(float a)\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< Less<float> > : DefaultTransformFunctorTraits< Less<float> >\r
{\r
- return ::fabsf(a);\r
- }\r
- static __device__ __forceinline__ double abs(double a)\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< LessEqual<int> > : DefaultTransformFunctorTraits< LessEqual<int> >\r
{\r
- return ::fabs(a);\r
- }\r
-\r
- __device__ __forceinline__ T operator ()(T a, T b) const\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< LessEqual<float> > : DefaultTransformFunctorTraits< LessEqual<float> >\r
{\r
- return saturate_cast<T>(::abs(a - b));\r
- }\r
-};\r
-\r
-template <> struct TransformFunctorTraits< Absdiff<ushort> > : DefaultTransformFunctorTraits< Absdiff<ushort> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Absdiff<short> > : DefaultTransformFunctorTraits< Absdiff<short> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Absdiff<int> > : DefaultTransformFunctorTraits< Absdiff<int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Absdiff<float> > : DefaultTransformFunctorTraits< Absdiff<float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
\r
-template <typename T> void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
-{\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, Absdiff<T>(), stream);\r
-}\r
-\r
-//template void absdiff_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void absdiff_gpu<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void absdiff_gpu<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void absdiff_gpu<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void absdiff_gpu<int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-//template void absdiff_gpu<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void absdiff_gpu<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template <typename T> struct AbsdiffScalar : unary_function<T, T>\r
-{\r
- AbsdiffScalar(double val_) : val(val_) {}\r
- __device__ __forceinline__ T operator ()(T a) const\r
+ template <template <typename> class Op, typename T> void compare(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
{\r
- return saturate_cast<T>(::fabs(a - val));\r
+ Op<T> op;\r
+ ::cv::gpu::device::transform(static_cast< DevMem2D_<T> >(src1), static_cast< DevMem2D_<T> >(src2), dst, op, stream);\r
}\r
- double val;\r
-};\r
-\r
-template <> struct TransformFunctorTraits< AbsdiffScalar<ushort> > : DefaultTransformFunctorTraits< AbsdiffScalar<ushort> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< AbsdiffScalar<short> > : DefaultTransformFunctorTraits< AbsdiffScalar<short> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< AbsdiffScalar<int> > : DefaultTransformFunctorTraits< AbsdiffScalar<int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< AbsdiffScalar<float> > : DefaultTransformFunctorTraits< AbsdiffScalar<float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
\r
-template <typename T> void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream)\r
-{\r
- cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
- AbsdiffScalar<T> op(val);\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<T>)dst, op, stream);\r
-}\r
-\r
-template void absdiff_gpu<uchar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void absdiff_gpu<schar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void absdiff_gpu<ushort>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); \r
-template void absdiff_gpu<short >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void absdiff_gpu<int >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); \r
-//template void absdiff_gpu<float >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); \r
-template void absdiff_gpu<double>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-//////////////////////////////////////////////////////////////////////////////////////\r
-// Compare\r
-\r
-template <typename T> struct Equal : binary_function<T, T, uchar>\r
-{\r
- __device__ __forceinline__ uchar operator()(T src1, T src2) const\r
+ template <typename T> void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
{\r
- return static_cast<uchar>((src1 == src2) * 255);\r
+ compare<Equal, T>(src1, src2, dst, stream);\r
}\r
-};\r
-template <typename T> struct NotEqual : binary_function<T, T, uchar>\r
-{\r
- __device__ __forceinline__ uchar operator()(T src1, T src2) const\r
+ template <typename T> void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
{\r
- return static_cast<uchar>((src1 != src2) * 255);\r
+ compare<NotEqual, T>(src1, src2, dst, stream);\r
}\r
-};\r
-template <typename T> struct Less : binary_function<T, T, uchar>\r
-{\r
- __device__ __forceinline__ uchar operator()(T src1, T src2) const\r
+ template <typename T> void compare_lt(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
{\r
- return static_cast<uchar>((src1 < src2) * 255);\r
+ compare<Less, T>(src1, src2, dst, stream);\r
}\r
-};\r
-template <typename T> struct LessEqual : binary_function<T, T, uchar>\r
-{\r
- __device__ __forceinline__ uchar operator()(T src1, T src2) const\r
+ template <typename T> void compare_le(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
{\r
- return static_cast<uchar>((src1 <= src2) * 255);\r
+ compare<LessEqual, T>(src1, src2, dst, stream);\r
}\r
-};\r
-\r
-template <> struct TransformFunctorTraits< Equal<int> > : DefaultTransformFunctorTraits< Equal<int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Equal<float> > : DefaultTransformFunctorTraits< Equal<float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< NotEqual<int> > : DefaultTransformFunctorTraits< NotEqual<int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< NotEqual<float> > : DefaultTransformFunctorTraits< NotEqual<float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Less<int> > : DefaultTransformFunctorTraits< Less<int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< Less<float> > : DefaultTransformFunctorTraits< Less<float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< LessEqual<int> > : DefaultTransformFunctorTraits< LessEqual<int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< LessEqual<float> > : DefaultTransformFunctorTraits< LessEqual<float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
\r
-template <template <typename> class Op, typename T> void compare(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
-{\r
- Op<T> op;\r
- OPENCV_DEVICE_NAMESPACE_ transform(static_cast< DevMem2D_<T> >(src1), static_cast< DevMem2D_<T> >(src2), dst, op, stream);\r
-}\r
+ template void compare_eq<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_eq<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_eq<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_eq<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_eq<int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_eq<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_eq<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void compare_ne<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_ne<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_ne<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_ne<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_ne<int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_ne<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_ne<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void compare_lt<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_lt<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_lt<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_lt<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_lt<int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_lt<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_lt<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void compare_le<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_le<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_le<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_le<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_le<int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_le<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void compare_le<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // Unary bitwise logical matrix operations\r
+\r
+ enum { UN_OP_NOT };\r
+\r
+ template <typename T, int opid>\r
+ struct UnOp;\r
+\r
+ template <typename T>\r
+ struct UnOp<T, UN_OP_NOT>\r
+ { \r
+ static __device__ __forceinline__ T call(T v) { return ~v; }\r
+ };\r
\r
-template <typename T> void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
-{\r
- compare<Equal, T>(src1, src2, dst, stream);\r
-}\r
-template <typename T> void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
-{\r
- compare<NotEqual, T>(src1, src2, dst, stream);\r
-}\r
-template <typename T> void compare_lt(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
-{\r
- compare<Less, T>(src1, src2, dst, stream);\r
-}\r
-template <typename T> void compare_le(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
-{\r
- compare<LessEqual, T>(src1, src2, dst, stream);\r
-}\r
-\r
-template void compare_eq<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_eq<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_eq<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_eq<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_eq<int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_eq<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_eq<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void compare_ne<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_ne<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_ne<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_ne<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_ne<int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_ne<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_ne<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void compare_lt<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_lt<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_lt<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_lt<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_lt<int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_lt<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_lt<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void compare_le<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_le<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_le<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_le<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_le<int >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_le<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void compare_le<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-\r
-//////////////////////////////////////////////////////////////////////////\r
-// Unary bitwise logical matrix operations\r
-\r
-enum { UN_OP_NOT };\r
-\r
-template <typename T, int opid>\r
-struct UnOp;\r
-\r
-template <typename T>\r
-struct UnOp<T, UN_OP_NOT>\r
-{ \r
- static __device__ __forceinline__ T call(T v) { return ~v; }\r
-};\r
-\r
-\r
-template <int opid>\r
-__global__ void bitwiseUnOpKernel(int rows, int width, const PtrStepb src, PtrStepb dst)\r
-{\r
- const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;\r
- const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
\r
- if (y < rows) \r
+ template <int opid>\r
+ __global__ void bitwiseUnOpKernel(int rows, int width, const PtrStepb src, PtrStepb dst)\r
{\r
- uchar* dst_ptr = dst.ptr(y) + x;\r
- const uchar* src_ptr = src.ptr(y) + x;\r
- if (x + sizeof(uint) - 1 < width)\r
- {\r
- *(uint*)dst_ptr = UnOp<uint, opid>::call(*(uint*)src_ptr);\r
- }\r
- else\r
+ const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;\r
+ const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+ if (y < rows) \r
{\r
- const uchar* src_end = src.ptr(y) + width;\r
- while (src_ptr < src_end)\r
+ uchar* dst_ptr = dst.ptr(y) + x;\r
+ const uchar* src_ptr = src.ptr(y) + x;\r
+ if (x + sizeof(uint) - 1 < width)\r
{\r
- *dst_ptr++ = UnOp<uchar, opid>::call(*src_ptr++);\r
+ *(uint*)dst_ptr = UnOp<uint, opid>::call(*(uint*)src_ptr);\r
+ }\r
+ else\r
+ {\r
+ const uchar* src_end = src.ptr(y) + width;\r
+ while (src_ptr < src_end)\r
+ {\r
+ *dst_ptr++ = UnOp<uchar, opid>::call(*src_ptr++);\r
+ }\r
}\r
}\r
}\r
-}\r
\r
\r
-template <int opid>\r
-void bitwiseUnOp(int rows, int width, const PtrStepb src, PtrStepb dst, \r
- cudaStream_t stream)\r
-{\r
- dim3 threads(16, 16);\r
- dim3 grid(divUp(width, threads.x * sizeof(uint)), \r
- divUp(rows, threads.y));\r
-\r
- bitwiseUnOpKernel<opid><<<grid, threads>>>(rows, width, src, dst);\r
- cudaSafeCall( cudaGetLastError() );\r
+ template <int opid>\r
+ void bitwiseUnOp(int rows, int width, const PtrStepb src, PtrStepb dst, \r
+ cudaStream_t stream)\r
+ {\r
+ dim3 threads(16, 16);\r
+ dim3 grid(divUp(width, threads.x * sizeof(uint)), \r
+ divUp(rows, threads.y));\r
\r
- if (stream == 0) \r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ bitwiseUnOpKernel<opid><<<grid, threads>>>(rows, width, src, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
+ if (stream == 0) \r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-template <typename T, int opid>\r
-__global__ void bitwiseUnOpKernel(int rows, int cols, int cn, const PtrStepb src, \r
- const PtrStepb mask, PtrStepb dst)\r
-{\r
- const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
- const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
\r
- if (x < cols && y < rows && mask.ptr(y)[x / cn]) \r
+ template <typename T, int opid>\r
+ __global__ void bitwiseUnOpKernel(int rows, int cols, int cn, const PtrStepb src, \r
+ const PtrStepb mask, PtrStepb dst)\r
{\r
- T* dst_row = (T*)dst.ptr(y);\r
- const T* src_row = (const T*)src.ptr(y);\r
+ const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+ const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
\r
- dst_row[x] = UnOp<T, opid>::call(src_row[x]);\r
- }\r
-}\r
+ if (x < cols && y < rows && mask.ptr(y)[x / cn]) \r
+ {\r
+ T* dst_row = (T*)dst.ptr(y);\r
+ const T* src_row = (const T*)src.ptr(y);\r
\r
+ dst_row[x] = UnOp<T, opid>::call(src_row[x]);\r
+ }\r
+ }\r
\r
-template <typename T, int opid>\r
-void bitwiseUnOp(int rows, int cols, int cn, const PtrStepb src, \r
- const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
-{\r
- dim3 threads(16, 16);\r
- dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
\r
- bitwiseUnOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src, mask, dst); \r
- cudaSafeCall( cudaGetLastError() );\r
+ template <typename T, int opid>\r
+ void bitwiseUnOp(int rows, int cols, int cn, const PtrStepb src, \r
+ const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
+ {\r
+ dim3 threads(16, 16);\r
+ dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
\r
- if (stream == 0) \r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ bitwiseUnOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src, mask, dst); \r
+ cudaSafeCall( cudaGetLastError() );\r
\r
+ if (stream == 0) \r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, \r
- const PtrStepb src, PtrStepb dst, cudaStream_t stream)\r
-{\r
- bitwiseUnOp<UN_OP_NOT>(rows, static_cast<int>(cols * elem_size1 * cn), src, dst, stream);\r
-}\r
\r
+ void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, \r
+ const PtrStepb src, PtrStepb dst, cudaStream_t stream)\r
+ {\r
+ bitwiseUnOp<UN_OP_NOT>(rows, static_cast<int>(cols * elem_size1 * cn), src, dst, stream);\r
+ }\r
\r
-template <typename T>\r
-void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, \r
- const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
-{\r
- bitwiseUnOp<T, UN_OP_NOT>(rows, cols * cn, cn, src, mask, dst, stream);\r
-}\r
\r
-template void bitwiseMaskNotCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-template void bitwiseMaskNotCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-template void bitwiseMaskNotCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+ template <typename T>\r
+ void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, \r
+ const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
+ {\r
+ bitwiseUnOp<T, UN_OP_NOT>(rows, cols * cn, cn, src, mask, dst, stream);\r
+ }\r
\r
+ template void bitwiseMaskNotCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+ template void bitwiseMaskNotCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+ template void bitwiseMaskNotCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
\r
-//////////////////////////////////////////////////////////////////////////\r
-// Binary bitwise logical matrix operations\r
\r
-enum { BIN_OP_OR, BIN_OP_AND, BIN_OP_XOR };\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // Binary bitwise logical matrix operations\r
\r
-template <typename T, int opid>\r
-struct BinOp;\r
+ enum { BIN_OP_OR, BIN_OP_AND, BIN_OP_XOR };\r
\r
-template <typename T>\r
-struct BinOp<T, BIN_OP_OR>\r
-{ \r
- static __device__ __forceinline__ T call(T a, T b) { return a | b; } \r
-};\r
+ template <typename T, int opid>\r
+ struct BinOp;\r
\r
+ template <typename T>\r
+ struct BinOp<T, BIN_OP_OR>\r
+ { \r
+ static __device__ __forceinline__ T call(T a, T b) { return a | b; } \r
+ };\r
\r
-template <typename T>\r
-struct BinOp<T, BIN_OP_AND>\r
-{ \r
- static __device__ __forceinline__ T call(T a, T b) { return a & b; } \r
-};\r
\r
-template <typename T>\r
-struct BinOp<T, BIN_OP_XOR>\r
-{ \r
- static __device__ __forceinline__ T call(T a, T b) { return a ^ b; } \r
-};\r
+ template <typename T>\r
+ struct BinOp<T, BIN_OP_AND>\r
+ { \r
+ static __device__ __forceinline__ T call(T a, T b) { return a & b; } \r
+ };\r
\r
+ template <typename T>\r
+ struct BinOp<T, BIN_OP_XOR>\r
+ { \r
+ static __device__ __forceinline__ T call(T a, T b) { return a ^ b; } \r
+ };\r
\r
-template <int opid>\r
-__global__ void bitwiseBinOpKernel(int rows, int width, const PtrStepb src1, \r
- const PtrStepb src2, PtrStepb dst)\r
-{\r
- const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;\r
- const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
\r
- if (y < rows) \r
+ template <int opid>\r
+ __global__ void bitwiseBinOpKernel(int rows, int width, const PtrStepb src1, \r
+ const PtrStepb src2, PtrStepb dst)\r
{\r
- uchar* dst_ptr = dst.ptr(y) + x;\r
- const uchar* src1_ptr = src1.ptr(y) + x;\r
- const uchar* src2_ptr = src2.ptr(y) + x;\r
+ const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;\r
+ const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
\r
- if (x + sizeof(uint) - 1 < width)\r
+ if (y < rows) \r
{\r
- *(uint*)dst_ptr = BinOp<uint, opid>::call(*(uint*)src1_ptr, *(uint*)src2_ptr);\r
- }\r
- else\r
- {\r
- const uchar* src1_end = src1.ptr(y) + width;\r
- while (src1_ptr < src1_end)\r
+ uchar* dst_ptr = dst.ptr(y) + x;\r
+ const uchar* src1_ptr = src1.ptr(y) + x;\r
+ const uchar* src2_ptr = src2.ptr(y) + x;\r
+\r
+ if (x + sizeof(uint) - 1 < width)\r
{\r
- *dst_ptr++ = BinOp<uchar, opid>::call(*src1_ptr++, *src2_ptr++);\r
+ *(uint*)dst_ptr = BinOp<uint, opid>::call(*(uint*)src1_ptr, *(uint*)src2_ptr);\r
+ }\r
+ else\r
+ {\r
+ const uchar* src1_end = src1.ptr(y) + width;\r
+ while (src1_ptr < src1_end)\r
+ {\r
+ *dst_ptr++ = BinOp<uchar, opid>::call(*src1_ptr++, *src2_ptr++);\r
+ }\r
}\r
}\r
}\r
-}\r
\r
\r
-template <int opid>\r
-void bitwiseBinOp(int rows, int width, const PtrStepb src1, const PtrStepb src2, \r
- PtrStepb dst, cudaStream_t stream)\r
-{\r
- dim3 threads(16, 16);\r
- dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y));\r
-\r
- bitwiseBinOpKernel<opid><<<grid, threads>>>(rows, width, src1, src2, dst);\r
- cudaSafeCall( cudaGetLastError() );\r
+ template <int opid>\r
+ void bitwiseBinOp(int rows, int width, const PtrStepb src1, const PtrStepb src2, \r
+ PtrStepb dst, cudaStream_t stream)\r
+ {\r
+ dim3 threads(16, 16);\r
+ dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y));\r
\r
- if (stream == 0) \r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ bitwiseBinOpKernel<opid><<<grid, threads>>>(rows, width, src1, src2, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
+ if (stream == 0) \r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-template <typename T, int opid>\r
-__global__ void bitwiseBinOpKernel(\r
- int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
- const PtrStepb mask, PtrStepb dst)\r
-{\r
- const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
- const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
\r
- if (x < cols && y < rows && mask.ptr(y)[x / cn]) \r
+ template <typename T, int opid>\r
+ __global__ void bitwiseBinOpKernel(\r
+ int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
+ const PtrStepb mask, PtrStepb dst)\r
{\r
- T* dst_row = (T*)dst.ptr(y);\r
- const T* src1_row = (const T*)src1.ptr(y);\r
- const T* src2_row = (const T*)src2.ptr(y);\r
+ const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+ const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+ if (x < cols && y < rows && mask.ptr(y)[x / cn]) \r
+ {\r
+ T* dst_row = (T*)dst.ptr(y);\r
+ const T* src1_row = (const T*)src1.ptr(y);\r
+ const T* src2_row = (const T*)src2.ptr(y);\r
\r
- dst_row[x] = BinOp<T, opid>::call(src1_row[x], src2_row[x]);\r
+ dst_row[x] = BinOp<T, opid>::call(src1_row[x], src2_row[x]);\r
+ }\r
}\r
-}\r
\r
\r
-template <typename T, int opid>\r
-void bitwiseBinOp(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
- const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
-{\r
- dim3 threads(16, 16);\r
- dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+ template <typename T, int opid>\r
+ void bitwiseBinOp(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
+ const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
+ {\r
+ dim3 threads(16, 16);\r
+ dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
\r
- bitwiseBinOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src1, src2, mask, dst);\r
- cudaSafeCall( cudaGetLastError() );\r
+ bitwiseBinOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src1, src2, mask, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (stream == 0) \r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ if (stream == 0) \r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
\r
-void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, \r
- const PtrStepb src2, PtrStepb dst, cudaStream_t stream)\r
-{\r
- bitwiseBinOp<BIN_OP_OR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);\r
-}\r
+ void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, \r
+ const PtrStepb src2, PtrStepb dst, cudaStream_t stream)\r
+ {\r
+ bitwiseBinOp<BIN_OP_OR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);\r
+ }\r
\r
\r
-template <typename T>\r
-void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
- const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
-{\r
- bitwiseBinOp<T, BIN_OP_OR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);\r
-}\r
+ template <typename T>\r
+ void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
+ const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
+ {\r
+ bitwiseBinOp<T, BIN_OP_OR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);\r
+ }\r
\r
-template void bitwiseMaskOrCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-template void bitwiseMaskOrCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-template void bitwiseMaskOrCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+ template void bitwiseMaskOrCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+ template void bitwiseMaskOrCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+ template void bitwiseMaskOrCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
\r
\r
-void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, \r
- const PtrStepb src2, PtrStepb dst, cudaStream_t stream)\r
-{\r
- bitwiseBinOp<BIN_OP_AND>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);\r
-}\r
+ void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, \r
+ const PtrStepb src2, PtrStepb dst, cudaStream_t stream)\r
+ {\r
+ bitwiseBinOp<BIN_OP_AND>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);\r
+ }\r
\r
\r
-template <typename T>\r
-void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
- const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
-{\r
- bitwiseBinOp<T, BIN_OP_AND>(rows, cols * cn, cn, src1, src2, mask, dst, stream);\r
-}\r
+ template <typename T>\r
+ void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
+ const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
+ {\r
+ bitwiseBinOp<T, BIN_OP_AND>(rows, cols * cn, cn, src1, src2, mask, dst, stream);\r
+ }\r
\r
-template void bitwiseMaskAndCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-template void bitwiseMaskAndCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-template void bitwiseMaskAndCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+ template void bitwiseMaskAndCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+ template void bitwiseMaskAndCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+ template void bitwiseMaskAndCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
\r
\r
-void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, \r
- const PtrStepb src2, PtrStepb dst, cudaStream_t stream)\r
-{\r
- bitwiseBinOp<BIN_OP_XOR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);\r
-}\r
+ void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, \r
+ const PtrStepb src2, PtrStepb dst, cudaStream_t stream)\r
+ {\r
+ bitwiseBinOp<BIN_OP_XOR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);\r
+ }\r
\r
\r
-template <typename T>\r
-void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
- const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
-{\r
- bitwiseBinOp<T, BIN_OP_XOR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);\r
-}\r
+ template <typename T>\r
+ void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
+ const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
+ {\r
+ bitwiseBinOp<T, BIN_OP_XOR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);\r
+ }\r
\r
-template void bitwiseMaskXorCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-template void bitwiseMaskXorCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-template void bitwiseMaskXorCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+ template void bitwiseMaskXorCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+ template void bitwiseMaskXorCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+ template void bitwiseMaskXorCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // min/max\r
\r
-//////////////////////////////////////////////////////////////////////////\r
-// min/max\r
+ namespace detail\r
+ {\r
+ template <size_t size, typename F> struct MinMaxTraits : DefaultTransformFunctorTraits<F>\r
+ {\r
+ };\r
+ template <typename F> struct MinMaxTraits<2, F> : DefaultTransformFunctorTraits<F>\r
+ {\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <typename F> struct MinMaxTraits<4, F> : DefaultTransformFunctorTraits<F>\r
+ {\r
+ enum { smart_block_dim_y = 4 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ }\r
\r
-namespace detail\r
-{\r
- template <size_t size, typename F> struct MinMaxTraits : DefaultTransformFunctorTraits<F>\r
+ template <typename T> struct TransformFunctorTraits< minimum<T> > : detail::MinMaxTraits< sizeof(T), minimum<T> >\r
{\r
};\r
- template <typename F> struct MinMaxTraits<2, F> : DefaultTransformFunctorTraits<F>\r
+ template <typename T> struct TransformFunctorTraits< maximum<T> > : detail::MinMaxTraits< sizeof(T), maximum<T> >\r
{\r
- enum { smart_shift = 4 };\r
};\r
- template <typename F> struct MinMaxTraits<4, F> : DefaultTransformFunctorTraits<F>\r
+ template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< minimum<T> > >\r
+ {\r
+ };\r
+ template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< maximum<T> > >\r
{\r
- enum { smart_block_dim_y = 4 };\r
- enum { smart_shift = 4 };\r
};\r
-}\r
\r
-template <typename T> struct TransformFunctorTraits< minimum<T> > : detail::MinMaxTraits< sizeof(T), minimum<T> >\r
-{\r
-};\r
-template <typename T> struct TransformFunctorTraits< maximum<T> > : detail::MinMaxTraits< sizeof(T), maximum<T> >\r
-{\r
-};\r
-template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< minimum<T> > >\r
-{\r
-};\r
-template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< maximum<T> > >\r
-{\r
-};\r
+ template <typename T>\r
+ void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+ {\r
+ ::cv::gpu::device::transform(src1, src2, dst, minimum<T>(), stream); \r
+ }\r
\r
-template <typename T>\r
-void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
-{\r
- OPENCV_DEVICE_NAMESPACE_ transform(src1, src2, dst, minimum<T>(), stream); \r
-}\r
-\r
-template void min_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void min_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
-template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
-template void min_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
-template void min_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
-template void min_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
-template void min_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
-\r
-template <typename T>\r
-void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
-{\r
- OPENCV_DEVICE_NAMESPACE_ transform(src1, src2, dst, maximum<T>(), stream); \r
-}\r
-\r
-template void max_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void max_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
-template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
-template void max_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
-template void max_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
-template void max_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
-template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
-\r
-template <typename T>\r
-void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
-{\r
- OPENCV_DEVICE_NAMESPACE_ transform(src1, dst, device::bind2nd(minimum<T>(), src2), stream); \r
-}\r
-\r
-template void min_gpu<uchar >(const DevMem2Db& src1, uchar src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void min_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
-template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
-template void min_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
-template void min_gpu<int >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
-template void min_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
-template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
-\r
-template <typename T>\r
-void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
-{\r
- OPENCV_DEVICE_NAMESPACE_ transform(src1, dst, device::bind2nd(maximum<T>(), src2), stream); \r
-}\r
+ template void min_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void min_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
+ template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
+ template void min_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
+ template void min_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
+ template void min_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
+ template void min_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
+\r
+ template <typename T>\r
+ void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+ {\r
+ ::cv::gpu::device::transform(src1, src2, dst, maximum<T>(), stream); \r
+ }\r
\r
-template void max_gpu<uchar >(const DevMem2Db& src1, uchar src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template void max_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
-template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
-template void max_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
-template void max_gpu<int >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
-template void max_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
-template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
+ template void max_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void max_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
+ template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
+ template void max_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
+ template void max_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
+ template void max_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
+ template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
+\r
+ template <typename T>\r
+ void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+ {\r
+ ::cv::gpu::device::transform(src1, dst, device::bind2nd(minimum<T>(), src2), stream); \r
+ }\r
\r
+ template void min_gpu<uchar >(const DevMem2Db& src1, uchar src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void min_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
+ template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
+ template void min_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
+ template void min_gpu<int >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
+ template void min_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
+ template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
\r
-//////////////////////////////////////////////////////////////////////////\r
-// threshold\r
+ template <typename T>\r
+ void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+ {\r
+ ::cv::gpu::device::transform(src1, dst, device::bind2nd(maximum<T>(), src2), stream); \r
+ }\r
\r
-namespace detail\r
-{\r
- template <size_t size, typename F> struct ThresholdTraits : DefaultTransformFunctorTraits<F>\r
+ template void max_gpu<uchar >(const DevMem2Db& src1, uchar src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void max_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
+ template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
+ template void max_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
+ template void max_gpu<int >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
+ template void max_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
+ template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
+\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // threshold\r
+\r
+ namespace detail\r
+ {\r
+ template <size_t size, typename F> struct ThresholdTraits : DefaultTransformFunctorTraits<F>\r
+ {\r
+ };\r
+ template <typename F> struct ThresholdTraits<2, F> : DefaultTransformFunctorTraits<F>\r
+ {\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <typename F> struct ThresholdTraits<4, F> : DefaultTransformFunctorTraits<F>\r
+ {\r
+ enum { smart_block_dim_y = 4 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ }\r
+\r
+ template <typename T> struct TransformFunctorTraits< thresh_binary_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_func<T> >\r
{\r
};\r
- template <typename F> struct ThresholdTraits<2, F> : DefaultTransformFunctorTraits<F>\r
+ template <typename T> struct TransformFunctorTraits< thresh_binary_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_inv_func<T> >\r
{\r
- enum { smart_shift = 4 };\r
};\r
- template <typename F> struct ThresholdTraits<4, F> : DefaultTransformFunctorTraits<F>\r
+ template <typename T> struct TransformFunctorTraits< thresh_trunc_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_trunc_func<T> >\r
+ {\r
+ };\r
+ template <typename T> struct TransformFunctorTraits< thresh_to_zero_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_func<T> >\r
+ {\r
+ };\r
+ template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_inv_func<T> >\r
{\r
- enum { smart_block_dim_y = 4 };\r
- enum { smart_shift = 4 };\r
};\r
-}\r
\r
-template <typename T> struct TransformFunctorTraits< thresh_binary_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_func<T> >\r
-{\r
-};\r
-template <typename T> struct TransformFunctorTraits< thresh_binary_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_inv_func<T> >\r
-{\r
-};\r
-template <typename T> struct TransformFunctorTraits< thresh_trunc_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_trunc_func<T> >\r
-{\r
-};\r
-template <typename T> struct TransformFunctorTraits< thresh_to_zero_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_func<T> >\r
-{\r
-};\r
-template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_inv_func<T> >\r
-{\r
-};\r
+ template <template <typename> class Op, typename T>\r
+ void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal, \r
+ cudaStream_t stream)\r
+ {\r
+ Op<T> op(thresh, maxVal);\r
+ ::cv::gpu::device::transform(src, dst, op, stream);\r
+ }\r
\r
-template <template <typename> class Op, typename T>\r
-void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal, \r
- cudaStream_t stream)\r
-{\r
- Op<T> op(thresh, maxVal);\r
- OPENCV_DEVICE_NAMESPACE_ transform(src, dst, op, stream);\r
-}\r
+ template <typename T>\r
+ void threshold_gpu(const DevMem2Db& src, const DevMem2Db& dst, T thresh, T maxVal, int type,\r
+ cudaStream_t stream)\r
+ {\r
+ typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal, \r
+ cudaStream_t stream);\r
\r
-template <typename T>\r
-void threshold_gpu(const DevMem2Db& src, const DevMem2Db& dst, T thresh, T maxVal, int type,\r
- cudaStream_t stream)\r
-{\r
- typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal, \r
- cudaStream_t stream);\r
+ static const caller_t callers[] = \r
+ {\r
+ threshold_caller<thresh_binary_func, T>, \r
+ threshold_caller<thresh_binary_inv_func, T>, \r
+ threshold_caller<thresh_trunc_func, T>, \r
+ threshold_caller<thresh_to_zero_func, T>, \r
+ threshold_caller<thresh_to_zero_inv_func, T>\r
+ };\r
+\r
+ callers[type]((DevMem2D_<T>)src, (DevMem2D_<T>)dst, thresh, maxVal, stream);\r
+ }\r
\r
- static const caller_t callers[] = \r
- {\r
- threshold_caller<thresh_binary_func, T>, \r
- threshold_caller<thresh_binary_inv_func, T>, \r
- threshold_caller<thresh_trunc_func, T>, \r
- threshold_caller<thresh_to_zero_func, T>, \r
- threshold_caller<thresh_to_zero_inv_func, T>\r
+ template void threshold_gpu<uchar>(const DevMem2Db& src, const DevMem2Db& dst, uchar thresh, uchar maxVal, int type, cudaStream_t stream);\r
+ template void threshold_gpu<schar>(const DevMem2Db& src, const DevMem2Db& dst, schar thresh, schar maxVal, int type, cudaStream_t stream);\r
+ template void threshold_gpu<ushort>(const DevMem2Db& src, const DevMem2Db& dst, ushort thresh, ushort maxVal, int type, cudaStream_t stream);\r
+ template void threshold_gpu<short>(const DevMem2Db& src, const DevMem2Db& dst, short thresh, short maxVal, int type, cudaStream_t stream);\r
+ template void threshold_gpu<int>(const DevMem2Db& src, const DevMem2Db& dst, int thresh, int maxVal, int type, cudaStream_t stream);\r
+ template void threshold_gpu<float>(const DevMem2Db& src, const DevMem2Db& dst, float thresh, float maxVal, int type, cudaStream_t stream);\r
+ template void threshold_gpu<double>(const DevMem2Db& src, const DevMem2Db& dst, double thresh, double maxVal, int type, cudaStream_t stream);\r
+\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // pow\r
+\r
+ template<typename T, bool Signed = device::numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>\r
+ { \r
+ float power;\r
+ PowOp(float power_) : power(power_) {}\r
+ \r
+ __device__ __forceinline__ T operator()(const T& e) const\r
+ { \r
+ return saturate_cast<T>(__powf((float)e, power));\r
+ } \r
};\r
\r
- callers[type]((DevMem2D_<T>)src, (DevMem2D_<T>)dst, thresh, maxVal, stream);\r
-}\r
+ template<typename T> struct PowOp<T, true> : unary_function<T, T>\r
+ {\r
+ float power;\r
+ PowOp(float power_) : power(power_) {}\r
+\r
+ __device__ __forceinline__ float operator()(const T& e) const\r
+ {\r
+ T res = saturate_cast<T>(__powf((float)e, power)); \r
+ \r
+ if ( (e < 0) && (1 & (int)power) )\r
+ res *= -1; \r
+ return res; \r
+ }\r
+ };\r
\r
-template void threshold_gpu<uchar>(const DevMem2Db& src, const DevMem2Db& dst, uchar thresh, uchar maxVal, int type, cudaStream_t stream);\r
-template void threshold_gpu<schar>(const DevMem2Db& src, const DevMem2Db& dst, schar thresh, schar maxVal, int type, cudaStream_t stream);\r
-template void threshold_gpu<ushort>(const DevMem2Db& src, const DevMem2Db& dst, ushort thresh, ushort maxVal, int type, cudaStream_t stream);\r
-template void threshold_gpu<short>(const DevMem2Db& src, const DevMem2Db& dst, short thresh, short maxVal, int type, cudaStream_t stream);\r
-template void threshold_gpu<int>(const DevMem2Db& src, const DevMem2Db& dst, int thresh, int maxVal, int type, cudaStream_t stream);\r
-template void threshold_gpu<float>(const DevMem2Db& src, const DevMem2Db& dst, float thresh, float maxVal, int type, cudaStream_t stream);\r
-template void threshold_gpu<double>(const DevMem2Db& src, const DevMem2Db& dst, double thresh, double maxVal, int type, cudaStream_t stream);\r
+ template<> struct PowOp<float> : unary_function<float, float>\r
+ {\r
+ float power;\r
+ PowOp(float power_) : power(power_) {}\r
\r
+ __device__ __forceinline__ float operator()(const float& e) const\r
+ {\r
+ return __powf(::fabs(e), power);\r
+ }\r
+ };\r
\r
+ namespace detail\r
+ {\r
+ template <size_t size, typename T> struct PowOpTraits : DefaultTransformFunctorTraits< PowOp<T> >\r
+ {\r
+ };\r
+ template <typename T> struct PowOpTraits<1, T> : DefaultTransformFunctorTraits< PowOp<T> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 8 };\r
+ };\r
+ template <typename T> struct PowOpTraits<2, T> : DefaultTransformFunctorTraits< PowOp<T> >\r
+ {\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <typename T> struct PowOpTraits<4, T> : DefaultTransformFunctorTraits< PowOp<T> >\r
+ {\r
+ enum { smart_block_dim_y = 4 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ }\r
\r
+ template <typename T> struct TransformFunctorTraits< PowOp<T> > : detail::PowOpTraits<sizeof(T), T>\r
+ {\r
+ };\r
\r
-//////////////////////////////////////////////////////////////////////////\r
-// pow\r
+ template<typename T>\r
+ void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream)\r
+ {\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<T>)dst, PowOp<T>(power), stream);\r
+ } \r
\r
-template<typename T, bool Signed = device::numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>\r
-{ \r
- float power;\r
- PowOp(float power_) : power(power_) {}\r
- \r
- __device__ __forceinline__ T operator()(const T& e) const\r
- { \r
- return saturate_cast<T>(__powf((float)e, power));\r
- } \r
-};\r
+ template void pow_caller<uchar>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
+ template void pow_caller<schar>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
+ template void pow_caller<short>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
+ template void pow_caller<ushort>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
+ template void pow_caller<int>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
+ template void pow_caller<float>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
\r
-template<typename T> struct PowOp<T, true> : unary_function<T, T>\r
-{\r
- float power;\r
- PowOp(float power_) : power(power_) {}\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // addWeighted\r
\r
- __device__ __forceinline__ float operator()(const T& e) const\r
+ template <typename T1, typename T2, typename D> struct AddWeighted : binary_function<T1, T2, D>\r
{\r
- T res = saturate_cast<T>(__powf((float)e, power)); \r
- \r
- if ( (e < 0) && (1 & (int)power) )\r
- res *= -1; \r
- return res; \r
- }\r
-};\r
+ __host__ __device__ __forceinline__ AddWeighted(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}\r
\r
-template<> struct PowOp<float> : unary_function<float, float>\r
-{\r
- float power;\r
- PowOp(float power_) : power(power_) {}\r
+ __device__ __forceinline__ D operator ()(typename TypeTraits<T1>::ParameterType a, typename TypeTraits<T2>::ParameterType b) const\r
+ {\r
+ return saturate_cast<D>(alpha * a + beta * b + gamma);\r
+ }\r
+\r
+ const double alpha;\r
+ const double beta;\r
+ const double gamma;\r
+ };\r
\r
- __device__ __forceinline__ float operator()(const float& e) const\r
+ template <> struct TransformFunctorTraits< AddWeighted<ushort, ushort, ushort> > : DefaultTransformFunctorTraits< AddWeighted<ushort, ushort, ushort> >\r
{\r
- return __powf(::fabs(e), power);\r
- }\r
-};\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< AddWeighted<ushort, ushort, short> > : DefaultTransformFunctorTraits< AddWeighted<ushort, ushort, short> >\r
+ {\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< AddWeighted<ushort, short, ushort> > : DefaultTransformFunctorTraits< AddWeighted<ushort, short, ushort> >\r
+ {\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< AddWeighted<ushort, short, short> > : DefaultTransformFunctorTraits< AddWeighted<ushort, short, short> >\r
+ {\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< AddWeighted<short, short, ushort> > : DefaultTransformFunctorTraits< AddWeighted<short, short, ushort> >\r
+ {\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< AddWeighted<short, short, short> > : DefaultTransformFunctorTraits< AddWeighted<short, short, short> >\r
+ {\r
+ enum { smart_shift = 4 };\r
+ };\r
\r
-namespace detail\r
-{\r
- template <size_t size, typename T> struct PowOpTraits : DefaultTransformFunctorTraits< PowOp<T> >\r
+ template <> struct TransformFunctorTraits< AddWeighted<int, int, int> > : DefaultTransformFunctorTraits< AddWeighted<int, int, int> >\r
{\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
};\r
- template <typename T> struct PowOpTraits<1, T> : DefaultTransformFunctorTraits< PowOp<T> >\r
+ template <> struct TransformFunctorTraits< AddWeighted<int, int, float> > : DefaultTransformFunctorTraits< AddWeighted<int, int, float> >\r
{\r
enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 8 };\r
+ enum { smart_shift = 4 };\r
};\r
- template <typename T> struct PowOpTraits<2, T> : DefaultTransformFunctorTraits< PowOp<T> >\r
+ template <> struct TransformFunctorTraits< AddWeighted<int, float, int> > : DefaultTransformFunctorTraits< AddWeighted<int, float, int> >\r
{\r
+ enum { smart_block_dim_y = 8 };\r
enum { smart_shift = 4 };\r
};\r
- template <typename T> struct PowOpTraits<4, T> : DefaultTransformFunctorTraits< PowOp<T> >\r
+ template <> struct TransformFunctorTraits< AddWeighted<int, float, float> > : DefaultTransformFunctorTraits< AddWeighted<int, float, float> >\r
{\r
- enum { smart_block_dim_y = 4 };\r
+ enum { smart_block_dim_y = 8 };\r
enum { smart_shift = 4 };\r
};\r
-}\r
-\r
-template <typename T> struct TransformFunctorTraits< PowOp<T> > : detail::PowOpTraits<sizeof(T), T>\r
-{\r
-};\r
-\r
-template<typename T>\r
-void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream)\r
-{\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src, (DevMem2D_<T>)dst, PowOp<T>(power), stream);\r
-} \r
-\r
-template void pow_caller<uchar>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
-template void pow_caller<schar>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
-template void pow_caller<short>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
-template void pow_caller<ushort>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
-template void pow_caller<int>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
-template void pow_caller<float>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
-\r
-\r
-\r
-\r
-//////////////////////////////////////////////////////////////////////////\r
-// addWeighted\r
-\r
-template <typename T1, typename T2, typename D> struct AddWeighted : binary_function<T1, T2, D>\r
-{\r
- __host__ __device__ __forceinline__ AddWeighted(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}\r
-\r
- __device__ __forceinline__ D operator ()(typename TypeTraits<T1>::ParameterType a, typename TypeTraits<T2>::ParameterType b) const\r
+ template <> struct TransformFunctorTraits< AddWeighted<float, float, int> > : DefaultTransformFunctorTraits< AddWeighted<float, float, float> >\r
{\r
- return saturate_cast<D>(alpha * a + beta * b + gamma);\r
- }\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <> struct TransformFunctorTraits< AddWeighted<float, float, float> > : DefaultTransformFunctorTraits< AddWeighted<float, float, float> >\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
\r
- const double alpha;\r
- const double beta;\r
- const double gamma;\r
-};\r
+ template <typename T1, typename T2, typename D>\r
+ void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream)\r
+ {\r
+ cudaSafeCall( cudaSetDoubleForDevice(&alpha) );\r
+ cudaSafeCall( cudaSetDoubleForDevice(&beta) );\r
+ cudaSafeCall( cudaSetDoubleForDevice(&gamma) );\r
\r
-template <> struct TransformFunctorTraits< AddWeighted<ushort, ushort, ushort> > : DefaultTransformFunctorTraits< AddWeighted<ushort, ushort, ushort> >\r
-{\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< AddWeighted<ushort, ushort, short> > : DefaultTransformFunctorTraits< AddWeighted<ushort, ushort, short> >\r
-{\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< AddWeighted<ushort, short, ushort> > : DefaultTransformFunctorTraits< AddWeighted<ushort, short, ushort> >\r
-{\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< AddWeighted<ushort, short, short> > : DefaultTransformFunctorTraits< AddWeighted<ushort, short, short> >\r
-{\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< AddWeighted<short, short, ushort> > : DefaultTransformFunctorTraits< AddWeighted<short, short, ushort> >\r
-{\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< AddWeighted<short, short, short> > : DefaultTransformFunctorTraits< AddWeighted<short, short, short> >\r
-{\r
- enum { smart_shift = 4 };\r
-};\r
+ AddWeighted<T1, T2, D> op(alpha, beta, gamma);\r
\r
-template <> struct TransformFunctorTraits< AddWeighted<int, int, int> > : DefaultTransformFunctorTraits< AddWeighted<int, int, int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< AddWeighted<int, int, float> > : DefaultTransformFunctorTraits< AddWeighted<int, int, float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< AddWeighted<int, float, int> > : DefaultTransformFunctorTraits< AddWeighted<int, float, int> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< AddWeighted<int, float, float> > : DefaultTransformFunctorTraits< AddWeighted<int, float, float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< AddWeighted<float, float, int> > : DefaultTransformFunctorTraits< AddWeighted<float, float, float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
-template <> struct TransformFunctorTraits< AddWeighted<float, float, float> > : DefaultTransformFunctorTraits< AddWeighted<float, float, float> >\r
-{\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
-};\r
+ ::cv::gpu::device::transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), static_cast< DevMem2D_<D> >(dst), op, stream);\r
+ }\r
\r
-template <typename T1, typename T2, typename D>\r
-void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream)\r
-{\r
- cudaSafeCall( cudaSetDoubleForDevice(&alpha) );\r
- cudaSafeCall( cudaSetDoubleForDevice(&beta) );\r
- cudaSafeCall( cudaSetDoubleForDevice(&gamma) );\r
-\r
- AddWeighted<T1, T2, D> op(alpha, beta, gamma);\r
-\r
- OPENCV_DEVICE_NAMESPACE_ transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), static_cast< DevMem2D_<D> >(dst), op, stream);\r
-}\r
-\r
-template void addWeighted_gpu<uchar, uchar, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, uchar, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, uchar, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, uchar, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, uchar, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, uchar, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, uchar, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<uchar, schar, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, schar, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, schar, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, schar, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, schar, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, schar, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, schar, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<uchar, ushort, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, ushort, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, ushort, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, ushort, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, ushort, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, ushort, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, ushort, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<uchar, short, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, short, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, short, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, short, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, short, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, short, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, short, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<uchar, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<uchar, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<uchar, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<uchar, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-\r
-\r
-template void addWeighted_gpu<schar, schar, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, schar, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, schar, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, schar, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, schar, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, schar, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, schar, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<schar, ushort, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, ushort, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, ushort, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, ushort, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, ushort, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, ushort, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, ushort, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<schar, short, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, short, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, short, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, short, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, short, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, short, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, short, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<schar, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<schar, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<schar, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<schar, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-\r
-\r
-template void addWeighted_gpu<ushort, ushort, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, ushort, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, ushort, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, ushort, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, ushort, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, ushort, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, ushort, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<ushort, short, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, short, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, short, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, short, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, short, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, short, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, short, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<ushort, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<ushort, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<ushort, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<ushort, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-\r
-\r
-template void addWeighted_gpu<short, short, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, short, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, short, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, short, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, short, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, short, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, short, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<short, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<short, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<short, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<short, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-\r
-\r
-template void addWeighted_gpu<int, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<int, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<int, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<int, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-\r
-\r
-template void addWeighted_gpu<float, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<float, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<float, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<float, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<float, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<float, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<float, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template void addWeighted_gpu<float, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<float, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<float, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<float, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<float, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<float, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<float, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-\r
-\r
-template void addWeighted_gpu<double, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<double, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<double, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<double, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<double, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<double, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-template void addWeighted_gpu<double, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template void addWeighted_gpu<uchar, uchar, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, uchar, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, uchar, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, uchar, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, uchar, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, uchar, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, uchar, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<uchar, schar, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, schar, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, schar, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, schar, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, schar, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, schar, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, schar, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<uchar, ushort, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, ushort, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, ushort, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, ushort, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, ushort, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, ushort, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, ushort, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<uchar, short, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, short, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, short, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, short, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, short, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, short, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, short, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<uchar, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<uchar, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<uchar, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<uchar, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+\r
+\r
+ template void addWeighted_gpu<schar, schar, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, schar, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, schar, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, schar, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, schar, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, schar, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, schar, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<schar, ushort, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, ushort, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, ushort, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, ushort, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, ushort, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, ushort, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, ushort, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<schar, short, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, short, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, short, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, short, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, short, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, short, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, short, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<schar, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<schar, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<schar, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<schar, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+\r
+\r
+ template void addWeighted_gpu<ushort, ushort, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, ushort, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, ushort, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, ushort, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, ushort, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, ushort, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, ushort, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<ushort, short, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, short, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, short, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, short, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, short, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, short, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, short, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<ushort, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<ushort, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<ushort, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<ushort, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+\r
+\r
+ template void addWeighted_gpu<short, short, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, short, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, short, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, short, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, short, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, short, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, short, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<short, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<short, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<short, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<short, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+\r
+\r
+ template void addWeighted_gpu<int, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<int, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<int, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<int, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+\r
+\r
+ template void addWeighted_gpu<float, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<float, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<float, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<float, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<float, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<float, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<float, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+ template void addWeighted_gpu<float, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<float, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<float, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<float, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<float, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<float, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<float, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+\r
+\r
+ template void addWeighted_gpu<double, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<double, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<double, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<double, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<double, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<double, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void addWeighted_gpu<double, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "opencv2/gpu/device/utility.hpp"\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-#define UINT_BITS 32U\r
-\r
-//Warps == subhistograms per threadblock\r
-#define WARP_COUNT 6\r
-\r
-//Threadblock size\r
-#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)\r
-#define HISTOGRAM256_BIN_COUNT 256\r
-\r
-//Shared memory per threadblock\r
-#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)\r
-\r
-#define PARTIAL_HISTOGRAM256_COUNT 240\r
-\r
-#define MERGE_THREADBLOCK_SIZE 256\r
-\r
-#define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ #define UINT_BITS 32U\r
\r
-namespace hist {\r
+ //Warps == subhistograms per threadblock\r
+ #define WARP_COUNT 6\r
\r
-#if (!USE_SMEM_ATOMICS)\r
+ //Threadblock size\r
+ #define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)\r
+ #define HISTOGRAM256_BIN_COUNT 256\r
\r
- #define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )\r
+ //Shared memory per threadblock\r
+ #define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)\r
\r
- __forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)\r
- {\r
- uint count;\r
- do\r
- {\r
- count = s_WarpHist[data] & TAG_MASK;\r
- count = threadTag | (count + 1);\r
- s_WarpHist[data] = count;\r
- } while (s_WarpHist[data] != count);\r
- }\r
+ #define PARTIAL_HISTOGRAM256_COUNT 240\r
\r
-#else\r
+ #define MERGE_THREADBLOCK_SIZE 256\r
\r
- #define TAG_MASK 0xFFFFFFFFU\r
+ #define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)\r
\r
- __forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)\r
+ namespace hist \r
{\r
- atomicAdd(s_WarpHist + data, 1);\r
- }\r
-\r
-#endif\r
-\r
-__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)\r
-{\r
- uint x = pos_x << 2;\r
-\r
- if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);\r
- if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);\r
- if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);\r
- if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);\r
-}\r
-\r
-__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)\r
-{\r
- //Per-warp subhistogram storage\r
- __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];\r
- uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;\r
+ #if (!USE_SMEM_ATOMICS)\r
\r
- //Clear shared memory storage for current threadblock before processing\r
- #pragma unroll\r
- for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)\r
- s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;\r
+ #define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )\r
\r
- //Cycle through the entire data set, update subhistograms for each warp\r
- const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);\r
+ __forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)\r
+ {\r
+ uint count;\r
+ do\r
+ {\r
+ count = s_WarpHist[data] & TAG_MASK;\r
+ count = threadTag | (count + 1);\r
+ s_WarpHist[data] = count;\r
+ } while (s_WarpHist[data] != count);\r
+ }\r
\r
- __syncthreads();\r
- const uint colsui = d_Data.step / sizeof(uint);\r
- for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)\r
- {\r
- uint pos_y = pos / colsui;\r
- uint pos_x = pos % colsui;\r
- uint data = d_Data.ptr(pos_y)[pos_x];\r
- addWord(s_WarpHist, data, tag, pos_x, cols);\r
- }\r
-\r
- //Merge per-warp histograms into per-block and write to global memory\r
- __syncthreads();\r
- for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)\r
- {\r
- uint sum = 0;\r
-\r
- for (uint i = 0; i < WARP_COUNT; i++)\r
- sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;\r
-\r
- d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;\r
- }\r
-}\r
+ #else\r
\r
-////////////////////////////////////////////////////////////////////////////////\r
-// Merge histogram256() output\r
-// Run one threadblock per bin; each threadblock adds up the same bin counter\r
-// from every partial histogram. Reads are uncoalesced, but mergeHistogram256\r
-// takes only a fraction of total processing time\r
-////////////////////////////////////////////////////////////////////////////////\r
+ #define TAG_MASK 0xFFFFFFFFU\r
\r
-__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)\r
-{\r
- uint sum = 0;\r
+ __forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)\r
+ {\r
+ atomicAdd(s_WarpHist + data, 1);\r
+ }\r
\r
- #pragma unroll\r
- for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)\r
- sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];\r
+ #endif\r
\r
- __shared__ uint data[MERGE_THREADBLOCK_SIZE];\r
- data[threadIdx.x] = sum;\r
+ __forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)\r
+ {\r
+ uint x = pos_x << 2;\r
\r
- for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)\r
- {\r
- __syncthreads();\r
- if(threadIdx.x < stride)\r
- data[threadIdx.x] += data[threadIdx.x + stride];\r
- }\r
+ if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);\r
+ if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);\r
+ if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);\r
+ if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);\r
+ }\r
\r
- if(threadIdx.x == 0)\r
- d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);\r
-}\r
+ __global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)\r
+ {\r
+ //Per-warp subhistogram storage\r
+ __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];\r
+ uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;\r
+\r
+ //Clear shared memory storage for current threadblock before processing\r
+ #pragma unroll\r
+ for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)\r
+ s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;\r
+\r
+ //Cycle through the entire data set, update subhistograms for each warp\r
+ const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);\r
+\r
+ __syncthreads();\r
+ const uint colsui = d_Data.step / sizeof(uint);\r
+ for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)\r
+ {\r
+ uint pos_y = pos / colsui;\r
+ uint pos_x = pos % colsui;\r
+ uint data = d_Data.ptr(pos_y)[pos_x];\r
+ addWord(s_WarpHist, data, tag, pos_x, cols);\r
+ }\r
+\r
+ //Merge per-warp histograms into per-block and write to global memory\r
+ __syncthreads();\r
+ for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)\r
+ {\r
+ uint sum = 0;\r
+\r
+ for (uint i = 0; i < WARP_COUNT; i++)\r
+ sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;\r
+\r
+ d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;\r
+ }\r
+ }\r
+\r
+ ////////////////////////////////////////////////////////////////////////////////\r
+ // Merge histogram256() output\r
+ // Run one threadblock per bin; each threadblock adds up the same bin counter\r
+ // from every partial histogram. Reads are uncoalesced, but mergeHistogram256\r
+ // takes only a fraction of total processing time\r
+ ////////////////////////////////////////////////////////////////////////////////\r
+\r
+ __global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)\r
+ {\r
+ uint sum = 0;\r
\r
-void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream)\r
-{\r
- histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(\r
- DevMem2D_<uint>(src),\r
- buf, \r
- static_cast<uint>(src.rows * src.step / sizeof(uint)),\r
- src.cols);\r
+ #pragma unroll\r
+ for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)\r
+ sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];\r
\r
- cudaSafeCall( cudaGetLastError() );\r
+ __shared__ uint data[MERGE_THREADBLOCK_SIZE];\r
+ data[threadIdx.x] = sum;\r
\r
- mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);\r
+ for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)\r
+ {\r
+ __syncthreads();\r
+ if(threadIdx.x < stride)\r
+ data[threadIdx.x] += data[threadIdx.x + stride];\r
+ }\r
\r
- cudaSafeCall( cudaGetLastError() );\r
+ if(threadIdx.x == 0)\r
+ d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);\r
+ }\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream)\r
+ {\r
+ histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(\r
+ DevMem2D_<uint>(src),\r
+ buf, \r
+ static_cast<uint>(src.rows * src.step / sizeof(uint)),\r
+ src.cols);\r
\r
-__constant__ int c_lut[256];\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-__global__ void equalizeHist(const DevMem2Db src, PtrStepb dst)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);\r
\r
- if (x < src.cols && y < src.rows)\r
- {\r
- const uchar val = src.ptr(y)[x];\r
- const int lut = c_lut[val];\r
- dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);\r
- }\r
-}\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream)\r
-{\r
- dim3 block(16, 16);\r
- dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );\r
+ __constant__ int c_lut[256];\r
\r
- equalizeHist<<<grid, block, 0, stream>>>(src, dst);\r
- cudaSafeCall( cudaGetLastError() );\r
+ __global__ void equalizeHist(const DevMem2Db src, PtrStepb dst)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if (x < src.cols && y < src.rows)\r
+ {\r
+ const uchar val = src.ptr(y)[x];\r
+ const int lut = c_lut[val];\r
+ dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);\r
+ }\r
+ }\r
+\r
+ void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream)\r
+ {\r
+ dim3 block(16, 16);\r
+ dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );\r
\r
-} // namespace hist\r
+ equalizeHist<<<grid, block, 0, stream>>>(src, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+ } // namespace hist\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#include "internal_shared.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-// Other values are not supported\r
-#define CELL_WIDTH 8\r
-#define CELL_HEIGHT 8\r
-#define CELLS_PER_BLOCK_X 2\r
-#define CELLS_PER_BLOCK_Y 2\r
-\r
-namespace hog {\r
-\r
-__constant__ int cnbins;\r
-__constant__ int cblock_stride_x;\r
-__constant__ int cblock_stride_y;\r
-__constant__ int cnblocks_win_x;\r
-__constant__ int cnblocks_win_y;\r
-__constant__ int cblock_hist_size;\r
-__constant__ int cblock_hist_size_2up;\r
-__constant__ int cdescr_size;\r
-__constant__ int cdescr_width;\r
-\r
-\r
-/* Returns the nearest upper power of two, works only for \r
-the typical GPU thread count (pert block) values */\r
-int power_2up(unsigned int n)\r
+namespace cv { namespace gpu { namespace device \r
{\r
- if (n < 1) return 1;\r
- else if (n < 2) return 2;\r
- else if (n < 4) return 4;\r
- else if (n < 8) return 8;\r
- else if (n < 16) return 16;\r
- else if (n < 32) return 32;\r
- else if (n < 64) return 64;\r
- else if (n < 128) return 128;\r
- else if (n < 256) return 256;\r
- else if (n < 512) return 512;\r
- else if (n < 1024) return 1024;\r
- return -1; // Input is too big\r
-}\r
-\r
-\r
-void set_up_constants(int nbins, int block_stride_x, int block_stride_y, \r
- int nblocks_win_x, int nblocks_win_y)\r
-{\r
- cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) ); \r
- cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) ); \r
- cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) ); \r
- cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) ); \r
- cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) ); \r
-\r
- int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y; \r
- cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) ); \r
-\r
- int block_hist_size_2up = power_2up(block_hist_size); \r
- cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up)) );\r
-\r
- int descr_width = nblocks_win_x * block_hist_size;\r
- cudaSafeCall( cudaMemcpyToSymbol(cdescr_width, &descr_width, sizeof(descr_width)) );\r
-\r
- int descr_size = descr_width * nblocks_win_y;\r
- cudaSafeCall( cudaMemcpyToSymbol(cdescr_size, &descr_size, sizeof(descr_size)) );\r
-}\r
-\r
-\r
-//----------------------------------------------------------------------------\r
-// Histogram computation\r
-\r
-\r
-template <int nblocks> // Number of histogram blocks processed by single GPU thread block\r
-__global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrElemStepf grad, \r
- const PtrElemStep qangle, float scale, float* block_hists)\r
-{\r
- const int block_x = threadIdx.z;\r
- const int cell_x = threadIdx.x / 16;\r
- const int cell_y = threadIdx.y;\r
- const int cell_thread_x = threadIdx.x & 0xF;\r
-\r
- if (blockIdx.x * blockDim.z + block_x >= img_block_width)\r
- return;\r
-\r
- extern __shared__ float smem[];\r
- float* hists = smem;\r
- float* final_hist = smem + cnbins * 48 * nblocks;\r
-\r
- const int offset_x = (blockIdx.x * blockDim.z + block_x) * cblock_stride_x + \r
- 4 * cell_x + cell_thread_x;\r
- const int offset_y = blockIdx.y * cblock_stride_y + 4 * cell_y;\r
+ // Other values are not supported\r
+ #define CELL_WIDTH 8\r
+ #define CELL_HEIGHT 8\r
+ #define CELLS_PER_BLOCK_X 2\r
+ #define CELLS_PER_BLOCK_Y 2\r
\r
- const float* grad_ptr = grad.ptr(offset_y) + offset_x * 2;\r
- const unsigned char* qangle_ptr = qangle.ptr(offset_y) + offset_x * 2;\r
-\r
- // 12 means that 12 pixels affect on block's cell (in one row)\r
- if (cell_thread_x < 12)\r
+ namespace hog \r
{\r
- float* hist = hists + 12 * (cell_y * blockDim.z * CELLS_PER_BLOCK_Y + \r
- cell_x + block_x * CELLS_PER_BLOCK_X) + \r
- cell_thread_x;\r
- for (int bin_id = 0; bin_id < cnbins; ++bin_id)\r
- hist[bin_id * 48 * nblocks] = 0.f;\r
+ __constant__ int cnbins;\r
+ __constant__ int cblock_stride_x;\r
+ __constant__ int cblock_stride_y;\r
+ __constant__ int cnblocks_win_x;\r
+ __constant__ int cnblocks_win_y;\r
+ __constant__ int cblock_hist_size;\r
+ __constant__ int cblock_hist_size_2up;\r
+ __constant__ int cdescr_size;\r
+ __constant__ int cdescr_width;\r
+\r
+\r
+ /* Returns the nearest upper power of two, works only for \r
+ the typical GPU thread count (pert block) values */\r
+ int power_2up(unsigned int n)\r
+ {\r
+ if (n < 1) return 1;\r
+ else if (n < 2) return 2;\r
+ else if (n < 4) return 4;\r
+ else if (n < 8) return 8;\r
+ else if (n < 16) return 16;\r
+ else if (n < 32) return 32;\r
+ else if (n < 64) return 64;\r
+ else if (n < 128) return 128;\r
+ else if (n < 256) return 256;\r
+ else if (n < 512) return 512;\r
+ else if (n < 1024) return 1024;\r
+ return -1; // Input is too big\r
+ }\r
\r
- const int dist_x = -4 + (int)cell_thread_x - 4 * cell_x;\r
\r
- const int dist_y_begin = -4 - 4 * (int)threadIdx.y;\r
- for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)\r
+ void set_up_constants(int nbins, int block_stride_x, int block_stride_y, \r
+ int nblocks_win_x, int nblocks_win_y)\r
{\r
- float2 vote = *(const float2*)grad_ptr;\r
- uchar2 bin = *(const uchar2*)qangle_ptr;\r
+ cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) ); \r
+ cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) ); \r
+ cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) ); \r
+ cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) ); \r
+ cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) ); \r
\r
- grad_ptr += grad.step;\r
- qangle_ptr += qangle.step;\r
+ int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y; \r
+ cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) ); \r
\r
- int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);\r
- int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);\r
+ int block_hist_size_2up = power_2up(block_hist_size); \r
+ cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up)) );\r
\r
- float gaussian = ::expf(-(dist_center_y * dist_center_y + \r
- dist_center_x * dist_center_x) * scale);\r
- float interp_weight = (8.f - ::fabs(dist_y + 0.5f)) * \r
- (8.f - ::fabs(dist_x + 0.5f)) / 64.f;\r
+ int descr_width = nblocks_win_x * block_hist_size;\r
+ cudaSafeCall( cudaMemcpyToSymbol(cdescr_width, &descr_width, sizeof(descr_width)) );\r
\r
- hist[bin.x * 48 * nblocks] += gaussian * interp_weight * vote.x;\r
- hist[bin.y * 48 * nblocks] += gaussian * interp_weight * vote.y;\r
- }\r
-\r
- volatile float* hist_ = hist;\r
- for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48 * nblocks)\r
- {\r
- if (cell_thread_x < 6) hist_[0] += hist_[6];\r
- if (cell_thread_x < 3) hist_[0] += hist_[3];\r
- if (cell_thread_x == 0) \r
- final_hist[((cell_x + block_x * 2) * 2 + cell_y) * cnbins + bin_id] \r
- = hist_[0] + hist_[1] + hist_[2];\r
+ int descr_size = descr_width * nblocks_win_y;\r
+ cudaSafeCall( cudaMemcpyToSymbol(cdescr_size, &descr_size, sizeof(descr_size)) );\r
}\r
- }\r
\r
- __syncthreads();\r
\r
- float* block_hist = block_hists + (blockIdx.y * img_block_width + \r
- blockIdx.x * blockDim.z + block_x) * \r
- cblock_hist_size; \r
+ //----------------------------------------------------------------------------\r
+ // Histogram computation\r
\r
- int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x;\r
- if (tid < cblock_hist_size)\r
- block_hist[tid] = final_hist[block_x * cblock_hist_size + tid]; \r
-}\r
\r
+ template <int nblocks> // Number of histogram blocks processed by single GPU thread block\r
+ __global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrElemStepf grad, \r
+ const PtrElemStep qangle, float scale, float* block_hists)\r
+ {\r
+ const int block_x = threadIdx.z;\r
+ const int cell_x = threadIdx.x / 16;\r
+ const int cell_y = threadIdx.y;\r
+ const int cell_thread_x = threadIdx.x & 0xF;\r
+\r
+ if (blockIdx.x * blockDim.z + block_x >= img_block_width)\r
+ return;\r
+\r
+ extern __shared__ float smem[];\r
+ float* hists = smem;\r
+ float* final_hist = smem + cnbins * 48 * nblocks;\r
+\r
+ const int offset_x = (blockIdx.x * blockDim.z + block_x) * cblock_stride_x + \r
+ 4 * cell_x + cell_thread_x;\r
+ const int offset_y = blockIdx.y * cblock_stride_y + 4 * cell_y;\r
+\r
+ const float* grad_ptr = grad.ptr(offset_y) + offset_x * 2;\r
+ const unsigned char* qangle_ptr = qangle.ptr(offset_y) + offset_x * 2;\r
+\r
+ // 12 means that 12 pixels affect on block's cell (in one row)\r
+ if (cell_thread_x < 12)\r
+ {\r
+ float* hist = hists + 12 * (cell_y * blockDim.z * CELLS_PER_BLOCK_Y + \r
+ cell_x + block_x * CELLS_PER_BLOCK_X) + \r
+ cell_thread_x;\r
+ for (int bin_id = 0; bin_id < cnbins; ++bin_id)\r
+ hist[bin_id * 48 * nblocks] = 0.f;\r
+\r
+ const int dist_x = -4 + (int)cell_thread_x - 4 * cell_x;\r
+\r
+ const int dist_y_begin = -4 - 4 * (int)threadIdx.y;\r
+ for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)\r
+ {\r
+ float2 vote = *(const float2*)grad_ptr;\r
+ uchar2 bin = *(const uchar2*)qangle_ptr;\r
+\r
+ grad_ptr += grad.step;\r
+ qangle_ptr += qangle.step;\r
+\r
+ int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);\r
+ int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);\r
+\r
+ float gaussian = ::expf(-(dist_center_y * dist_center_y + \r
+ dist_center_x * dist_center_x) * scale);\r
+ float interp_weight = (8.f - ::fabs(dist_y + 0.5f)) * \r
+ (8.f - ::fabs(dist_x + 0.5f)) / 64.f;\r
+\r
+ hist[bin.x * 48 * nblocks] += gaussian * interp_weight * vote.x;\r
+ hist[bin.y * 48 * nblocks] += gaussian * interp_weight * vote.y;\r
+ }\r
+\r
+ volatile float* hist_ = hist;\r
+ for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48 * nblocks)\r
+ {\r
+ if (cell_thread_x < 6) hist_[0] += hist_[6];\r
+ if (cell_thread_x < 3) hist_[0] += hist_[3];\r
+ if (cell_thread_x == 0) \r
+ final_hist[((cell_x + block_x * 2) * 2 + cell_y) * cnbins + bin_id] \r
+ = hist_[0] + hist_[1] + hist_[2];\r
+ }\r
+ }\r
+\r
+ __syncthreads();\r
+\r
+ float* block_hist = block_hists + (blockIdx.y * img_block_width + \r
+ blockIdx.x * blockDim.z + block_x) * \r
+ cblock_hist_size; \r
+\r
+ int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x;\r
+ if (tid < cblock_hist_size)\r
+ block_hist[tid] = final_hist[block_x * cblock_hist_size + tid]; \r
+ }\r
\r
-void compute_hists(int nbins, int block_stride_x, int block_stride_y, \r
- int height, int width, const DevMem2Df& grad, \r
- const DevMem2Db& qangle, float sigma, float* block_hists) \r
-{\r
- const int nblocks = 1;\r
-\r
- int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / \r
- block_stride_x;\r
- int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / \r
- block_stride_y;\r
-\r
- dim3 grid(divUp(img_block_width, nblocks), img_block_height);\r
- dim3 threads(32, 2, nblocks);\r
\r
- cudaSafeCall(cudaFuncSetCacheConfig(compute_hists_kernel_many_blocks<nblocks>, \r
- cudaFuncCachePreferL1));\r
- \r
- // Precompute gaussian spatial window parameter\r
- float scale = 1.f / (2.f * sigma * sigma);\r
+ void compute_hists(int nbins, int block_stride_x, int block_stride_y, \r
+ int height, int width, const DevMem2Df& grad, \r
+ const DevMem2Db& qangle, float sigma, float* block_hists) \r
+ {\r
+ const int nblocks = 1;\r
+\r
+ int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / \r
+ block_stride_x;\r
+ int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / \r
+ block_stride_y;\r
+\r
+ dim3 grid(divUp(img_block_width, nblocks), img_block_height);\r
+ dim3 threads(32, 2, nblocks);\r
+\r
+ cudaSafeCall(cudaFuncSetCacheConfig(compute_hists_kernel_many_blocks<nblocks>, \r
+ cudaFuncCachePreferL1));\r
+ \r
+ // Precompute gaussian spatial window parameter\r
+ float scale = 1.f / (2.f * sigma * sigma);\r
+\r
+ int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12 * nblocks) * sizeof(float);\r
+ int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * nblocks) * sizeof(float);\r
+ int smem = hists_size + final_hists_size;\r
+ compute_hists_kernel_many_blocks<nblocks><<<grid, threads, smem>>>(\r
+ img_block_width, grad, qangle, scale, block_hists);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12 * nblocks) * sizeof(float);\r
- int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * nblocks) * sizeof(float);\r
- int smem = hists_size + final_hists_size;\r
- compute_hists_kernel_many_blocks<nblocks><<<grid, threads, smem>>>(\r
- img_block_width, grad, qangle, scale, block_hists);\r
- cudaSafeCall( cudaGetLastError() );\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ //-------------------------------------------------------------\r
+ // Normalization of histograms via L2Hys_norm\r
+ //\r
+\r
+\r
+ template<int size> \r
+ __device__ float reduce_smem(volatile float* smem)\r
+ { \r
+ unsigned int tid = threadIdx.x;\r
+ float sum = smem[tid];\r
+\r
+ if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; __syncthreads(); }\r
+ if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; __syncthreads(); }\r
+ if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; __syncthreads(); }\r
+ \r
+ if (tid < 32)\r
+ { \r
+ if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];\r
+ if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];\r
+ if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];\r
+ if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];\r
+ if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];\r
+ if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];\r
+ }\r
+\r
+ __syncthreads();\r
+ sum = smem[0];\r
+ \r
+ return sum;\r
+ }\r
\r
\r
-//-------------------------------------------------------------\r
-// Normalization of histograms via L2Hys_norm\r
-//\r
+ template <int nthreads, // Number of threads which process one block historgam \r
+ int nblocks> // Number of block hisograms processed by one GPU thread block\r
+ __global__ void normalize_hists_kernel_many_blocks(const int block_hist_size,\r
+ const int img_block_width, \r
+ float* block_hists, float threshold)\r
+ {\r
+ if (blockIdx.x * blockDim.z + threadIdx.z >= img_block_width)\r
+ return;\r
+\r
+ float* hist = block_hists + (blockIdx.y * img_block_width + \r
+ blockIdx.x * blockDim.z + threadIdx.z) * \r
+ block_hist_size + threadIdx.x;\r
+ \r
+ __shared__ float sh_squares[nthreads * nblocks];\r
+ float* squares = sh_squares + threadIdx.z * nthreads;\r
+ \r
+ float elem = 0.f;\r
+ if (threadIdx.x < block_hist_size)\r
+ elem = hist[0];\r
+ \r
+ squares[threadIdx.x] = elem * elem; \r
+\r
+ __syncthreads();\r
+ float sum = reduce_smem<nthreads>(squares);\r
+ \r
+ float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size); \r
+ elem = ::min(elem * scale, threshold);\r
+ \r
+ __syncthreads();\r
+ squares[threadIdx.x] = elem * elem;\r
+\r
+ __syncthreads();\r
+ sum = reduce_smem<nthreads>(squares);\r
+ scale = 1.0f / (::sqrtf(sum) + 1e-3f);\r
+ \r
+ if (threadIdx.x < block_hist_size)\r
+ hist[0] = elem * scale;\r
+ }\r
\r
\r
-template<int size> \r
-__device__ float reduce_smem(volatile float* smem)\r
-{ \r
- unsigned int tid = threadIdx.x;\r
- float sum = smem[tid];\r
-\r
- if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; __syncthreads(); }\r
- if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; __syncthreads(); }\r
- if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; __syncthreads(); }\r
- \r
- if (tid < 32)\r
- { \r
- if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];\r
- if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];\r
- if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];\r
- if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];\r
- if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];\r
- if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];\r
- }\r
-\r
- __syncthreads();\r
- sum = smem[0];\r
- \r
- return sum;\r
-}\r
-\r
-\r
-template <int nthreads, // Number of threads which process one block historgam \r
- int nblocks> // Number of block hisograms processed by one GPU thread block\r
-__global__ void normalize_hists_kernel_many_blocks(const int block_hist_size,\r
- const int img_block_width, \r
- float* block_hists, float threshold)\r
-{\r
- if (blockIdx.x * blockDim.z + threadIdx.z >= img_block_width)\r
- return;\r
-\r
- float* hist = block_hists + (blockIdx.y * img_block_width + \r
- blockIdx.x * blockDim.z + threadIdx.z) * \r
- block_hist_size + threadIdx.x;\r
- \r
- __shared__ float sh_squares[nthreads * nblocks];\r
- float* squares = sh_squares + threadIdx.z * nthreads;\r
- \r
- float elem = 0.f;\r
- if (threadIdx.x < block_hist_size)\r
- elem = hist[0];\r
- \r
- squares[threadIdx.x] = elem * elem; \r
-\r
- __syncthreads();\r
- float sum = reduce_smem<nthreads>(squares);\r
- \r
- float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size); \r
- elem = ::min(elem * scale, threshold);\r
- \r
- __syncthreads();\r
- squares[threadIdx.x] = elem * elem;\r
-\r
- __syncthreads();\r
- sum = reduce_smem<nthreads>(squares);\r
- scale = 1.0f / (::sqrtf(sum) + 1e-3f);\r
- \r
- if (threadIdx.x < block_hist_size)\r
- hist[0] = elem * scale;\r
-}\r
-\r
-\r
-void normalize_hists(int nbins, int block_stride_x, int block_stride_y, \r
- int height, int width, float* block_hists, float threshold)\r
-{ \r
- const int nblocks = 1;\r
-\r
- int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;\r
- int nthreads = power_2up(block_hist_size);\r
- dim3 threads(nthreads, 1, nblocks);\r
-\r
- int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;\r
- int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;\r
- dim3 grid(divUp(img_block_width, nblocks), img_block_height);\r
-\r
- if (nthreads == 32)\r
- normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);\r
- else if (nthreads == 64)\r
- normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);\r
- else if (nthreads == 128)\r
- normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);\r
- else if (nthreads == 256)\r
- normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);\r
- else if (nthreads == 512)\r
- normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);\r
- else\r
- cv::gpu::error("normalize_hists: histogram's size is too big, try to decrease number of bins", __FILE__, __LINE__);\r
-\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
-\r
-//---------------------------------------------------------------------\r
-// Linear SVM based classification\r
-//\r
+ void normalize_hists(int nbins, int block_stride_x, int block_stride_y, \r
+ int height, int width, float* block_hists, float threshold)\r
+ { \r
+ const int nblocks = 1;\r
+\r
+ int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;\r
+ int nthreads = power_2up(block_hist_size);\r
+ dim3 threads(nthreads, 1, nblocks);\r
+\r
+ int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;\r
+ int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;\r
+ dim3 grid(divUp(img_block_width, nblocks), img_block_height);\r
+\r
+ if (nthreads == 32)\r
+ normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);\r
+ else if (nthreads == 64)\r
+ normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);\r
+ else if (nthreads == 128)\r
+ normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);\r
+ else if (nthreads == 256)\r
+ normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);\r
+ else if (nthreads == 512)\r
+ normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);\r
+ else\r
+ cv::gpu::error("normalize_hists: histogram's size is too big, try to decrease number of bins", __FILE__, __LINE__);\r
\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-template <int nthreads, // Number of threads per one histogram block \r
- int nblocks> // Number of histogram block processed by single GPU thread block\r
-__global__ void classify_hists_kernel_many_blocks(const int img_win_width, const int img_block_width, \r
- const int win_block_stride_x, const int win_block_stride_y,\r
- const float* block_hists, const float* coefs,\r
- float free_coef, float threshold, unsigned char* labels)\r
-{ \r
- const int win_x = threadIdx.z;\r
- if (blockIdx.x * blockDim.z + win_x >= img_win_width)\r
- return;\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + \r
- blockIdx.x * win_block_stride_x * blockDim.z + win_x) * \r
- cblock_hist_size;\r
\r
- float product = 0.f;\r
- for (int i = threadIdx.x; i < cdescr_size; i += nthreads)\r
- {\r
- int offset_y = i / cdescr_width;\r
- int offset_x = i - offset_y * cdescr_width;\r
- product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];\r
- }\r
-\r
- __shared__ float products[nthreads * nblocks];\r
-\r
- const int tid = threadIdx.z * nthreads + threadIdx.x;\r
- products[tid] = product;\r
-\r
- __syncthreads();\r
-\r
- if (nthreads >= 512) \r
- { \r
- if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];\r
- __syncthreads(); \r
- }\r
- if (nthreads >= 256) \r
- { \r
- if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128]; \r
- __syncthreads(); \r
- }\r
- if (nthreads >= 128) \r
- { \r
- if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64]; \r
- __syncthreads(); \r
- }\r
- \r
- if (threadIdx.x < 32)\r
- { \r
- volatile float* smem = products;\r
- if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];\r
- if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];\r
- if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];\r
- if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];\r
- if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];\r
- if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];\r
- }\r
-\r
- if (threadIdx.x == 0)\r
- labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);\r
-}\r
-\r
-\r
-void classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x, \r
- int win_stride_y, int win_stride_x, int height, int width, float* block_hists, \r
- float* coefs, float free_coef, float threshold, unsigned char* labels)\r
-{ \r
- const int nthreads = 256;\r
- const int nblocks = 1;\r
-\r
- int win_block_stride_x = win_stride_x / block_stride_x;\r
- int win_block_stride_y = win_stride_y / block_stride_y;\r
- int img_win_width = (width - win_width + win_stride_x) / win_stride_x;\r
- int img_win_height = (height - win_height + win_stride_y) / win_stride_y;\r
-\r
- dim3 threads(nthreads, 1, nblocks);\r
- dim3 grid(divUp(img_win_width, nblocks), img_win_height);\r
-\r
- cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks<nthreads, nblocks>, cudaFuncCachePreferL1));\r
-\r
- int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;\r
- classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(\r
- img_win_width, img_block_width, win_block_stride_x, win_block_stride_y, \r
- block_hists, coefs, free_coef, threshold, labels);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
-//----------------------------------------------------------------------------\r
-// Extract descriptors\r
-\r
-\r
-template <int nthreads>\r
-__global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, \r
- const float* block_hists, PtrElemStepf descriptors)\r
-{\r
- // Get left top corner of the window in src\r
- const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + \r
- blockIdx.x * win_block_stride_x) * cblock_hist_size;\r
+ //---------------------------------------------------------------------\r
+ // Linear SVM based classification\r
+ //\r
+\r
+\r
+ template <int nthreads, // Number of threads per one histogram block \r
+ int nblocks> // Number of histogram block processed by single GPU thread block\r
+ __global__ void classify_hists_kernel_many_blocks(const int img_win_width, const int img_block_width, \r
+ const int win_block_stride_x, const int win_block_stride_y,\r
+ const float* block_hists, const float* coefs,\r
+ float free_coef, float threshold, unsigned char* labels)\r
+ { \r
+ const int win_x = threadIdx.z;\r
+ if (blockIdx.x * blockDim.z + win_x >= img_win_width)\r
+ return;\r
+\r
+ const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + \r
+ blockIdx.x * win_block_stride_x * blockDim.z + win_x) * \r
+ cblock_hist_size;\r
+\r
+ float product = 0.f;\r
+ for (int i = threadIdx.x; i < cdescr_size; i += nthreads)\r
+ {\r
+ int offset_y = i / cdescr_width;\r
+ int offset_x = i - offset_y * cdescr_width;\r
+ product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];\r
+ }\r
+\r
+ __shared__ float products[nthreads * nblocks];\r
+\r
+ const int tid = threadIdx.z * nthreads + threadIdx.x;\r
+ products[tid] = product;\r
+\r
+ __syncthreads();\r
+\r
+ if (nthreads >= 512) \r
+ { \r
+ if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];\r
+ __syncthreads(); \r
+ }\r
+ if (nthreads >= 256) \r
+ { \r
+ if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128]; \r
+ __syncthreads(); \r
+ }\r
+ if (nthreads >= 128) \r
+ { \r
+ if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64]; \r
+ __syncthreads(); \r
+ }\r
+ \r
+ if (threadIdx.x < 32)\r
+ { \r
+ volatile float* smem = products;\r
+ if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];\r
+ if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];\r
+ if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];\r
+ if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];\r
+ if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];\r
+ if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];\r
+ }\r
+\r
+ if (threadIdx.x == 0)\r
+ labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);\r
+ }\r
\r
- // Get left top corner of the window in dst\r
- float* descriptor = descriptors.ptr(blockIdx.y * gridDim.x + blockIdx.x);\r
\r
- // Copy elements from src to dst\r
- for (int i = threadIdx.x; i < cdescr_size; i += nthreads)\r
- {\r
- int offset_y = i / cdescr_width;\r
- int offset_x = i - offset_y * cdescr_width;\r
- descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];\r
- }\r
-}\r
+ void classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x, \r
+ int win_stride_y, int win_stride_x, int height, int width, float* block_hists, \r
+ float* coefs, float free_coef, float threshold, unsigned char* labels)\r
+ { \r
+ const int nthreads = 256;\r
+ const int nblocks = 1;\r
\r
+ int win_block_stride_x = win_stride_x / block_stride_x;\r
+ int win_block_stride_y = win_stride_y / block_stride_y;\r
+ int img_win_width = (width - win_width + win_stride_x) / win_stride_x;\r
+ int img_win_height = (height - win_height + win_stride_y) / win_stride_y;\r
\r
-void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, \r
- int height, int width, float* block_hists, DevMem2Df descriptors)\r
-{\r
- const int nthreads = 256;\r
+ dim3 threads(nthreads, 1, nblocks);\r
+ dim3 grid(divUp(img_win_width, nblocks), img_win_height);\r
\r
- int win_block_stride_x = win_stride_x / block_stride_x;\r
- int win_block_stride_y = win_stride_y / block_stride_y;\r
- int img_win_width = (width - win_width + win_stride_x) / win_stride_x;\r
- int img_win_height = (height - win_height + win_stride_y) / win_stride_y;\r
- dim3 threads(nthreads, 1);\r
- dim3 grid(img_win_width, img_win_height);\r
+ cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks<nthreads, nblocks>, cudaFuncCachePreferL1));\r
\r
- int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;\r
- extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(\r
- img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);\r
- cudaSafeCall( cudaGetLastError() );\r
+ int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;\r
+ classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(\r
+ img_win_width, img_block_width, win_block_stride_x, win_block_stride_y, \r
+ block_hists, coefs, free_coef, threshold, labels);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
+ //----------------------------------------------------------------------------\r
+ // Extract descriptors\r
\r
-template <int nthreads>\r
-__global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x, \r
- const int win_block_stride_y, const float* block_hists, \r
- PtrElemStepf descriptors)\r
-{\r
- // Get left top corner of the window in src\r
- const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + \r
- blockIdx.x * win_block_stride_x) * cblock_hist_size;\r
\r
- // Get left top corner of the window in dst\r
- float* descriptor = descriptors.ptr(blockIdx.y * gridDim.x + blockIdx.x);\r
+ template <int nthreads>\r
+ __global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, \r
+ const float* block_hists, PtrElemStepf descriptors)\r
+ {\r
+ // Get left top corner of the window in src\r
+ const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + \r
+ blockIdx.x * win_block_stride_x) * cblock_hist_size;\r
+\r
+ // Get left top corner of the window in dst\r
+ float* descriptor = descriptors.ptr(blockIdx.y * gridDim.x + blockIdx.x);\r
+\r
+ // Copy elements from src to dst\r
+ for (int i = threadIdx.x; i < cdescr_size; i += nthreads)\r
+ {\r
+ int offset_y = i / cdescr_width;\r
+ int offset_x = i - offset_y * cdescr_width;\r
+ descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];\r
+ }\r
+ }\r
\r
- // Copy elements from src to dst\r
- for (int i = threadIdx.x; i < cdescr_size; i += nthreads)\r
- {\r
- int block_idx = i / cblock_hist_size;\r
- int idx_in_block = i - block_idx * cblock_hist_size;\r
\r
- int y = block_idx / cnblocks_win_x;\r
- int x = block_idx - y * cnblocks_win_x;\r
+ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, \r
+ int height, int width, float* block_hists, DevMem2Df descriptors)\r
+ {\r
+ const int nthreads = 256;\r
\r
- descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] \r
- = hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];\r
- }\r
-}\r
+ int win_block_stride_x = win_stride_x / block_stride_x;\r
+ int win_block_stride_y = win_stride_y / block_stride_y;\r
+ int img_win_width = (width - win_width + win_stride_x) / win_stride_x;\r
+ int img_win_height = (height - win_height + win_stride_y) / win_stride_y;\r
+ dim3 threads(nthreads, 1);\r
+ dim3 grid(img_win_width, img_win_height);\r
\r
+ int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;\r
+ extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(\r
+ img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, \r
- int win_stride_y, int win_stride_x, int height, int width, float* block_hists, \r
- DevMem2Df descriptors)\r
-{\r
- const int nthreads = 256;\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- int win_block_stride_x = win_stride_x / block_stride_x;\r
- int win_block_stride_y = win_stride_y / block_stride_y;\r
- int img_win_width = (width - win_width + win_stride_x) / win_stride_x;\r
- int img_win_height = (height - win_height + win_stride_y) / win_stride_y;\r
- dim3 threads(nthreads, 1);\r
- dim3 grid(img_win_width, img_win_height);\r
\r
- int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;\r
- extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(\r
- img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);\r
- cudaSafeCall( cudaGetLastError() );\r
+ template <int nthreads>\r
+ __global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x, \r
+ const int win_block_stride_y, const float* block_hists, \r
+ PtrElemStepf descriptors)\r
+ {\r
+ // Get left top corner of the window in src\r
+ const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + \r
+ blockIdx.x * win_block_stride_x) * cblock_hist_size;\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ // Get left top corner of the window in dst\r
+ float* descriptor = descriptors.ptr(blockIdx.y * gridDim.x + blockIdx.x);\r
\r
-//----------------------------------------------------------------------------\r
-// Gradients computation\r
+ // Copy elements from src to dst\r
+ for (int i = threadIdx.x; i < cdescr_size; i += nthreads)\r
+ {\r
+ int block_idx = i / cblock_hist_size;\r
+ int idx_in_block = i - block_idx * cblock_hist_size;\r
\r
+ int y = block_idx / cnblocks_win_x;\r
+ int x = block_idx - y * cnblocks_win_x;\r
\r
-template <int nthreads, int correct_gamma>\r
-__global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrElemStep img, \r
- float angle_scale, PtrElemStepf grad, PtrElemStep qangle)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] \r
+ = hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];\r
+ }\r
+ }\r
\r
- const uchar4* row = (const uchar4*)img.ptr(blockIdx.y);\r
\r
- __shared__ float sh_row[(nthreads + 2) * 3];\r
+ void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, \r
+ int win_stride_y, int win_stride_x, int height, int width, float* block_hists, \r
+ DevMem2Df descriptors)\r
+ {\r
+ const int nthreads = 256;\r
\r
- uchar4 val;\r
- if (x < width) \r
- val = row[x]; \r
- else \r
- val = row[width - 2];\r
+ int win_block_stride_x = win_stride_x / block_stride_x;\r
+ int win_block_stride_y = win_stride_y / block_stride_y;\r
+ int img_win_width = (width - win_width + win_stride_x) / win_stride_x;\r
+ int img_win_height = (height - win_height + win_stride_y) / win_stride_y;\r
+ dim3 threads(nthreads, 1);\r
+ dim3 grid(img_win_width, img_win_height);\r
\r
- sh_row[threadIdx.x + 1] = val.x;\r
- sh_row[threadIdx.x + 1 + (nthreads + 2)] = val.y;\r
- sh_row[threadIdx.x + 1 + 2 * (nthreads + 2)] = val.z;\r
+ int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;\r
+ extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(\r
+ img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (threadIdx.x == 0)\r
- {\r
- val = row[::max(x - 1, 1)];\r
- sh_row[0] = val.x;\r
- sh_row[(nthreads + 2)] = val.y;\r
- sh_row[2 * (nthreads + 2)] = val.z;\r
- }\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- if (threadIdx.x == blockDim.x - 1)\r
- {\r
- val = row[::min(x + 1, width - 2)];\r
- sh_row[blockDim.x + 1] = val.x;\r
- sh_row[blockDim.x + 1 + (nthreads + 2)] = val.y;\r
- sh_row[blockDim.x + 1 + 2 * (nthreads + 2)] = val.z;\r
- }\r
-\r
- __syncthreads();\r
- if (x < width)\r
- {\r
- float3 a, b;\r
+ //----------------------------------------------------------------------------\r
+ // Gradients computation\r
\r
- b.x = sh_row[threadIdx.x + 2];\r
- b.y = sh_row[threadIdx.x + 2 + (nthreads + 2)];\r
- b.z = sh_row[threadIdx.x + 2 + 2 * (nthreads + 2)];\r
- a.x = sh_row[threadIdx.x];\r
- a.y = sh_row[threadIdx.x + (nthreads + 2)];\r
- a.z = sh_row[threadIdx.x + 2 * (nthreads + 2)];\r
\r
- float3 dx;\r
- if (correct_gamma)\r
- dx = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z)); \r
- else\r
- dx = make_float3(b.x - a.x, b.y - a.y, b.z - a.z); \r
+ template <int nthreads, int correct_gamma>\r
+ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrElemStep img, \r
+ float angle_scale, PtrElemStepf grad, PtrElemStep qangle)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+\r
+ const uchar4* row = (const uchar4*)img.ptr(blockIdx.y);\r
+\r
+ __shared__ float sh_row[(nthreads + 2) * 3];\r
+\r
+ uchar4 val;\r
+ if (x < width) \r
+ val = row[x]; \r
+ else \r
+ val = row[width - 2];\r
+\r
+ sh_row[threadIdx.x + 1] = val.x;\r
+ sh_row[threadIdx.x + 1 + (nthreads + 2)] = val.y;\r
+ sh_row[threadIdx.x + 1 + 2 * (nthreads + 2)] = val.z;\r
+\r
+ if (threadIdx.x == 0)\r
+ {\r
+ val = row[::max(x - 1, 1)];\r
+ sh_row[0] = val.x;\r
+ sh_row[(nthreads + 2)] = val.y;\r
+ sh_row[2 * (nthreads + 2)] = val.z;\r
+ }\r
+\r
+ if (threadIdx.x == blockDim.x - 1)\r
+ {\r
+ val = row[::min(x + 1, width - 2)];\r
+ sh_row[blockDim.x + 1] = val.x;\r
+ sh_row[blockDim.x + 1 + (nthreads + 2)] = val.y;\r
+ sh_row[blockDim.x + 1 + 2 * (nthreads + 2)] = val.z;\r
+ }\r
+\r
+ __syncthreads();\r
+ if (x < width)\r
+ {\r
+ float3 a, b;\r
+\r
+ b.x = sh_row[threadIdx.x + 2];\r
+ b.y = sh_row[threadIdx.x + 2 + (nthreads + 2)];\r
+ b.z = sh_row[threadIdx.x + 2 + 2 * (nthreads + 2)];\r
+ a.x = sh_row[threadIdx.x];\r
+ a.y = sh_row[threadIdx.x + (nthreads + 2)];\r
+ a.z = sh_row[threadIdx.x + 2 * (nthreads + 2)];\r
+\r
+ float3 dx;\r
+ if (correct_gamma)\r
+ dx = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z)); \r
+ else\r
+ dx = make_float3(b.x - a.x, b.y - a.y, b.z - a.z); \r
+\r
+ float3 dy = make_float3(0.f, 0.f, 0.f);\r
+\r
+ if (blockIdx.y > 0 && blockIdx.y < height - 1)\r
+ {\r
+ val = ((const uchar4*)img.ptr(blockIdx.y - 1))[x];\r
+ a = make_float3(val.x, val.y, val.z);\r
+\r
+ val = ((const uchar4*)img.ptr(blockIdx.y + 1))[x];\r
+ b = make_float3(val.x, val.y, val.z);\r
+\r
+ if (correct_gamma)\r
+ dy = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));\r
+ else\r
+ dy = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);\r
+ }\r
+\r
+ float best_dx = dx.x;\r
+ float best_dy = dy.x;\r
+\r
+ float mag0 = dx.x * dx.x + dy.x * dy.x;\r
+ float mag1 = dx.y * dx.y + dy.y * dy.y;\r
+ if (mag0 < mag1) \r
+ {\r
+ best_dx = dx.y;\r
+ best_dy = dy.y;\r
+ mag0 = mag1;\r
+ }\r
+\r
+ mag1 = dx.z * dx.z + dy.z * dy.z;\r
+ if (mag0 < mag1)\r
+ {\r
+ best_dx = dx.z;\r
+ best_dy = dy.z;\r
+ mag0 = mag1;\r
+ }\r
+\r
+ mag0 = ::sqrtf(mag0);\r
+\r
+ float ang = (::atan2f(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;\r
+ int hidx = (int)::floorf(ang);\r
+ ang -= hidx;\r
+ hidx = (hidx + cnbins) % cnbins;\r
+\r
+ ((uchar2*)qangle.ptr(blockIdx.y))[x] = make_uchar2(hidx, (hidx + 1) % cnbins);\r
+ ((float2*)grad.ptr(blockIdx.y))[x] = make_float2(mag0 * (1.f - ang), mag0 * ang);\r
+ }\r
+ }\r
\r
- float3 dy = make_float3(0.f, 0.f, 0.f);\r
\r
- if (blockIdx.y > 0 && blockIdx.y < height - 1)\r
+ void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2Db& img, \r
+ float angle_scale, DevMem2Df grad, DevMem2Db qangle, bool correct_gamma)\r
{\r
- val = ((const uchar4*)img.ptr(blockIdx.y - 1))[x];\r
- a = make_float3(val.x, val.y, val.z);\r
+ const int nthreads = 256;\r
\r
- val = ((const uchar4*)img.ptr(blockIdx.y + 1))[x];\r
- b = make_float3(val.x, val.y, val.z);\r
+ dim3 bdim(nthreads, 1);\r
+ dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));\r
\r
if (correct_gamma)\r
- dy = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));\r
+ compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);\r
else\r
- dy = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);\r
- }\r
+ compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);\r
\r
- float best_dx = dx.x;\r
- float best_dy = dy.x;\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- float mag0 = dx.x * dx.x + dy.x * dy.x;\r
- float mag1 = dx.y * dx.y + dy.y * dy.y;\r
- if (mag0 < mag1) \r
- {\r
- best_dx = dx.y;\r
- best_dy = dy.y;\r
- mag0 = mag1;\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
}\r
\r
- mag1 = dx.z * dx.z + dy.z * dy.z;\r
- if (mag0 < mag1)\r
+ template <int nthreads, int correct_gamma>\r
+ __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrElemStep img, \r
+ float angle_scale, PtrElemStepf grad, PtrElemStep qangle)\r
{\r
- best_dx = dx.z;\r
- best_dy = dy.z;\r
- mag0 = mag1;\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+\r
+ const unsigned char* row = (const unsigned char*)img.ptr(blockIdx.y);\r
+\r
+ __shared__ float sh_row[nthreads + 2];\r
+\r
+ if (x < width) \r
+ sh_row[threadIdx.x + 1] = row[x]; \r
+ else \r
+ sh_row[threadIdx.x + 1] = row[width - 2];\r
+\r
+ if (threadIdx.x == 0)\r
+ sh_row[0] = row[::max(x - 1, 1)];\r
+\r
+ if (threadIdx.x == blockDim.x - 1)\r
+ sh_row[blockDim.x + 1] = row[::min(x + 1, width - 2)];\r
+\r
+ __syncthreads();\r
+ if (x < width)\r
+ {\r
+ float dx;\r
+\r
+ if (correct_gamma)\r
+ dx = ::sqrtf(sh_row[threadIdx.x + 2]) - ::sqrtf(sh_row[threadIdx.x]);\r
+ else\r
+ dx = sh_row[threadIdx.x + 2] - sh_row[threadIdx.x];\r
+\r
+ float dy = 0.f;\r
+ if (blockIdx.y > 0 && blockIdx.y < height - 1)\r
+ {\r
+ float a = ((const unsigned char*)img.ptr(blockIdx.y + 1))[x];\r
+ float b = ((const unsigned char*)img.ptr(blockIdx.y - 1))[x];\r
+ if (correct_gamma)\r
+ dy = ::sqrtf(a) - ::sqrtf(b);\r
+ else\r
+ dy = a - b;\r
+ }\r
+ float mag = ::sqrtf(dx * dx + dy * dy);\r
+\r
+ float ang = (::atan2f(dy, dx) + CV_PI_F) * angle_scale - 0.5f;\r
+ int hidx = (int)::floorf(ang);\r
+ ang -= hidx;\r
+ hidx = (hidx + cnbins) % cnbins;\r
+\r
+ ((uchar2*)qangle.ptr(blockIdx.y))[x] = make_uchar2(hidx, (hidx + 1) % cnbins);\r
+ ((float2*) grad.ptr(blockIdx.y))[x] = make_float2(mag * (1.f - ang), mag * ang);\r
+ }\r
}\r
\r
- mag0 = ::sqrtf(mag0);\r
-\r
- float ang = (::atan2f(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;\r
- int hidx = (int)::floorf(ang);\r
- ang -= hidx;\r
- hidx = (hidx + cnbins) % cnbins;\r
-\r
- ((uchar2*)qangle.ptr(blockIdx.y))[x] = make_uchar2(hidx, (hidx + 1) % cnbins);\r
- ((float2*)grad.ptr(blockIdx.y))[x] = make_float2(mag0 * (1.f - ang), mag0 * ang);\r
- }\r
-}\r
-\r
-\r
-void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2Db& img, \r
- float angle_scale, DevMem2Df grad, DevMem2Db qangle, bool correct_gamma)\r
-{\r
- const int nthreads = 256;\r
-\r
- dim3 bdim(nthreads, 1);\r
- dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));\r
-\r
- if (correct_gamma)\r
- compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);\r
- else\r
- compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);\r
-\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
-template <int nthreads, int correct_gamma>\r
-__global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrElemStep img, \r
- float angle_scale, PtrElemStepf grad, PtrElemStep qangle)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-\r
- const unsigned char* row = (const unsigned char*)img.ptr(blockIdx.y);\r
-\r
- __shared__ float sh_row[nthreads + 2];\r
-\r
- if (x < width) \r
- sh_row[threadIdx.x + 1] = row[x]; \r
- else \r
- sh_row[threadIdx.x + 1] = row[width - 2];\r
\r
- if (threadIdx.x == 0)\r
- sh_row[0] = row[::max(x - 1, 1)];\r
-\r
- if (threadIdx.x == blockDim.x - 1)\r
- sh_row[blockDim.x + 1] = row[::min(x + 1, width - 2)];\r
-\r
- __syncthreads();\r
- if (x < width)\r
- {\r
- float dx;\r
+ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2Db& img, \r
+ float angle_scale, DevMem2Df grad, DevMem2Db qangle, bool correct_gamma)\r
+ {\r
+ const int nthreads = 256;\r
\r
- if (correct_gamma)\r
- dx = ::sqrtf(sh_row[threadIdx.x + 2]) - ::sqrtf(sh_row[threadIdx.x]);\r
- else\r
- dx = sh_row[threadIdx.x + 2] - sh_row[threadIdx.x];\r
+ dim3 bdim(nthreads, 1);\r
+ dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));\r
\r
- float dy = 0.f;\r
- if (blockIdx.y > 0 && blockIdx.y < height - 1)\r
- {\r
- float a = ((const unsigned char*)img.ptr(blockIdx.y + 1))[x];\r
- float b = ((const unsigned char*)img.ptr(blockIdx.y - 1))[x];\r
if (correct_gamma)\r
- dy = ::sqrtf(a) - ::sqrtf(b);\r
+ compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);\r
else\r
- dy = a - b;\r
- }\r
- float mag = ::sqrtf(dx * dx + dy * dy);\r
-\r
- float ang = (::atan2f(dy, dx) + CV_PI_F) * angle_scale - 0.5f;\r
- int hidx = (int)::floorf(ang);\r
- ang -= hidx;\r
- hidx = (hidx + cnbins) % cnbins;\r
-\r
- ((uchar2*)qangle.ptr(blockIdx.y))[x] = make_uchar2(hidx, (hidx + 1) % cnbins);\r
- ((float2*) grad.ptr(blockIdx.y))[x] = make_float2(mag * (1.f - ang), mag * ang);\r
- }\r
-}\r
-\r
-\r
-void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2Db& img, \r
- float angle_scale, DevMem2Df grad, DevMem2Db qangle, bool correct_gamma)\r
-{\r
- const int nthreads = 256;\r
-\r
- dim3 bdim(nthreads, 1);\r
- dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));\r
-\r
- if (correct_gamma)\r
- compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);\r
- else\r
- compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);\r
+ compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);\r
\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
\r
-//-------------------------------------------------------------------\r
-// Resize\r
\r
-texture<uchar4, 2, cudaReadModeNormalizedFloat> resize8UC4_tex;\r
-texture<uchar, 2, cudaReadModeNormalizedFloat> resize8UC1_tex;\r
+ //-------------------------------------------------------------------\r
+ // Resize\r
\r
-__global__ void resize_for_hog_kernel(float sx, float sy, DevMem2D_<uchar> dst, int colOfs)\r
-{\r
- unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ texture<uchar4, 2, cudaReadModeNormalizedFloat> resize8UC4_tex;\r
+ texture<uchar, 2, cudaReadModeNormalizedFloat> resize8UC1_tex;\r
\r
- if (x < dst.cols && y < dst.rows)\r
- dst.ptr(y)[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255;\r
-}\r
+ __global__ void resize_for_hog_kernel(float sx, float sy, DevMem2D_<uchar> dst, int colOfs)\r
+ {\r
+ unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
-__global__ void resize_for_hog_kernel(float sx, float sy, DevMem2D_<uchar4> dst, int colOfs)\r
-{\r
- unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- if (x < dst.cols && y < dst.rows)\r
- { \r
- float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy);\r
- dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);\r
- }\r
-}\r
-\r
-template<class T, class TEX> \r
-static void resize_for_hog(const DevMem2Db& src, DevMem2Db dst, TEX& tex)\r
-{\r
- tex.filterMode = cudaFilterModeLinear;\r
+ if (x < dst.cols && y < dst.rows)\r
+ dst.ptr(y)[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255;\r
+ }\r
\r
- size_t texOfs = 0;\r
- int colOfs = 0;\r
+ __global__ void resize_for_hog_kernel(float sx, float sy, DevMem2D_<uchar4> dst, int colOfs)\r
+ {\r
+ unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if (x < dst.cols && y < dst.rows)\r
+ { \r
+ float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy);\r
+ dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);\r
+ }\r
+ }\r
\r
- cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>(); \r
- cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );\r
+ template<class T, class TEX> \r
+ static void resize_for_hog(const DevMem2Db& src, DevMem2Db dst, TEX& tex)\r
+ {\r
+ tex.filterMode = cudaFilterModeLinear;\r
\r
- if (texOfs != 0) \r
- {\r
- colOfs = static_cast<int>( texOfs/sizeof(T) );\r
- cudaSafeCall( cudaUnbindTexture(tex) );\r
- cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );\r
- } \r
+ size_t texOfs = 0;\r
+ int colOfs = 0;\r
\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));\r
- \r
- float sx = static_cast<float>(src.cols) / dst.cols;\r
- float sy = static_cast<float>(src.rows) / dst.rows;\r
+ cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>(); \r
+ cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );\r
\r
- resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (DevMem2D_<T>)dst, colOfs);\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (texOfs != 0) \r
+ {\r
+ colOfs = static_cast<int>( texOfs/sizeof(T) );\r
+ cudaSafeCall( cudaUnbindTexture(tex) );\r
+ cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );\r
+ } \r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));\r
+ \r
+ float sx = static_cast<float>(src.cols) / dst.cols;\r
+ float sy = static_cast<float>(src.rows) / dst.rows;\r
\r
- cudaSafeCall( cudaUnbindTexture(tex) );\r
-}\r
+ resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (DevMem2D_<T>)dst, colOfs);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-void resize_8UC1(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }\r
-void resize_8UC4(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
-} // namespace hog \r
+ cudaSafeCall( cudaUnbindTexture(tex) );\r
+ }\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ void resize_8UC1(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }\r
+ void resize_8UC4(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }\r
+ } // namespace hog \r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
#include "opencv2/gpu/device/border_interpolate.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc {\r
-\r
-/////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////\r
-\r
-texture<uchar4, 2> tex_meanshift;\r
-\r
-__device__ short2 do_mean_shift(int x0, int y0, unsigned char* out, \r
- size_t out_step, int cols, int rows, \r
- int sp, int sr, int maxIter, float eps)\r
+namespace cv { namespace gpu { namespace device \r
{\r
- int isr2 = sr*sr;\r
- uchar4 c = tex2D(tex_meanshift, x0, y0 );\r
-\r
- // iterate meanshift procedure\r
- for( int iter = 0; iter < maxIter; iter++ )\r
+ namespace imgproc \r
{\r
- int count = 0;\r
- int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;\r
- float icount;\r
+ /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////\r
\r
- //mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)\r
- int minx = x0-sp;\r
- int miny = y0-sp;\r
- int maxx = x0+sp;\r
- int maxy = y0+sp;\r
+ texture<uchar4, 2> tex_meanshift;\r
\r
- for( int y = miny; y <= maxy; y++)\r
+ __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out, \r
+ size_t out_step, int cols, int rows, \r
+ int sp, int sr, int maxIter, float eps)\r
{\r
- int rowCount = 0;\r
- for( int x = minx; x <= maxx; x++ )\r
- { \r
- uchar4 t = tex2D( tex_meanshift, x, y );\r
-\r
- int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);\r
- if( norm2 <= isr2 )\r
- {\r
- s0 += t.x; s1 += t.y; s2 += t.z;\r
- sx += x; rowCount++;\r
- }\r
- }\r
- count += rowCount;\r
- sy += y*rowCount;\r
- }\r
-\r
- if( count == 0 )\r
- break;\r
+ int isr2 = sr*sr;\r
+ uchar4 c = tex2D(tex_meanshift, x0, y0 );\r
\r
- icount = 1.f/count;\r
- int x1 = __float2int_rz(sx*icount);\r
- int y1 = __float2int_rz(sy*icount);\r
- s0 = __float2int_rz(s0*icount);\r
- s1 = __float2int_rz(s1*icount);\r
- s2 = __float2int_rz(s2*icount);\r
+ // iterate meanshift procedure\r
+ for( int iter = 0; iter < maxIter; iter++ )\r
+ {\r
+ int count = 0;\r
+ int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;\r
+ float icount;\r
\r
- int norm2 = (s0 - c.x) * (s0 - c.x) + (s1 - c.y) * (s1 - c.y) + (s2 - c.z) * (s2 - c.z);\r
+ //mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)\r
+ int minx = x0-sp;\r
+ int miny = y0-sp;\r
+ int maxx = x0+sp;\r
+ int maxy = y0+sp;\r
\r
- bool stopFlag = (x0 == x1 && y0 == y1) || (::abs(x1-x0) + ::abs(y1-y0) + norm2 <= eps);\r
+ for( int y = miny; y <= maxy; y++)\r
+ {\r
+ int rowCount = 0;\r
+ for( int x = minx; x <= maxx; x++ )\r
+ { \r
+ uchar4 t = tex2D( tex_meanshift, x, y );\r
+\r
+ int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);\r
+ if( norm2 <= isr2 )\r
+ {\r
+ s0 += t.x; s1 += t.y; s2 += t.z;\r
+ sx += x; rowCount++;\r
+ }\r
+ }\r
+ count += rowCount;\r
+ sy += y*rowCount;\r
+ }\r
\r
- x0 = x1; y0 = y1;\r
- c.x = s0; c.y = s1; c.z = s2;\r
+ if( count == 0 )\r
+ break;\r
\r
- if( stopFlag )\r
- break;\r
- }\r
+ icount = 1.f/count;\r
+ int x1 = __float2int_rz(sx*icount);\r
+ int y1 = __float2int_rz(sy*icount);\r
+ s0 = __float2int_rz(s0*icount);\r
+ s1 = __float2int_rz(s1*icount);\r
+ s2 = __float2int_rz(s2*icount);\r
\r
- int base = (blockIdx.y * blockDim.y + threadIdx.y) * out_step + (blockIdx.x * blockDim.x + threadIdx.x) * 4 * sizeof(uchar);\r
- *(uchar4*)(out + base) = c;\r
+ int norm2 = (s0 - c.x) * (s0 - c.x) + (s1 - c.y) * (s1 - c.y) + (s2 - c.z) * (s2 - c.z);\r
\r
- return make_short2((short)x0, (short)y0);\r
-}\r
+ bool stopFlag = (x0 == x1 && y0 == y1) || (::abs(x1-x0) + ::abs(y1-y0) + norm2 <= eps);\r
\r
-__global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )\r
-{\r
- int x0 = blockIdx.x * blockDim.x + threadIdx.x;\r
- int y0 = blockIdx.y * blockDim.y + threadIdx.y;\r
+ x0 = x1; y0 = y1;\r
+ c.x = s0; c.y = s1; c.z = s2;\r
\r
- if( x0 < cols && y0 < rows )\r
- do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);\r
-}\r
+ if( stopFlag )\r
+ break;\r
+ }\r
\r
-__global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep, \r
- unsigned char* outsp, size_t outspstep, \r
- int cols, int rows, \r
- int sp, int sr, int maxIter, float eps)\r
-{\r
- int x0 = blockIdx.x * blockDim.x + threadIdx.x;\r
- int y0 = blockIdx.y * blockDim.y + threadIdx.y;\r
+ int base = (blockIdx.y * blockDim.y + threadIdx.y) * out_step + (blockIdx.x * blockDim.x + threadIdx.x) * 4 * sizeof(uchar);\r
+ *(uchar4*)(out + base) = c;\r
\r
- if( x0 < cols && y0 < rows )\r
- { \r
- int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);\r
- *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);\r
- }\r
-}\r
+ return make_short2((short)x0, (short)y0);\r
+ }\r
\r
-void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)\r
-{\r
- dim3 grid(1, 1, 1);\r
- dim3 threads(32, 8, 1);\r
- grid.x = divUp(src.cols, threads.x);\r
- grid.y = divUp(src.rows, threads.y);\r
+ __global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )\r
+ {\r
+ int x0 = blockIdx.x * blockDim.x + threadIdx.x;\r
+ int y0 = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();\r
- cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );\r
+ if( x0 < cols && y0 < rows )\r
+ do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);\r
+ }\r
\r
- meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );\r
- cudaSafeCall( cudaGetLastError() );\r
+ __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep, \r
+ unsigned char* outsp, size_t outspstep, \r
+ int cols, int rows, \r
+ int sp, int sr, int maxIter, float eps)\r
+ {\r
+ int x0 = blockIdx.x * blockDim.x + threadIdx.x;\r
+ int y0 = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ if( x0 < cols && y0 < rows )\r
+ { \r
+ int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);\r
+ *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);\r
+ }\r
+ }\r
\r
- //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); \r
-}\r
+ void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)\r
+ {\r
+ dim3 grid(1, 1, 1);\r
+ dim3 threads(32, 8, 1);\r
+ grid.x = divUp(src.cols, threads.x);\r
+ grid.y = divUp(src.rows, threads.y);\r
\r
-void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream) \r
-{\r
- dim3 grid(1, 1, 1);\r
- dim3 threads(32, 8, 1);\r
- grid.x = divUp(src.cols, threads.x);\r
- grid.y = divUp(src.rows, threads.y);\r
+ cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();\r
+ cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );\r
\r
- cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();\r
- cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );\r
+ meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); \r
+ }\r
\r
- //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); \r
-}\r
+ void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream) \r
+ {\r
+ dim3 grid(1, 1, 1);\r
+ dim3 threads(32, 8, 1);\r
+ grid.x = divUp(src.cols, threads.x);\r
+ grid.y = divUp(src.rows, threads.y);\r
\r
-/////////////////////////////////// drawColorDisp ///////////////////////////////////////////////\r
+ cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();\r
+ cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );\r
\r
-template <typename T>\r
-__device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)\r
-{ \r
- unsigned int H = ((ndisp-d) * 240)/ndisp;\r
+ meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- unsigned int hi = (H/60) % 6;\r
- float f = H/60.f - H/60;\r
- float p = V * (1 - S);\r
- float q = V * (1 - f * S);\r
- float t = V * (1 - (1 - f) * S);\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
- float3 res;\r
- \r
- if (hi == 0) //R = V, G = t, B = p\r
- {\r
- res.x = p;\r
- res.y = t;\r
- res.z = V;\r
- }\r
+ //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); \r
+ }\r
\r
- if (hi == 1) // R = q, G = V, B = p\r
- {\r
- res.x = p;\r
- res.y = V;\r
- res.z = q;\r
- } \r
- \r
- if (hi == 2) // R = p, G = V, B = t\r
- {\r
- res.x = t;\r
- res.y = V;\r
- res.z = p;\r
- }\r
- \r
- if (hi == 3) // R = p, G = q, B = V\r
- {\r
- res.x = V;\r
- res.y = q;\r
- res.z = p;\r
- }\r
+ /////////////////////////////////// drawColorDisp ///////////////////////////////////////////////\r
\r
- if (hi == 4) // R = t, G = p, B = V\r
- {\r
- res.x = V;\r
- res.y = p;\r
- res.z = t;\r
- }\r
+ template <typename T>\r
+ __device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)\r
+ { \r
+ unsigned int H = ((ndisp-d) * 240)/ndisp;\r
\r
- if (hi == 5) // R = V, G = p, B = q\r
- {\r
- res.x = q;\r
- res.y = p;\r
- res.z = V;\r
- }\r
- const unsigned int b = (unsigned int)(::max(0.f, ::min(res.x, 1.f)) * 255.f);\r
- const unsigned int g = (unsigned int)(::max(0.f, ::min(res.y, 1.f)) * 255.f);\r
- const unsigned int r = (unsigned int)(::max(0.f, ::min(res.z, 1.f)) * 255.f);\r
- const unsigned int a = 255U;\r
-\r
- return (a << 24) + (r << 16) + (g << 8) + b; \r
-} \r
-\r
-__global__ void drawColorDisp(uchar* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)\r
-{\r
- const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ unsigned int hi = (H/60) % 6;\r
+ float f = H/60.f - H/60;\r
+ float p = V * (1 - S);\r
+ float q = V * (1 - f * S);\r
+ float t = V * (1 - (1 - f) * S);\r
\r
- if(x < width && y < height) \r
- {\r
- uchar4 d4 = *(uchar4*)(disp + y * disp_step + x);\r
+ float3 res;\r
+ \r
+ if (hi == 0) //R = V, G = t, B = p\r
+ {\r
+ res.x = p;\r
+ res.y = t;\r
+ res.z = V;\r
+ }\r
\r
- uint4 res;\r
- res.x = cvtPixel(d4.x, ndisp);\r
- res.y = cvtPixel(d4.y, ndisp);\r
- res.z = cvtPixel(d4.z, ndisp);\r
- res.w = cvtPixel(d4.w, ndisp);\r
+ if (hi == 1) // R = q, G = V, B = p\r
+ {\r
+ res.x = p;\r
+ res.y = V;\r
+ res.z = q;\r
+ } \r
+ \r
+ if (hi == 2) // R = p, G = V, B = t\r
+ {\r
+ res.x = t;\r
+ res.y = V;\r
+ res.z = p;\r
+ }\r
\r
- uint4* line = (uint4*)(out_image + y * out_step);\r
- line[x >> 2] = res;\r
- }\r
-}\r
-\r
-__global__ void drawColorDisp(short* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)\r
-{\r
- const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ if (hi == 3) // R = p, G = q, B = V\r
+ {\r
+ res.x = V;\r
+ res.y = q;\r
+ res.z = p;\r
+ }\r
\r
- if(x < width && y < height) \r
- {\r
- short2 d2 = *(short2*)(disp + y * disp_step + x);\r
+ if (hi == 4) // R = t, G = p, B = V\r
+ {\r
+ res.x = V;\r
+ res.y = p;\r
+ res.z = t;\r
+ }\r
\r
- uint2 res;\r
- res.x = cvtPixel(d2.x, ndisp); \r
- res.y = cvtPixel(d2.y, ndisp);\r
+ if (hi == 5) // R = V, G = p, B = q\r
+ {\r
+ res.x = q;\r
+ res.y = p;\r
+ res.z = V;\r
+ }\r
+ const unsigned int b = (unsigned int)(::max(0.f, ::min(res.x, 1.f)) * 255.f);\r
+ const unsigned int g = (unsigned int)(::max(0.f, ::min(res.y, 1.f)) * 255.f);\r
+ const unsigned int r = (unsigned int)(::max(0.f, ::min(res.z, 1.f)) * 255.f);\r
+ const unsigned int a = 255U;\r
\r
- uint2* line = (uint2*)(out_image + y * out_step);\r
- line[x >> 1] = res;\r
- }\r
-}\r
+ return (a << 24) + (r << 16) + (g << 8) + b; \r
+ } \r
\r
+ __global__ void drawColorDisp(uchar* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)\r
+ {\r
+ const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
-void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream)\r
-{\r
- dim3 threads(16, 16, 1);\r
- dim3 grid(1, 1, 1);\r
- grid.x = divUp(src.cols, threads.x << 2);\r
- grid.y = divUp(src.rows, threads.y);\r
- \r
- drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() ); \r
-}\r
-\r
-void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
- grid.x = divUp(src.cols, threads.x << 1);\r
- grid.y = divUp(src.rows, threads.y);\r
- \r
- drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);\r
- cudaSafeCall( cudaGetLastError() );\r
- \r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
-/////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////\r
-\r
-__constant__ float cq[16];\r
-\r
-template <typename T>\r
-__global__ void reprojectImageTo3D(const T* disp, size_t disp_step, float* xyzw, size_t xyzw_step, int rows, int cols)\r
-{ \r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- if (y < rows && x < cols)\r
- {\r
+ if(x < width && y < height) \r
+ {\r
+ uchar4 d4 = *(uchar4*)(disp + y * disp_step + x);\r
+\r
+ uint4 res;\r
+ res.x = cvtPixel(d4.x, ndisp);\r
+ res.y = cvtPixel(d4.y, ndisp);\r
+ res.z = cvtPixel(d4.z, ndisp);\r
+ res.w = cvtPixel(d4.w, ndisp);\r
+ \r
+ uint4* line = (uint4*)(out_image + y * out_step);\r
+ line[x >> 2] = res;\r
+ }\r
+ }\r
\r
- float qx = cq[1] * y + cq[3], qy = cq[5] * y + cq[7];\r
- float qz = cq[9] * y + cq[11], qw = cq[13] * y + cq[15];\r
+ __global__ void drawColorDisp(short* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)\r
+ {\r
+ const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- qx += x * cq[0]; \r
- qy += x * cq[4];\r
- qz += x * cq[8];\r
- qw += x * cq[12];\r
+ if(x < width && y < height) \r
+ {\r
+ short2 d2 = *(short2*)(disp + y * disp_step + x);\r
\r
- T d = *(disp + disp_step * y + x);\r
+ uint2 res;\r
+ res.x = cvtPixel(d2.x, ndisp); \r
+ res.y = cvtPixel(d2.y, ndisp);\r
\r
- float iW = 1.f / (qw + cq[14] * d);\r
- float4 v;\r
- v.x = (qx + cq[2] * d) * iW;\r
- v.y = (qy + cq[6] * d) * iW;\r
- v.z = (qz + cq[10] * d) * iW;\r
- v.w = 1.f;\r
+ uint2* line = (uint2*)(out_image + y * out_step);\r
+ line[x >> 1] = res;\r
+ }\r
+ }\r
\r
- *(float4*)(xyzw + xyzw_step * y + (x * 4)) = v;\r
- }\r
-}\r
\r
-template <typename T>\r
-inline void reprojectImageTo3D_caller(const DevMem2D_<T>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
- grid.x = divUp(disp.cols, threads.x);\r
- grid.y = divUp(disp.rows, threads.y);\r
+ void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream)\r
+ {\r
+ dim3 threads(16, 16, 1);\r
+ dim3 grid(1, 1, 1);\r
+ grid.x = divUp(src.cols, threads.x << 2);\r
+ grid.y = divUp(src.rows, threads.y);\r
+ \r
+ drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() ); \r
+ }\r
\r
- cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );\r
+ void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream)\r
+ {\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
+ grid.x = divUp(src.cols, threads.x << 1);\r
+ grid.y = divUp(src.rows, threads.y);\r
+ \r
+ drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);\r
+ cudaSafeCall( cudaGetLastError() );\r
+ \r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.data, disp.step / sizeof(T), xyzw.data, xyzw.step / sizeof(float), disp.rows, disp.cols);\r
- cudaSafeCall( cudaGetLastError() );\r
+ /////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ __constant__ float cq[16];\r
\r
-void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)\r
-{\r
- reprojectImageTo3D_caller(disp, xyzw, q, stream);\r
-}\r
+ template <typename T>\r
+ __global__ void reprojectImageTo3D(const T* disp, size_t disp_step, float* xyzw, size_t xyzw_step, int rows, int cols)\r
+ { \r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
-void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)\r
-{\r
- reprojectImageTo3D_caller(disp, xyzw, q, stream);\r
-}\r
+ if (y < rows && x < cols)\r
+ {\r
\r
-//////////////////////////////////////// Extract Cov Data ////////////////////////////////////////////////\r
+ float qx = cq[1] * y + cq[3], qy = cq[5] * y + cq[7];\r
+ float qz = cq[9] * y + cq[11], qw = cq[13] * y + cq[15];\r
\r
-__global__ void extractCovData_kernel(const int cols, const int rows, const PtrStepf Dx, \r
- const PtrStepf Dy, PtrStepf dst)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ qx += x * cq[0]; \r
+ qy += x * cq[4];\r
+ qz += x * cq[8];\r
+ qw += x * cq[12];\r
\r
- if (x < cols && y < rows)\r
- { \r
- float dx = Dx.ptr(y)[x];\r
- float dy = Dy.ptr(y)[x];\r
+ T d = *(disp + disp_step * y + x);\r
\r
- dst.ptr(y)[x] = dx * dx;\r
- dst.ptr(y + rows)[x] = dx * dy;\r
- dst.ptr(y + (rows << 1))[x] = dy * dy;\r
- }\r
-}\r
+ float iW = 1.f / (qw + cq[14] * d);\r
+ float4 v;\r
+ v.x = (qx + cq[2] * d) * iW;\r
+ v.y = (qy + cq[6] * d) * iW;\r
+ v.z = (qz + cq[10] * d) * iW;\r
+ v.w = 1.f;\r
\r
-void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y));\r
+ *(float4*)(xyzw + xyzw_step * y + (x * 4)) = v;\r
+ }\r
+ }\r
\r
- extractCovData_kernel<<<grid, threads, 0, stream>>>(Dx.cols, Dx.rows, Dx, Dy, dst);\r
- cudaSafeCall( cudaGetLastError() );\r
+ template <typename T>\r
+ inline void reprojectImageTo3D_caller(const DevMem2D_<T>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)\r
+ {\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
+ grid.x = divUp(disp.cols, threads.x);\r
+ grid.y = divUp(disp.rows, threads.y);\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );\r
\r
-/////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////\r
+ reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.data, disp.step / sizeof(T), xyzw.data, xyzw.step / sizeof(float), disp.rows, disp.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-texture<float, 2> harrisDxTex;\r
-texture<float, 2> harrisDyTex;\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-__global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,\r
- PtrStepb dst)\r
-{\r
- const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)\r
+ {\r
+ reprojectImageTo3D_caller(disp, xyzw, q, stream);\r
+ }\r
\r
- if (x < cols && y < rows)\r
- {\r
- float a = 0.f;\r
- float b = 0.f;\r
- float c = 0.f;\r
+ void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)\r
+ {\r
+ reprojectImageTo3D_caller(disp, xyzw, q, stream);\r
+ }\r
\r
- const int ibegin = y - (block_size / 2);\r
- const int jbegin = x - (block_size / 2);\r
- const int iend = ibegin + block_size;\r
- const int jend = jbegin + block_size;\r
+ //////////////////////////////////////// Extract Cov Data ////////////////////////////////////////////////\r
\r
- for (int i = ibegin; i < iend; ++i)\r
+ __global__ void extractCovData_kernel(const int cols, const int rows, const PtrStepf Dx, \r
+ const PtrStepf Dy, PtrStepf dst)\r
{\r
- for (int j = jbegin; j < jend; ++j)\r
- {\r
- float dx = tex2D(harrisDxTex, j, i);\r
- float dy = tex2D(harrisDyTex, j, i);\r
- a += dx * dx;\r
- b += dx * dy;\r
- c += dy * dy;\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if (x < cols && y < rows)\r
+ { \r
+ float dx = Dx.ptr(y)[x];\r
+ float dy = Dy.ptr(y)[x];\r
+\r
+ dst.ptr(y)[x] = dx * dx;\r
+ dst.ptr(y + rows)[x] = dx * dy;\r
+ dst.ptr(y + (rows << 1))[x] = dy * dy;\r
}\r
}\r
\r
- ((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c);\r
- }\r
-}\r
+ void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y));\r
\r
-template <typename BR, typename BC>\r
-__global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,\r
- PtrStepb dst, BR border_row, BC border_col)\r
-{\r
- const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ extractCovData_kernel<<<grid, threads, 0, stream>>>(Dx.cols, Dx.rows, Dx, Dy, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (x < cols && y < rows)\r
- {\r
- float a = 0.f;\r
- float b = 0.f;\r
- float c = 0.f;\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+\r
+ /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////\r
\r
- const int ibegin = y - (block_size / 2);\r
- const int jbegin = x - (block_size / 2);\r
- const int iend = ibegin + block_size;\r
- const int jend = jbegin + block_size;\r
+ texture<float, 2> harrisDxTex;\r
+ texture<float, 2> harrisDyTex;\r
\r
- for (int i = ibegin; i < iend; ++i)\r
+ __global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,\r
+ PtrStepb dst)\r
{\r
- int y = border_col.idx_row(i);\r
- for (int j = jbegin; j < jend; ++j)\r
+ const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if (x < cols && y < rows)\r
{\r
- int x = border_row.idx_col(j);\r
- float dx = tex2D(harrisDxTex, x, y);\r
- float dy = tex2D(harrisDyTex, x, y);\r
- a += dx * dx;\r
- b += dx * dy;\r
- c += dy * dy;\r
+ float a = 0.f;\r
+ float b = 0.f;\r
+ float c = 0.f;\r
+\r
+ const int ibegin = y - (block_size / 2);\r
+ const int jbegin = x - (block_size / 2);\r
+ const int iend = ibegin + block_size;\r
+ const int jend = jbegin + block_size;\r
+\r
+ for (int i = ibegin; i < iend; ++i)\r
+ {\r
+ for (int j = jbegin; j < jend; ++j)\r
+ {\r
+ float dx = tex2D(harrisDxTex, j, i);\r
+ float dy = tex2D(harrisDyTex, j, i);\r
+ a += dx * dx;\r
+ b += dx * dy;\r
+ c += dy * dy;\r
+ }\r
+ }\r
+\r
+ ((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c);\r
}\r
}\r
\r
- ((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c);\r
- }\r
-}\r
+ template <typename BR, typename BC>\r
+ __global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,\r
+ PtrStepb dst, BR border_row, BC border_col)\r
+ {\r
+ const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
-void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, \r
- int border_type, cudaStream_t stream)\r
-{\r
- const int rows = Dx.rows;\r
- const int cols = Dx.cols;\r
+ if (x < cols && y < rows)\r
+ {\r
+ float a = 0.f;\r
+ float b = 0.f;\r
+ float c = 0.f;\r
\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+ const int ibegin = y - (block_size / 2);\r
+ const int jbegin = x - (block_size / 2);\r
+ const int iend = ibegin + block_size;\r
+ const int jend = jbegin + block_size;\r
\r
- cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();\r
- cudaBindTexture2D(0, harrisDxTex, Dx.data, desc, Dx.cols, Dx.rows, Dx.step);\r
- cudaBindTexture2D(0, harrisDyTex, Dy.data, desc, Dy.cols, Dy.rows, Dy.step);\r
- harrisDxTex.filterMode = cudaFilterModePoint;\r
- harrisDyTex.filterMode = cudaFilterModePoint;\r
+ for (int i = ibegin; i < iend; ++i)\r
+ {\r
+ int y = border_col.idx_row(i);\r
+ for (int j = jbegin; j < jend; ++j)\r
+ {\r
+ int x = border_row.idx_col(j);\r
+ float dx = tex2D(harrisDxTex, x, y);\r
+ float dy = tex2D(harrisDyTex, x, y);\r
+ a += dx * dx;\r
+ b += dx * dy;\r
+ c += dy * dy;\r
+ }\r
+ }\r
\r
- switch (border_type) \r
- {\r
- case BORDER_REFLECT101_GPU:\r
- cornerHarris_kernel<<<grid, threads, 0, stream>>>(\r
- cols, rows, block_size, k, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));\r
- break;\r
- case BORDER_REPLICATE_GPU:\r
- harrisDxTex.addressMode[0] = cudaAddressModeClamp;\r
- harrisDxTex.addressMode[1] = cudaAddressModeClamp;\r
- harrisDyTex.addressMode[0] = cudaAddressModeClamp;\r
- harrisDyTex.addressMode[1] = cudaAddressModeClamp;\r
+ ((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c);\r
+ }\r
+ }\r
\r
- cornerHarris_kernel<<<grid, threads, 0, stream>>>(cols, rows, block_size, k, dst);\r
- break;\r
- }\r
+ void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, \r
+ int border_type, cudaStream_t stream)\r
+ {\r
+ const int rows = Dx.rows;\r
+ const int cols = Dx.cols;\r
\r
- cudaSafeCall( cudaGetLastError() );\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();\r
+ cudaBindTexture2D(0, harrisDxTex, Dx.data, desc, Dx.cols, Dx.rows, Dx.step);\r
+ cudaBindTexture2D(0, harrisDyTex, Dy.data, desc, Dy.cols, Dy.rows, Dy.step);\r
+ harrisDxTex.filterMode = cudaFilterModePoint;\r
+ harrisDyTex.filterMode = cudaFilterModePoint;\r
\r
- //cudaSafeCall(cudaUnbindTexture(harrisDxTex));\r
- //cudaSafeCall(cudaUnbindTexture(harrisDyTex));\r
-}\r
+ switch (border_type) \r
+ {\r
+ case BORDER_REFLECT101_GPU:\r
+ cornerHarris_kernel<<<grid, threads, 0, stream>>>(\r
+ cols, rows, block_size, k, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));\r
+ break;\r
+ case BORDER_REPLICATE_GPU:\r
+ harrisDxTex.addressMode[0] = cudaAddressModeClamp;\r
+ harrisDxTex.addressMode[1] = cudaAddressModeClamp;\r
+ harrisDyTex.addressMode[0] = cudaAddressModeClamp;\r
+ harrisDyTex.addressMode[1] = cudaAddressModeClamp;\r
+\r
+ cornerHarris_kernel<<<grid, threads, 0, stream>>>(cols, rows, block_size, k, dst);\r
+ break;\r
+ }\r
\r
-/////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-texture<float, 2> minEigenValDxTex;\r
-texture<float, 2> minEigenValDyTex;\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
-__global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size, \r
- PtrStepb dst)\r
-{\r
- const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ //cudaSafeCall(cudaUnbindTexture(harrisDxTex));\r
+ //cudaSafeCall(cudaUnbindTexture(harrisDyTex));\r
+ }\r
\r
- if (x < cols && y < rows)\r
- {\r
- float a = 0.f;\r
- float b = 0.f;\r
- float c = 0.f;\r
+ /////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////\r
\r
- const int ibegin = y - (block_size / 2);\r
- const int jbegin = x - (block_size / 2);\r
- const int iend = ibegin + block_size;\r
- const int jend = jbegin + block_size;\r
+ texture<float, 2> minEigenValDxTex;\r
+ texture<float, 2> minEigenValDyTex;\r
\r
- for (int i = ibegin; i < iend; ++i)\r
+ __global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size, \r
+ PtrStepb dst)\r
{\r
- for (int j = jbegin; j < jend; ++j)\r
+ const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if (x < cols && y < rows)\r
{\r
- float dx = tex2D(minEigenValDxTex, j, i);\r
- float dy = tex2D(minEigenValDyTex, j, i);\r
- a += dx * dx;\r
- b += dx * dy;\r
- c += dy * dy;\r
+ float a = 0.f;\r
+ float b = 0.f;\r
+ float c = 0.f;\r
+\r
+ const int ibegin = y - (block_size / 2);\r
+ const int jbegin = x - (block_size / 2);\r
+ const int iend = ibegin + block_size;\r
+ const int jend = jbegin + block_size;\r
+\r
+ for (int i = ibegin; i < iend; ++i)\r
+ {\r
+ for (int j = jbegin; j < jend; ++j)\r
+ {\r
+ float dx = tex2D(minEigenValDxTex, j, i);\r
+ float dy = tex2D(minEigenValDyTex, j, i);\r
+ a += dx * dx;\r
+ b += dx * dy;\r
+ c += dy * dy;\r
+ }\r
+ }\r
+\r
+ a *= 0.5f;\r
+ c *= 0.5f;\r
+ ((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b);\r
}\r
}\r
\r
- a *= 0.5f;\r
- c *= 0.5f;\r
- ((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b);\r
- }\r
-}\r
\r
+ template <typename BR, typename BC>\r
+ __global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size, \r
+ PtrStepb dst, BR border_row, BC border_col)\r
+ {\r
+ const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
-template <typename BR, typename BC>\r
-__global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size, \r
- PtrStepb dst, BR border_row, BC border_col)\r
-{\r
- const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ if (x < cols && y < rows)\r
+ {\r
+ float a = 0.f;\r
+ float b = 0.f;\r
+ float c = 0.f;\r
\r
- if (x < cols && y < rows)\r
- {\r
- float a = 0.f;\r
- float b = 0.f;\r
- float c = 0.f;\r
+ const int ibegin = y - (block_size / 2);\r
+ const int jbegin = x - (block_size / 2);\r
+ const int iend = ibegin + block_size;\r
+ const int jend = jbegin + block_size;\r
\r
- const int ibegin = y - (block_size / 2);\r
- const int jbegin = x - (block_size / 2);\r
- const int iend = ibegin + block_size;\r
- const int jend = jbegin + block_size;\r
+ for (int i = ibegin; i < iend; ++i)\r
+ {\r
+ int y = border_col.idx_row(i);\r
+ for (int j = jbegin; j < jend; ++j)\r
+ {\r
+ int x = border_row.idx_col(j);\r
+ float dx = tex2D(minEigenValDxTex, x, y);\r
+ float dy = tex2D(minEigenValDyTex, x, y);\r
+ a += dx * dx;\r
+ b += dx * dy;\r
+ c += dy * dy;\r
+ }\r
+ }\r
\r
- for (int i = ibegin; i < iend; ++i)\r
- {\r
- int y = border_col.idx_row(i);\r
- for (int j = jbegin; j < jend; ++j)\r
- {\r
- int x = border_row.idx_col(j);\r
- float dx = tex2D(minEigenValDxTex, x, y);\r
- float dy = tex2D(minEigenValDyTex, x, y);\r
- a += dx * dx;\r
- b += dx * dy;\r
- c += dy * dy;\r
+ a *= 0.5f;\r
+ c *= 0.5f;\r
+ ((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b);\r
}\r
}\r
\r
- a *= 0.5f;\r
- c *= 0.5f;\r
- ((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b);\r
- }\r
-}\r
+ void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst,\r
+ int border_type, cudaStream_t stream)\r
+ {\r
+ const int rows = Dx.rows;\r
+ const int cols = Dx.cols;\r
\r
-void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst,\r
- int border_type, cudaStream_t stream)\r
-{\r
- const int rows = Dx.rows;\r
- const int cols = Dx.cols;\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+ cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();\r
+ cudaBindTexture2D(0, minEigenValDxTex, Dx.data, desc, Dx.cols, Dx.rows, Dx.step);\r
+ cudaBindTexture2D(0, minEigenValDyTex, Dy.data, desc, Dy.cols, Dy.rows, Dy.step);\r
+ minEigenValDxTex.filterMode = cudaFilterModePoint;\r
+ minEigenValDyTex.filterMode = cudaFilterModePoint;\r
\r
- cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();\r
- cudaBindTexture2D(0, minEigenValDxTex, Dx.data, desc, Dx.cols, Dx.rows, Dx.step);\r
- cudaBindTexture2D(0, minEigenValDyTex, Dy.data, desc, Dy.cols, Dy.rows, Dy.step);\r
- minEigenValDxTex.filterMode = cudaFilterModePoint;\r
- minEigenValDyTex.filterMode = cudaFilterModePoint;\r
+ switch (border_type)\r
+ {\r
+ case BORDER_REFLECT101_GPU:\r
+ cornerMinEigenVal_kernel<<<grid, threads, 0, stream>>>(\r
+ cols, rows, block_size, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));\r
+ break;\r
+ case BORDER_REPLICATE_GPU:\r
+ minEigenValDxTex.addressMode[0] = cudaAddressModeClamp;\r
+ minEigenValDxTex.addressMode[1] = cudaAddressModeClamp;\r
+ minEigenValDyTex.addressMode[0] = cudaAddressModeClamp;\r
+ minEigenValDyTex.addressMode[1] = cudaAddressModeClamp;\r
+\r
+ cornerMinEigenVal_kernel<<<grid, threads, 0, stream>>>(cols, rows, block_size, dst);\r
+ break;\r
+ }\r
\r
- switch (border_type)\r
- {\r
- case BORDER_REFLECT101_GPU:\r
- cornerMinEigenVal_kernel<<<grid, threads, 0, stream>>>(\r
- cols, rows, block_size, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));\r
- break;\r
- case BORDER_REPLICATE_GPU:\r
- minEigenValDxTex.addressMode[0] = cudaAddressModeClamp;\r
- minEigenValDxTex.addressMode[1] = cudaAddressModeClamp;\r
- minEigenValDyTex.addressMode[0] = cudaAddressModeClamp;\r
- minEigenValDyTex.addressMode[1] = cudaAddressModeClamp;\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- cornerMinEigenVal_kernel<<<grid, threads, 0, stream>>>(cols, rows, block_size, dst);\r
- break;\r
- }\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaDeviceSynchronize());\r
\r
- cudaSafeCall( cudaGetLastError() );\r
+ //cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));\r
+ //cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));\r
+ }\r
\r
- if (stream == 0)\r
- cudaSafeCall(cudaDeviceSynchronize());\r
+ ////////////////////////////// Column Sum //////////////////////////////////////\r
\r
- //cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));\r
- //cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));\r
-}\r
+ __global__ void column_sumKernel_32F(int cols, int rows, const PtrStepb src, const PtrStepb dst)\r
+ {\r
+ int x = blockIdx.x * blockDim.x + threadIdx.x;\r
\r
-////////////////////////////// Column Sum //////////////////////////////////////\r
+ if (x < cols)\r
+ {\r
+ const unsigned char* src_data = src.data + x * sizeof(float);\r
+ unsigned char* dst_data = dst.data + x * sizeof(float);\r
\r
-__global__ void column_sumKernel_32F(int cols, int rows, const PtrStepb src, const PtrStepb dst)\r
-{\r
- int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ float sum = 0.f;\r
+ for (int y = 0; y < rows; ++y)\r
+ {\r
+ sum += *(const float*)src_data;\r
+ *(float*)dst_data = sum;\r
+ src_data += src.step;\r
+ dst_data += dst.step;\r
+ }\r
+ }\r
+ }\r
\r
- if (x < cols)\r
- {\r
- const unsigned char* src_data = src.data + x * sizeof(float);\r
- unsigned char* dst_data = dst.data + x * sizeof(float);\r
\r
- float sum = 0.f;\r
- for (int y = 0; y < rows; ++y)\r
+ void columnSum_32F(const DevMem2Db src, const DevMem2Db dst)\r
{\r
- sum += *(const float*)src_data;\r
- *(float*)dst_data = sum;\r
- src_data += src.step;\r
- dst_data += dst.step;\r
- }\r
- }\r
-}\r
+ dim3 threads(256);\r
+ dim3 grid(divUp(src.cols, threads.x));\r
\r
+ column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-void columnSum_32F(const DevMem2Db src, const DevMem2Db dst)\r
-{\r
- dim3 threads(256);\r
- dim3 grid(divUp(src.cols, threads.x));\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);\r
- cudaSafeCall( cudaGetLastError() );\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // mulSpectrums\r
\r
+ __global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x; \r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y; \r
\r
-//////////////////////////////////////////////////////////////////////////\r
-// mulSpectrums\r
+ if (x < c.cols && y < c.rows) \r
+ {\r
+ c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);\r
+ }\r
+ }\r
\r
-__global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x; \r
- const int y = blockIdx.y * blockDim.y + threadIdx.y; \r
\r
- if (x < c.cols && y < c.rows) \r
- {\r
- c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);\r
- }\r
-}\r
+ void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
+ {\r
+ dim3 threads(256);\r
+ dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
\r
+ mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
-{\r
- dim3 threads(256);\r
- dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);\r
- cudaSafeCall( cudaGetLastError() );\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // mulSpectrums_CONJ\r
\r
+ __global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x; \r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y; \r
\r
-//////////////////////////////////////////////////////////////////////////\r
-// mulSpectrums_CONJ\r
+ if (x < c.cols && y < c.rows) \r
+ {\r
+ c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));\r
+ }\r
+ }\r
\r
-__global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x; \r
- const int y = blockIdx.y * blockDim.y + threadIdx.y; \r
\r
- if (x < c.cols && y < c.rows) \r
- {\r
- c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));\r
- }\r
-}\r
+ void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
+ {\r
+ dim3 threads(256);\r
+ dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
\r
+ mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
-{\r
- dim3 threads(256);\r
- dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);\r
- cudaSafeCall( cudaGetLastError() );\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // mulAndScaleSpectrums\r
\r
+ __global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
-//////////////////////////////////////////////////////////////////////////\r
-// mulAndScaleSpectrums\r
+ if (x < c.cols && y < c.rows) \r
+ {\r
+ cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);\r
+ c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);\r
+ }\r
+ }\r
\r
-__global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- if (x < c.cols && y < c.rows) \r
- {\r
- cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);\r
- c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);\r
- }\r
-}\r
+ void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
+ {\r
+ dim3 threads(256);\r
+ dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
\r
+ mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
-{\r
- dim3 threads(256);\r
- dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
-\r
- mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (stream)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- if (stream)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // mulAndScaleSpectrums_CONJ\r
\r
-//////////////////////////////////////////////////////////////////////////\r
-// mulAndScaleSpectrums_CONJ\r
+ __global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
-__global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ if (x < c.cols && y < c.rows) \r
+ {\r
+ cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));\r
+ c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);\r
+ }\r
+ }\r
\r
- if (x < c.cols && y < c.rows) \r
- {\r
- cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));\r
- c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);\r
- }\r
-}\r
\r
+ void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
+ {\r
+ dim3 threads(256);\r
+ dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
\r
-void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
-{\r
- dim3 threads(256);\r
- dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
+ mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ } \r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-} \r
+ //////////////////////////////////////////////////////////////////////////\r
+ // buildWarpMaps\r
\r
-//////////////////////////////////////////////////////////////////////////\r
-// buildWarpMaps\r
+ // TODO use intrinsics like __sinf and so on\r
\r
-// TODO use intrinsics like __sinf and so on\r
+ namespace build_warp_maps\r
+ {\r
\r
-namespace build_warp_maps\r
-{\r
+ __constant__ float ck_rinv[9];\r
+ __constant__ float cr_kinv[9];\r
+ __constant__ float ct[3];\r
+ __constant__ float cscale;\r
+ }\r
\r
- __constant__ float ck_rinv[9];\r
- __constant__ float cr_kinv[9];\r
- __constant__ float ct[3];\r
- __constant__ float cscale;\r
-}\r
\r
+ class PlaneMapper\r
+ {\r
+ public:\r
+ static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)\r
+ {\r
+ using namespace build_warp_maps;\r
\r
-class PlaneMapper\r
-{\r
-public:\r
- static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)\r
- {\r
- using namespace build_warp_maps;\r
+ float x_ = u / cscale - ct[0];\r
+ float y_ = v / cscale - ct[1];\r
\r
- float x_ = u / cscale - ct[0];\r
- float y_ = v / cscale - ct[1];\r
+ float z;\r
+ x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);\r
+ y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);\r
+ z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);\r
\r
- float z;\r
- x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);\r
- y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);\r
- z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);\r
+ x /= z;\r
+ y /= z;\r
+ }\r
+ };\r
\r
- x /= z;\r
- y /= z;\r
- }\r
-};\r
\r
+ class CylindricalMapper\r
+ {\r
+ public:\r
+ static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)\r
+ {\r
+ using namespace build_warp_maps;\r
\r
-class CylindricalMapper\r
-{\r
-public:\r
- static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)\r
- {\r
- using namespace build_warp_maps;\r
+ u /= cscale;\r
+ float x_ = ::sinf(u);\r
+ float y_ = v / cscale;\r
+ float z_ = ::cosf(u);\r
\r
- u /= cscale;\r
- float x_ = ::sinf(u);\r
- float y_ = v / cscale;\r
- float z_ = ::cosf(u);\r
+ float z;\r
+ x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;\r
+ y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;\r
+ z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;\r
\r
- float z;\r
- x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;\r
- y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;\r
- z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;\r
+ if (z > 0) { x /= z; y /= z; }\r
+ else x = y = -1;\r
+ }\r
+ };\r
\r
- if (z > 0) { x /= z; y /= z; }\r
- else x = y = -1;\r
- }\r
-};\r
\r
+ class SphericalMapper\r
+ {\r
+ public:\r
+ static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)\r
+ {\r
+ using namespace build_warp_maps;\r
\r
-class SphericalMapper\r
-{\r
-public:\r
- static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)\r
- {\r
- using namespace build_warp_maps;\r
+ v /= cscale;\r
+ u /= cscale;\r
\r
- v /= cscale;\r
- u /= cscale;\r
+ float sinv = ::sinf(v);\r
+ float x_ = sinv * ::sinf(u);\r
+ float y_ = -::cosf(v);\r
+ float z_ = sinv * ::cosf(u);\r
\r
- float sinv = ::sinf(v);\r
- float x_ = sinv * ::sinf(u);\r
- float y_ = -::cosf(v);\r
- float z_ = sinv * ::cosf(u);\r
+ float z;\r
+ x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;\r
+ y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;\r
+ z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;\r
\r
- float z;\r
- x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;\r
- y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;\r
- z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;\r
+ if (z > 0) { x /= z; y /= z; }\r
+ else x = y = -1;\r
+ }\r
+ };\r
\r
- if (z > 0) { x /= z; y /= z; }\r
- else x = y = -1;\r
- }\r
-};\r
\r
+ template <typename Mapper>\r
+ __global__ void buildWarpMapsKernel(int tl_u, int tl_v, int cols, int rows,\r
+ PtrStepf map_x, PtrStepf map_y)\r
+ {\r
+ int du = blockIdx.x * blockDim.x + threadIdx.x;\r
+ int dv = blockIdx.y * blockDim.y + threadIdx.y;\r
+ if (du < cols && dv < rows)\r
+ {\r
+ float u = tl_u + du;\r
+ float v = tl_v + dv;\r
+ float x, y;\r
+ Mapper::mapBackward(u, v, x, y);\r
+ map_x.ptr(dv)[du] = x;\r
+ map_y.ptr(dv)[du] = y;\r
+ }\r
+ }\r
\r
-template <typename Mapper>\r
-__global__ void buildWarpMapsKernel(int tl_u, int tl_v, int cols, int rows,\r
- PtrStepf map_x, PtrStepf map_y)\r
-{\r
- int du = blockIdx.x * blockDim.x + threadIdx.x;\r
- int dv = blockIdx.y * blockDim.y + threadIdx.y;\r
- if (du < cols && dv < rows)\r
- {\r
- float u = tl_u + du;\r
- float v = tl_v + dv;\r
- float x, y;\r
- Mapper::mapBackward(u, v, x, y);\r
- map_x.ptr(dv)[du] = x;\r
- map_y.ptr(dv)[du] = y;\r
- }\r
-}\r
-\r
-\r
-void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
- const float k_rinv[9], const float r_kinv[9], const float t[3], \r
- float scale, cudaStream_t stream)\r
-{\r
- cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));\r
- cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));\r
- cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ct, t, 3*sizeof(float)));\r
- cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));\r
\r
- int cols = map_x.cols;\r
- int rows = map_x.rows;\r
+ void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
+ const float k_rinv[9], const float r_kinv[9], const float t[3], \r
+ float scale, cudaStream_t stream)\r
+ {\r
+ cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));\r
+ cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));\r
+ cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ct, t, 3*sizeof(float)));\r
+ cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));\r
\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+ int cols = map_x.cols;\r
+ int rows = map_x.rows;\r
\r
- buildWarpMapsKernel<PlaneMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);\r
- cudaSafeCall(cudaGetLastError());\r
- if (stream == 0)\r
- cudaSafeCall(cudaDeviceSynchronize());\r
-}\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
\r
+ buildWarpMapsKernel<PlaneMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);\r
+ cudaSafeCall(cudaGetLastError());\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaDeviceSynchronize());\r
+ }\r
\r
-void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
- const float k_rinv[9], const float r_kinv[9], float scale,\r
- cudaStream_t stream)\r
-{\r
- cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));\r
- cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));\r
- cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));\r
\r
- int cols = map_x.cols;\r
- int rows = map_x.rows;\r
+ void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
+ const float k_rinv[9], const float r_kinv[9], float scale,\r
+ cudaStream_t stream)\r
+ {\r
+ cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));\r
+ cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));\r
+ cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));\r
\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+ int cols = map_x.cols;\r
+ int rows = map_x.rows;\r
\r
- buildWarpMapsKernel<CylindricalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);\r
- cudaSafeCall(cudaGetLastError());\r
- if (stream == 0)\r
- cudaSafeCall(cudaDeviceSynchronize());\r
-}\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
\r
+ buildWarpMapsKernel<CylindricalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);\r
+ cudaSafeCall(cudaGetLastError());\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaDeviceSynchronize());\r
+ }\r
\r
-void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
- const float k_rinv[9], const float r_kinv[9], float scale,\r
- cudaStream_t stream)\r
-{\r
- cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));\r
- cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));\r
- cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));\r
\r
- int cols = map_x.cols;\r
- int rows = map_x.rows;\r
+ void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
+ const float k_rinv[9], const float r_kinv[9], float scale,\r
+ cudaStream_t stream)\r
+ {\r
+ cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));\r
+ cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));\r
+ cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));\r
\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+ int cols = map_x.cols;\r
+ int rows = map_x.rows;\r
\r
- buildWarpMapsKernel<SphericalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);\r
- cudaSafeCall(cudaGetLastError());\r
- if (stream == 0)\r
- cudaSafeCall(cudaDeviceSynchronize());\r
-}\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
\r
+ buildWarpMapsKernel<SphericalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);\r
+ cudaSafeCall(cudaGetLastError());\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaDeviceSynchronize());\r
+ }\r
\r
-//////////////////////////////////////////////////////////////////////////\r
-// convolve\r
\r
-#define CONVOLVE_MAX_KERNEL_SIZE 17\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // convolve\r
\r
-__constant__ float c_convolveKernel[CONVOLVE_MAX_KERNEL_SIZE * CONVOLVE_MAX_KERNEL_SIZE];\r
+ #define CONVOLVE_MAX_KERNEL_SIZE 17\r
\r
-__global__ void convolve(const DevMem2Df src, PtrStepf dst, int kWidth, int kHeight)\r
-{\r
- __shared__ float smem[16 + 2 * 8][16 + 2 * 8];\r
-\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- // x | x 0 | 0\r
- // -----------\r
- // x | x 0 | 0\r
- // 0 | 0 0 | 0\r
- // -----------\r
- // 0 | 0 0 | 0\r
- smem[threadIdx.y][threadIdx.x] = src.ptr(::min(::max(y - 8, 0), src.rows - 1))[::min(::max(x - 8, 0), src.cols - 1)];\r
-\r
- // 0 | 0 x | x\r
- // -----------\r
- // 0 | 0 x | x\r
- // 0 | 0 0 | 0\r
- // -----------\r
- // 0 | 0 0 | 0\r
- smem[threadIdx.y][threadIdx.x + 16] = src.ptr(::min(::max(y - 8, 0), src.rows - 1))[::min(x + 8, src.cols - 1)];\r
-\r
- // 0 | 0 0 | 0\r
- // -----------\r
- // 0 | 0 0 | 0\r
- // x | x 0 | 0\r
- // -----------\r
- // x | x 0 | 0\r
- smem[threadIdx.y + 16][threadIdx.x] = src.ptr(::min(y + 8, src.rows - 1))[::min(::max(x - 8, 0), src.cols - 1)];\r
-\r
- // 0 | 0 0 | 0\r
- // -----------\r
- // 0 | 0 0 | 0\r
- // 0 | 0 x | x\r
- // -----------\r
- // 0 | 0 x | x\r
- smem[threadIdx.y + 16][threadIdx.x + 16] = src.ptr(::min(y + 8, src.rows - 1))[::min(x + 8, src.cols - 1)];\r
-\r
- __syncthreads();\r
-\r
- if (x < src.cols && y < src.rows)\r
- {\r
- float res = 0;\r
+ __constant__ float c_convolveKernel[CONVOLVE_MAX_KERNEL_SIZE * CONVOLVE_MAX_KERNEL_SIZE];\r
\r
- for (int i = 0; i < kHeight; ++i)\r
+ __global__ void convolve(const DevMem2Df src, PtrStepf dst, int kWidth, int kHeight)\r
{\r
- for (int j = 0; j < kWidth; ++j)\r
+ __shared__ float smem[16 + 2 * 8][16 + 2 * 8];\r
+\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ // x | x 0 | 0\r
+ // -----------\r
+ // x | x 0 | 0\r
+ // 0 | 0 0 | 0\r
+ // -----------\r
+ // 0 | 0 0 | 0\r
+ smem[threadIdx.y][threadIdx.x] = src.ptr(::min(::max(y - 8, 0), src.rows - 1))[::min(::max(x - 8, 0), src.cols - 1)];\r
+\r
+ // 0 | 0 x | x\r
+ // -----------\r
+ // 0 | 0 x | x\r
+ // 0 | 0 0 | 0\r
+ // -----------\r
+ // 0 | 0 0 | 0\r
+ smem[threadIdx.y][threadIdx.x + 16] = src.ptr(::min(::max(y - 8, 0), src.rows - 1))[::min(x + 8, src.cols - 1)];\r
+\r
+ // 0 | 0 0 | 0\r
+ // -----------\r
+ // 0 | 0 0 | 0\r
+ // x | x 0 | 0\r
+ // -----------\r
+ // x | x 0 | 0\r
+ smem[threadIdx.y + 16][threadIdx.x] = src.ptr(::min(y + 8, src.rows - 1))[::min(::max(x - 8, 0), src.cols - 1)];\r
+\r
+ // 0 | 0 0 | 0\r
+ // -----------\r
+ // 0 | 0 0 | 0\r
+ // 0 | 0 x | x\r
+ // -----------\r
+ // 0 | 0 x | x\r
+ smem[threadIdx.y + 16][threadIdx.x + 16] = src.ptr(::min(y + 8, src.rows - 1))[::min(x + 8, src.cols - 1)];\r
+\r
+ __syncthreads();\r
+\r
+ if (x < src.cols && y < src.rows)\r
{\r
- res += smem[threadIdx.y + 8 - kHeight / 2 + i][threadIdx.x + 8 - kWidth / 2 + j] * c_convolveKernel[i * kWidth + j];\r
- }\r
- }\r
-\r
- dst.ptr(y)[x] = res;\r
- }\r
-}\r
+ float res = 0;\r
\r
-void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream)\r
-{\r
- cudaSafeCall(cudaMemcpyToSymbol(c_convolveKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );\r
+ for (int i = 0; i < kHeight; ++i)\r
+ {\r
+ for (int j = 0; j < kWidth; ++j)\r
+ {\r
+ res += smem[threadIdx.y + 8 - kHeight / 2 + i][threadIdx.x + 8 - kWidth / 2 + j] * c_convolveKernel[i * kWidth + j];\r
+ }\r
+ }\r
\r
- const dim3 block(16, 16);\r
- const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));\r
+ dst.ptr(y)[x] = res;\r
+ }\r
+ }\r
\r
- convolve<<<grid, block, 0, stream>>>(src, dst, kWidth, kHeight);\r
- cudaSafeCall(cudaGetLastError());\r
+ void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream)\r
+ {\r
+ cudaSafeCall(cudaMemcpyToSymbol(c_convolveKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );\r
\r
- if (stream == 0)\r
- cudaSafeCall(cudaDeviceSynchronize());\r
-}\r
+ const dim3 block(16, 16);\r
+ const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));\r
\r
-} // namespace imgproc\r
+ convolve<<<grid, block, 0, stream>>>(src, dst, kWidth, kHeight);\r
+ cudaSafeCall(cudaGetLastError());\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaDeviceSynchronize());\r
+ }\r
+ } // namespace imgproc\r
+}}} // namespace cv { namespace gpu { namespace device {\r
#include "safe_call.hpp"\r
\r
#ifndef CV_PI\r
-#define CV_PI 3.1415926535897932384626433832795f\r
+#define CV_PI 3.1415926535897932384626433832795\r
#endif\r
\r
#ifndef CV_PI_F\r
#endif\r
#endif\r
\r
-#define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device { \r
-#define END_OPENCV_DEVICE_NAMESPACE }}}\r
-#define OPENCV_DEVICE_NAMESPACE ::cv::gpu::device\r
-#define OPENCV_DEVICE_NAMESPACE_ ::cv::gpu::device:: \r
-\r
#ifdef __CUDACC__\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-typedef unsigned char uchar;\r
-typedef unsigned short ushort;\r
-typedef signed char schar;\r
-typedef unsigned int uint;\r
-\r
-template<class T> static inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)\r
+namespace cv { namespace gpu { namespace device \r
{\r
- cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();\r
- cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );\r
-}\r
+ typedef unsigned char uchar;\r
+ typedef unsigned short ushort;\r
+ typedef signed char schar;\r
+ typedef unsigned int uint;\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template<class T> static inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)\r
+ {\r
+ cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();\r
+ cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );\r
+ }\r
+}}}\r
\r
#endif\r
\r
\r
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }\r
\r
- /*template<class T> static inline void uploadConstant(const char* name, const T& value) \r
- { \r
- cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); \r
- }\r
-\r
- template<class T> static inline void uploadConstant(const char* name, const T& value, cudaStream_t stream) \r
- {\r
- cudaSafeCall( cudaMemcpyToSymbolAsync(name, &value, sizeof(T), 0, cudaMemcpyHostToDevice, stream) ); \r
- } */ \r
-\r
- //template<class T> static inline void bindTexture(const char* name, const DevMem2D_<T>& img)\r
- //{ \r
- // //!!!! const_cast is disabled!\r
- // //!!!! Please use constructor of 'class texture' instead.\r
- //\r
- // //textureReference* tex; \r
- // //cudaSafeCall( cudaGetTextureReference((const textureReference**)&tex, name) ); \r
- // //tex->normalized = normalized;\r
- // //tex->filterMode = filterMode;\r
- // //tex->addressMode[0] = addrMode;\r
- // //tex->addressMode[1] = addrMode;\r
- // \r
- // const textureReference* tex; \r
- // cudaSafeCall( cudaGetTextureReference(&tex, name) ); \r
- //\r
- // cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();\r
- // cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );\r
- //}\r
-\r
- //static inline void unbindTexture(const char *name)\r
- //{\r
- // const textureReference* tex; \r
- // cudaSafeCall( cudaGetTextureReference(&tex, name) ); \r
- // cudaSafeCall( cudaUnbindTexture(tex) );\r
- //}\r
-\r
- \r
-\r
- //class TextureBinder\r
- //{\r
- //public:\r
- // TextureBinder() : tex_(0) {}\r
- // template <typename T> TextureBinder(const textureReference* tex, const DevMem2D_<T>& img) : tex_(0)\r
- // {\r
- // bind(tex, img);\r
- // }\r
- // template <typename T> TextureBinder(const char* tex_name, const DevMem2D_<T>& img) : tex_(0)\r
- // {\r
- // bind(tex_name, img);\r
- // }\r
- // ~TextureBinder() { unbind(); }\r
- //\r
- // template <typename T> void bind(const textureReference* tex, const DevMem2D_<T>& img)\r
- // {\r
- // unbind();\r
- //\r
- // cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();\r
- // cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );\r
- //\r
- // tex_ = tex;\r
- // }\r
- // template <typename T> void bind(const char* tex_name, const DevMem2D_<T>& img)\r
- // {\r
- // const textureReference* tex; \r
- // cudaSafeCall( cudaGetTextureReference(&tex, tex_name) ); \r
- // bind(tex, img);\r
- // }\r
- //\r
- // void unbind()\r
- // {\r
- // if (tex_)\r
- // {\r
- // cudaUnbindTexture(tex_);\r
- // tex_ = 0;\r
- // }\r
- // }\r
- //\r
- //private:\r
- // const textureReference* tex_;\r
- //};\r
-\r
class NppStreamHandler\r
{\r
public:\r
#include "internal_shared.hpp"\r
#include "opencv2/gpu/device/vec_math.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ namespace match_template \r
+ {\r
+ __device__ __forceinline__ float sum(float v) { return v; }\r
+ __device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }\r
+ __device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }\r
+ __device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }\r
+\r
+ __device__ __forceinline__ float first(float v) { return v; }\r
+ __device__ __forceinline__ float first(float2 v) { return v.x; }\r
+ __device__ __forceinline__ float first(float3 v) { return v.x; }\r
+ __device__ __forceinline__ float first(float4 v) { return v.x; }\r
+\r
+ __device__ __forceinline__ float mul(float a, float b) { return a * b; }\r
+ __device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }\r
+ __device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }\r
+ __device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }\r
+\r
+ __device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }\r
+ __device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }\r
+ __device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }\r
+ __device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }\r
+\r
+ __device__ __forceinline__ float sub(float a, float b) { return a - b; }\r
+ __device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }\r
+ __device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }\r
+ __device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }\r
+\r
+ __device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }\r
+ __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }\r
+ __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }\r
+ __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }\r
+\r
+ //////////////////////////////////////////////////////////////////////\r
+ // Naive_CCORR\r
+\r
+ template <typename T, int cn> \r
+ __global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result)\r
+ {\r
+ typedef typename TypeVec<T, cn>::vec_type Type;\r
+ typedef typename TypeVec<float, cn>::vec_type Typef;\r
\r
-namespace match_template {\r
+ int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+ int y = blockDim.y * blockIdx.y + threadIdx.y;\r
\r
-__device__ __forceinline__ float sum(float v) { return v; }\r
-__device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }\r
-__device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }\r
-__device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }\r
+ if (x < result.cols && y < result.rows)\r
+ {\r
+ Typef res = VecTraits<Typef>::all(0);\r
\r
-__device__ __forceinline__ float first(float v) { return v; }\r
-__device__ __forceinline__ float first(float2 v) { return v.x; }\r
-__device__ __forceinline__ float first(float3 v) { return v.x; }\r
-__device__ __forceinline__ float first(float4 v) { return v.x; }\r
+ for (int i = 0; i < h; ++i)\r
+ {\r
+ const Type* image_ptr = (const Type*)image.ptr(y + i);\r
+ const Type* templ_ptr = (const Type*)templ.ptr(i);\r
+ for (int j = 0; j < w; ++j)\r
+ res = res + mul(image_ptr[x + j], templ_ptr[j]);\r
+ }\r
\r
-__device__ __forceinline__ float mul(float a, float b) { return a * b; }\r
-__device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }\r
-__device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }\r
-__device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }\r
+ result.ptr(y)[x] = sum(res);\r
+ }\r
+ }\r
\r
-__device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }\r
-__device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }\r
-__device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }\r
-__device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }\r
+ template <typename T, int cn>\r
+ void matchTemplateNaive_CCORR(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream)\r
+ {\r
+ const dim3 threads(32, 8);\r
+ const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
\r
-__device__ __forceinline__ float sub(float a, float b) { return a - b; }\r
-__device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }\r
-__device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }\r
-__device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }\r
+ matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-__device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }\r
-__device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }\r
-__device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }\r
-__device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-//////////////////////////////////////////////////////////////////////\r
-// Naive_CCORR\r
+ void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)\r
+ {\r
+ typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);\r
\r
-template <typename T, int cn> \r
-__global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result)\r
-{\r
- typedef typename TypeVec<T, cn>::vec_type Type;\r
- typedef typename TypeVec<float, cn>::vec_type Typef;\r
+ static const caller_t callers[] = \r
+ {\r
+ 0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>\r
+ };\r
\r
- int x = blockDim.x * blockIdx.x + threadIdx.x;\r
- int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+ callers[cn](image, templ, result, stream);\r
+ }\r
\r
- if (x < result.cols && y < result.rows)\r
- {\r
- Typef res = VecTraits<Typef>::all(0);\r
\r
- for (int i = 0; i < h; ++i)\r
+ void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)\r
{\r
- const Type* image_ptr = (const Type*)image.ptr(y + i);\r
- const Type* templ_ptr = (const Type*)templ.ptr(i);\r
- for (int j = 0; j < w; ++j)\r
- res = res + mul(image_ptr[x + j], templ_ptr[j]);\r
+ typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);\r
+\r
+ static const caller_t callers[] = \r
+ {\r
+ 0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>\r
+ };\r
+\r
+ callers[cn](image, templ, result, stream);\r
}\r
\r
- result.ptr(y)[x] = sum(res);\r
- }\r
-}\r
+ //////////////////////////////////////////////////////////////////////\r
+ // Naive_SQDIFF\r
\r
-template <typename T, int cn>\r
-void matchTemplateNaive_CCORR(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream)\r
-{\r
- const dim3 threads(32, 8);\r
- const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+ template <typename T, int cn>\r
+ __global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result)\r
+ {\r
+ typedef typename TypeVec<T, cn>::vec_type Type;\r
+ typedef typename TypeVec<float, cn>::vec_type Typef;\r
\r
- matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);\r
- cudaSafeCall( cudaGetLastError() );\r
+ int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+ int y = blockDim.y * blockIdx.y + threadIdx.y;\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ if (x < result.cols && y < result.rows)\r
+ {\r
+ Typef res = VecTraits<Typef>::all(0);\r
+ Typef delta;\r
+\r
+ for (int i = 0; i < h; ++i)\r
+ {\r
+ const Type* image_ptr = (const Type*)image.ptr(y + i);\r
+ const Type* templ_ptr = (const Type*)templ.ptr(i);\r
+ for (int j = 0; j < w; ++j)\r
+ {\r
+ delta = sub(image_ptr[x + j], templ_ptr[j]);\r
+ res = res + delta * delta;\r
+ }\r
+ }\r
+\r
+ result.ptr(y)[x] = sum(res);\r
+ }\r
+ }\r
\r
-void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)\r
-{\r
- typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);\r
+ template <typename T, int cn>\r
+ void matchTemplateNaive_SQDIFF(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream)\r
+ {\r
+ const dim3 threads(32, 8);\r
+ const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
\r
- static const caller_t callers[] = \r
- {\r
- 0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>\r
- };\r
+ matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- callers[cn](image, templ, result, stream);\r
-}\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
+ void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)\r
+ {\r
+ typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);\r
\r
-void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)\r
-{\r
- typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);\r
+ static const caller_t callers[] = \r
+ {\r
+ 0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>\r
+ };\r
\r
- static const caller_t callers[] = \r
- {\r
- 0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>\r
- };\r
+ callers[cn](image, templ, result, stream);\r
+ }\r
\r
- callers[cn](image, templ, result, stream);\r
-}\r
+ void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)\r
+ {\r
+ typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);\r
\r
-//////////////////////////////////////////////////////////////////////\r
-// Naive_SQDIFF\r
+ static const caller_t callers[] = \r
+ {\r
+ 0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>\r
+ };\r
\r
-template <typename T, int cn>\r
-__global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result)\r
-{\r
- typedef typename TypeVec<T, cn>::vec_type Type;\r
- typedef typename TypeVec<float, cn>::vec_type Typef;\r
+ callers[cn](image, templ, result, stream);\r
+ }\r
\r
- int x = blockDim.x * blockIdx.x + threadIdx.x;\r
- int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+ //////////////////////////////////////////////////////////////////////\r
+ // Prepared_SQDIFF\r
\r
- if (x < result.cols && y < result.rows)\r
- {\r
- Typef res = VecTraits<Typef>::all(0);\r
- Typef delta;\r
+ template <int cn>\r
+ __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if (x < result.cols && y < result.rows)\r
+ {\r
+ float image_sqsum_ = (float)(\r
+ (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -\r
+ (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));\r
+ float ccorr = result.ptr(y)[x];\r
+ result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum;\r
+ }\r
+ }\r
\r
- for (int i = 0; i < h; ++i)\r
+ template <int cn>\r
+ void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream)\r
{\r
- const Type* image_ptr = (const Type*)image.ptr(y + i);\r
- const Type* templ_ptr = (const Type*)templ.ptr(i);\r
- for (int j = 0; j < w; ++j)\r
+ const dim3 threads(32, 8);\r
+ const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+\r
+ matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+\r
+ void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, int cn, \r
+ cudaStream_t stream)\r
+ {\r
+ typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream);\r
+\r
+ static const caller_t callers[] = \r
{\r
- delta = sub(image_ptr[x + j], templ_ptr[j]);\r
- res = res + delta * delta;\r
+ 0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>\r
+ };\r
+\r
+ callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);\r
+ }\r
+\r
+ //////////////////////////////////////////////////////////////////////\r
+ // Prepared_SQDIFF_NORMED\r
+\r
+ // normAcc* are accurate normalization routines which make GPU matchTemplate\r
+ // consistent with CPU one\r
+\r
+ __device__ float normAcc(float num, float denum)\r
+ {\r
+ if (::fabs(num) < denum)\r
+ return num / denum;\r
+ if (::fabs(num) < denum * 1.125f)\r
+ return num > 0 ? 1 : -1;\r
+ return 0;\r
+ }\r
+\r
+\r
+ __device__ float normAcc_SQDIFF(float num, float denum)\r
+ {\r
+ if (::fabs(num) < denum)\r
+ return num / denum;\r
+ if (::fabs(num) < denum * 1.125f)\r
+ return num > 0 ? 1 : -1;\r
+ return 1;\r
+ }\r
+\r
+\r
+ template <int cn>\r
+ __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if (x < result.cols && y < result.rows)\r
+ {\r
+ float image_sqsum_ = (float)(\r
+ (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -\r
+ (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));\r
+ float ccorr = result.ptr(y)[x];\r
+ result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,\r
+ sqrtf(image_sqsum_ * templ_sqsum));\r
}\r
}\r
\r
- result.ptr(y)[x] = sum(res);\r
- }\r
-}\r
+ template <int cn>\r
+ void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, \r
+ DevMem2Df result, cudaStream_t stream)\r
+ {\r
+ const dim3 threads(32, 8);\r
+ const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
\r
-template <typename T, int cn>\r
-void matchTemplateNaive_SQDIFF(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream)\r
-{\r
- const dim3 threads(32, 8);\r
- const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+ matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
\r
-void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)\r
-{\r
- typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);\r
+ void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, \r
+ DevMem2Df result, int cn, cudaStream_t stream)\r
+ {\r
+ typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream);\r
+ static const caller_t callers[] = \r
+ {\r
+ 0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>\r
+ };\r
\r
- static const caller_t callers[] = \r
- {\r
- 0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>\r
- };\r
+ callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);\r
+ }\r
\r
- callers[cn](image, templ, result, stream);\r
-}\r
+ //////////////////////////////////////////////////////////////////////\r
+ // Prepared_CCOFF\r
\r
-void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)\r
-{\r
- typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);\r
+ __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, DevMem2Df result)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- static const caller_t callers[] = \r
- {\r
- 0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>\r
- };\r
+ if (x < result.cols && y < result.rows)\r
+ {\r
+ float image_sum_ = (float)(\r
+ (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -\r
+ (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));\r
+ float ccorr = result.ptr(y)[x];\r
+ result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;\r
+ }\r
+ }\r
\r
- callers[cn](image, templ, result, stream);\r
-}\r
+ void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
\r
-//////////////////////////////////////////////////////////////////////\r
-// Prepared_SQDIFF\r
+ matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-template <int cn>\r
-__global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- if (x < result.cols && y < result.rows)\r
- {\r
- float image_sqsum_ = (float)(\r
- (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -\r
- (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));\r
- float ccorr = result.ptr(y)[x];\r
- result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum;\r
- }\r
-}\r
-\r
-template <int cn>\r
-void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream)\r
-{\r
- const dim3 threads(32, 8);\r
- const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
\r
- matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
- cudaSafeCall( cudaGetLastError() );\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(\r
+ int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,\r
+ const PtrStep<unsigned int> image_sum_r,\r
+ const PtrStep<unsigned int> image_sum_g,\r
+ DevMem2Df result)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
-void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, int cn, \r
- cudaStream_t stream)\r
-{\r
- typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream);\r
+ if (x < result.cols && y < result.rows)\r
+ {\r
+ float image_sum_r_ = (float)(\r
+ (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -\r
+ (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));\r
+ float image_sum_g_ = (float)(\r
+ (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -\r
+ (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));\r
+ float ccorr = result.ptr(y)[x];\r
+ result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r \r
+ - image_sum_g_ * templ_sum_scale_g;\r
+ }\r
+ }\r
\r
- static const caller_t callers[] = \r
- {\r
- 0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>\r
- };\r
+ void matchTemplatePrepared_CCOFF_8UC2(\r
+ int w, int h, \r
+ const DevMem2D_<unsigned int> image_sum_r, \r
+ const DevMem2D_<unsigned int> image_sum_g,\r
+ unsigned int templ_sum_r, unsigned int templ_sum_g, \r
+ DevMem2Df result, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
\r
- callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);\r
-}\r
+ matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(\r
+ w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),\r
+ image_sum_r, image_sum_g, result);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-//////////////////////////////////////////////////////////////////////\r
-// Prepared_SQDIFF_NORMED\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-// normAcc* are accurate normalization routines which make GPU matchTemplate\r
-// consistent with CPU one\r
\r
-__device__ float normAcc(float num, float denum)\r
-{\r
- if (::fabs(num) < denum)\r
- return num / denum;\r
- if (::fabs(num) < denum * 1.125f)\r
- return num > 0 ? 1 : -1;\r
- return 0;\r
-}\r
\r
+ __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(\r
+ int w, int h, \r
+ float templ_sum_scale_r,\r
+ float templ_sum_scale_g,\r
+ float templ_sum_scale_b,\r
+ const PtrStep<unsigned int> image_sum_r,\r
+ const PtrStep<unsigned int> image_sum_g,\r
+ const PtrStep<unsigned int> image_sum_b,\r
+ DevMem2Df result)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if (x < result.cols && y < result.rows)\r
+ {\r
+ float image_sum_r_ = (float)(\r
+ (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -\r
+ (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));\r
+ float image_sum_g_ = (float)(\r
+ (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -\r
+ (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));\r
+ float image_sum_b_ = (float)(\r
+ (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -\r
+ (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));\r
+ float ccorr = result.ptr(y)[x];\r
+ result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r\r
+ - image_sum_g_ * templ_sum_scale_g\r
+ - image_sum_b_ * templ_sum_scale_b;\r
+ }\r
+ }\r
\r
-__device__ float normAcc_SQDIFF(float num, float denum)\r
-{\r
- if (::fabs(num) < denum)\r
- return num / denum;\r
- if (::fabs(num) < denum * 1.125f)\r
- return num > 0 ? 1 : -1;\r
- return 1;\r
-}\r
+ void matchTemplatePrepared_CCOFF_8UC3(\r
+ int w, int h, \r
+ const DevMem2D_<unsigned int> image_sum_r, \r
+ const DevMem2D_<unsigned int> image_sum_g,\r
+ const DevMem2D_<unsigned int> image_sum_b,\r
+ unsigned int templ_sum_r, \r
+ unsigned int templ_sum_g, \r
+ unsigned int templ_sum_b, \r
+ DevMem2Df result, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+\r
+ matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(\r
+ w, h, \r
+ (float)templ_sum_r / (w * h),\r
+ (float)templ_sum_g / (w * h),\r
+ (float)templ_sum_b / (w * h),\r
+ image_sum_r, image_sum_g, image_sum_b, result);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
\r
-template <int cn>\r
-__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- if (x < result.cols && y < result.rows)\r
- {\r
- float image_sqsum_ = (float)(\r
- (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -\r
- (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));\r
- float ccorr = result.ptr(y)[x];\r
- result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,\r
- sqrtf(image_sqsum_ * templ_sqsum));\r
- }\r
-}\r
-\r
-template <int cn>\r
-void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, \r
- DevMem2Df result, cudaStream_t stream)\r
-{\r
- const dim3 threads(32, 8);\r
- const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+ __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(\r
+ int w, int h, \r
+ float templ_sum_scale_r, \r
+ float templ_sum_scale_g,\r
+ float templ_sum_scale_b,\r
+ float templ_sum_scale_a,\r
+ const PtrStep<unsigned int> image_sum_r,\r
+ const PtrStep<unsigned int> image_sum_g,\r
+ const PtrStep<unsigned int> image_sum_b,\r
+ const PtrStep<unsigned int> image_sum_a,\r
+ DevMem2Df result)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (x < result.cols && y < result.rows)\r
+ {\r
+ float image_sum_r_ = (float)(\r
+ (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -\r
+ (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));\r
+ float image_sum_g_ = (float)(\r
+ (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -\r
+ (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));\r
+ float image_sum_b_ = (float)(\r
+ (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -\r
+ (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));\r
+ float image_sum_a_ = (float)(\r
+ (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -\r
+ (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));\r
+ float ccorr = result.ptr(y)[x];\r
+ result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r \r
+ - image_sum_g_ * templ_sum_scale_g\r
+ - image_sum_b_ * templ_sum_scale_b\r
+ - image_sum_a_ * templ_sum_scale_a;\r
+ }\r
+ }\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ void matchTemplatePrepared_CCOFF_8UC4(\r
+ int w, int h, \r
+ const DevMem2D_<unsigned int> image_sum_r, \r
+ const DevMem2D_<unsigned int> image_sum_g,\r
+ const DevMem2D_<unsigned int> image_sum_b,\r
+ const DevMem2D_<unsigned int> image_sum_a,\r
+ unsigned int templ_sum_r, \r
+ unsigned int templ_sum_g, \r
+ unsigned int templ_sum_b, \r
+ unsigned int templ_sum_a, \r
+ DevMem2Df result, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+\r
+ matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(\r
+ w, h, \r
+ (float)templ_sum_r / (w * h), \r
+ (float)templ_sum_g / (w * h), \r
+ (float)templ_sum_b / (w * h),\r
+ (float)templ_sum_a / (w * h),\r
+ image_sum_r, image_sum_g, image_sum_b, image_sum_a,\r
+ result);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
+ //////////////////////////////////////////////////////////////////////\r
+ // Prepared_CCOFF_NORMED\r
\r
-void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, \r
- DevMem2Df result, int cn, cudaStream_t stream)\r
-{\r
- typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream);\r
- static const caller_t callers[] = \r
- {\r
- 0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>\r
- };\r
+ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(\r
+ int w, int h, float weight, \r
+ float templ_sum_scale, float templ_sqsum_scale,\r
+ const PtrStep<unsigned int> image_sum, \r
+ const PtrStep<unsigned long long> image_sqsum,\r
+ DevMem2Df result)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);\r
-}\r
+ if (x < result.cols && y < result.rows)\r
+ {\r
+ float ccorr = result.ptr(y)[x];\r
+ float image_sum_ = (float)(\r
+ (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -\r
+ (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));\r
+ float image_sqsum_ = (float)(\r
+ (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -\r
+ (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));\r
+ result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,\r
+ sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));\r
+ }\r
+ }\r
\r
-//////////////////////////////////////////////////////////////////////\r
-// Prepared_CCOFF\r
+ void matchTemplatePrepared_CCOFF_NORMED_8U(\r
+ int w, int h, const DevMem2D_<unsigned int> image_sum, \r
+ const DevMem2D_<unsigned long long> image_sqsum,\r
+ unsigned int templ_sum, unsigned int templ_sqsum,\r
+ DevMem2Df result, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
\r
-__global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, DevMem2Df result)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ float weight = 1.f / (w * h);\r
+ float templ_sum_scale = templ_sum * weight;\r
+ float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;\r
\r
- if (x < result.cols && y < result.rows)\r
- {\r
- float image_sum_ = (float)(\r
- (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -\r
- (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));\r
- float ccorr = result.ptr(y)[x];\r
- result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;\r
- }\r
-}\r
-\r
-void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+ matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(\r
+ w, h, weight, templ_sum_scale, templ_sqsum_scale, \r
+ image_sum, image_sqsum, result);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
\r
\r
+ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(\r
+ int w, int h, float weight, \r
+ float templ_sum_scale_r, float templ_sum_scale_g, \r
+ float templ_sqsum_scale,\r
+ const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,\r
+ const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,\r
+ DevMem2Df result)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
-__global__ void matchTemplatePreparedKernel_CCOFF_8UC2(\r
- int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,\r
- const PtrStep<unsigned int> image_sum_r,\r
- const PtrStep<unsigned int> image_sum_g,\r
- DevMem2Df result)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ if (x < result.cols && y < result.rows)\r
+ {\r
+ float image_sum_r_ = (float)(\r
+ (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -\r
+ (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));\r
+ float image_sqsum_r_ = (float)(\r
+ (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -\r
+ (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));\r
+ float image_sum_g_ = (float)(\r
+ (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -\r
+ (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));\r
+ float image_sqsum_g_ = (float)(\r
+ (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -\r
+ (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));\r
+\r
+ float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r\r
+ - image_sum_g_ * templ_sum_scale_g;\r
+ float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_\r
+ + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));\r
+ result.ptr(y)[x] = normAcc(num, denum);\r
+ }\r
+ }\r
\r
- if (x < result.cols && y < result.rows)\r
- {\r
- float image_sum_r_ = (float)(\r
- (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -\r
- (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));\r
- float image_sum_g_ = (float)(\r
- (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -\r
- (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));\r
- float ccorr = result.ptr(y)[x];\r
- result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r \r
- - image_sum_g_ * templ_sum_scale_g;\r
- }\r
-}\r
-\r
-void matchTemplatePrepared_CCOFF_8UC2(\r
- int w, int h, \r
- const DevMem2D_<unsigned int> image_sum_r, \r
- const DevMem2D_<unsigned int> image_sum_g,\r
- unsigned int templ_sum_r, unsigned int templ_sum_g, \r
- DevMem2Df result, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+ void matchTemplatePrepared_CCOFF_NORMED_8UC2(\r
+ int w, int h, \r
+ const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
+ const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
+ unsigned int templ_sum_r, unsigned int templ_sqsum_r,\r
+ unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
+ DevMem2Df result, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+\r
+ float weight = 1.f / (w * h);\r
+ float templ_sum_scale_r = templ_sum_r * weight;\r
+ float templ_sum_scale_g = templ_sum_g * weight;\r
+ float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r \r
+ + templ_sqsum_g - weight * templ_sum_g * templ_sum_g;\r
+\r
+ matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(\r
+ w, h, weight, \r
+ templ_sum_scale_r, templ_sum_scale_g,\r
+ templ_sqsum_scale,\r
+ image_sum_r, image_sqsum_r, \r
+ image_sum_g, image_sqsum_g, \r
+ result);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(\r
- w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),\r
- image_sum_r, image_sum_g, result);\r
- cudaSafeCall( cudaGetLastError() );\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
\r
+ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(\r
+ int w, int h, float weight, \r
+ float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, \r
+ float templ_sqsum_scale,\r
+ const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,\r
+ const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,\r
+ const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,\r
+ DevMem2Df result)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
+ if (x < result.cols && y < result.rows)\r
+ {\r
+ float image_sum_r_ = (float)(\r
+ (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -\r
+ (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));\r
+ float image_sqsum_r_ = (float)(\r
+ (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -\r
+ (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));\r
+ float image_sum_g_ = (float)(\r
+ (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -\r
+ (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));\r
+ float image_sqsum_g_ = (float)(\r
+ (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -\r
+ (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));\r
+ float image_sum_b_ = (float)(\r
+ (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -\r
+ (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));\r
+ float image_sqsum_b_ = (float)(\r
+ (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -\r
+ (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));\r
+\r
+ float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r\r
+ - image_sum_g_ * templ_sum_scale_g\r
+ - image_sum_b_ * templ_sum_scale_b;\r
+ float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_\r
+ + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_\r
+ + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));\r
+ result.ptr(y)[x] = normAcc(num, denum);\r
+ }\r
+ }\r
\r
-__global__ void matchTemplatePreparedKernel_CCOFF_8UC3(\r
- int w, int h, \r
- float templ_sum_scale_r,\r
- float templ_sum_scale_g,\r
- float templ_sum_scale_b,\r
- const PtrStep<unsigned int> image_sum_r,\r
- const PtrStep<unsigned int> image_sum_g,\r
- const PtrStep<unsigned int> image_sum_b,\r
- DevMem2Df result)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ void matchTemplatePrepared_CCOFF_NORMED_8UC3(\r
+ int w, int h, \r
+ const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
+ const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
+ const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,\r
+ unsigned int templ_sum_r, unsigned int templ_sqsum_r,\r
+ unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
+ unsigned int templ_sum_b, unsigned int templ_sqsum_b,\r
+ DevMem2Df result, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+\r
+ float weight = 1.f / (w * h);\r
+ float templ_sum_scale_r = templ_sum_r * weight;\r
+ float templ_sum_scale_g = templ_sum_g * weight;\r
+ float templ_sum_scale_b = templ_sum_b * weight;\r
+ float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r \r
+ + templ_sqsum_g - weight * templ_sum_g * templ_sum_g\r
+ + templ_sqsum_b - weight * templ_sum_b * templ_sum_b;\r
+\r
+ matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(\r
+ w, h, weight, \r
+ templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, \r
+ templ_sqsum_scale, \r
+ image_sum_r, image_sqsum_r, \r
+ image_sum_g, image_sqsum_g, \r
+ image_sum_b, image_sqsum_b, \r
+ result);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- if (x < result.cols && y < result.rows)\r
- {\r
- float image_sum_r_ = (float)(\r
- (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -\r
- (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));\r
- float image_sum_g_ = (float)(\r
- (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -\r
- (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));\r
- float image_sum_b_ = (float)(\r
- (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -\r
- (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));\r
- float ccorr = result.ptr(y)[x];\r
- result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r\r
- - image_sum_g_ * templ_sum_scale_g\r
- - image_sum_b_ * templ_sum_scale_b;\r
- }\r
-}\r
-\r
-void matchTemplatePrepared_CCOFF_8UC3(\r
- int w, int h, \r
- const DevMem2D_<unsigned int> image_sum_r, \r
- const DevMem2D_<unsigned int> image_sum_g,\r
- const DevMem2D_<unsigned int> image_sum_b,\r
- unsigned int templ_sum_r, \r
- unsigned int templ_sum_g, \r
- unsigned int templ_sum_b, \r
- DevMem2Df result, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
-\r
- matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(\r
- w, h, \r
- (float)templ_sum_r / (w * h),\r
- (float)templ_sum_g / (w * h),\r
- (float)templ_sum_b / (w * h),\r
- image_sum_r, image_sum_g, image_sum_b, result);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
-\r
-\r
-__global__ void matchTemplatePreparedKernel_CCOFF_8UC4(\r
- int w, int h, \r
- float templ_sum_scale_r, \r
- float templ_sum_scale_g,\r
- float templ_sum_scale_b,\r
- float templ_sum_scale_a,\r
- const PtrStep<unsigned int> image_sum_r,\r
- const PtrStep<unsigned int> image_sum_g,\r
- const PtrStep<unsigned int> image_sum_b,\r
- const PtrStep<unsigned int> image_sum_a,\r
- DevMem2Df result)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- if (x < result.cols && y < result.rows)\r
- {\r
- float image_sum_r_ = (float)(\r
- (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -\r
- (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));\r
- float image_sum_g_ = (float)(\r
- (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -\r
- (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));\r
- float image_sum_b_ = (float)(\r
- (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -\r
- (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));\r
- float image_sum_a_ = (float)(\r
- (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -\r
- (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));\r
- float ccorr = result.ptr(y)[x];\r
- result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r \r
- - image_sum_g_ * templ_sum_scale_g\r
- - image_sum_b_ * templ_sum_scale_b\r
- - image_sum_a_ * templ_sum_scale_a;\r
- }\r
-}\r
-\r
-void matchTemplatePrepared_CCOFF_8UC4(\r
- int w, int h, \r
- const DevMem2D_<unsigned int> image_sum_r, \r
- const DevMem2D_<unsigned int> image_sum_g,\r
- const DevMem2D_<unsigned int> image_sum_b,\r
- const DevMem2D_<unsigned int> image_sum_a,\r
- unsigned int templ_sum_r, \r
- unsigned int templ_sum_g, \r
- unsigned int templ_sum_b, \r
- unsigned int templ_sum_a, \r
- DevMem2Df result, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
-\r
- matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(\r
- w, h, \r
- (float)templ_sum_r / (w * h), \r
- (float)templ_sum_g / (w * h), \r
- (float)templ_sum_b / (w * h),\r
- (float)templ_sum_a / (w * h),\r
- image_sum_r, image_sum_g, image_sum_b, image_sum_a,\r
- result);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
-//////////////////////////////////////////////////////////////////////\r
-// Prepared_CCOFF_NORMED\r
-\r
-__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(\r
- int w, int h, float weight, \r
- float templ_sum_scale, float templ_sqsum_scale,\r
- const PtrStep<unsigned int> image_sum, \r
- const PtrStep<unsigned long long> image_sqsum,\r
- DevMem2Df result)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- if (x < result.cols && y < result.rows)\r
- {\r
- float ccorr = result.ptr(y)[x];\r
- float image_sum_ = (float)(\r
- (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -\r
- (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));\r
- float image_sqsum_ = (float)(\r
- (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -\r
- (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));\r
- result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,\r
- sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));\r
- }\r
-}\r
-\r
-void matchTemplatePrepared_CCOFF_NORMED_8U(\r
- int w, int h, const DevMem2D_<unsigned int> image_sum, \r
- const DevMem2D_<unsigned long long> image_sqsum,\r
- unsigned int templ_sum, unsigned int templ_sqsum,\r
- DevMem2Df result, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(\r
+ int w, int h, float weight, \r
+ float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, \r
+ float templ_sum_scale_a, float templ_sqsum_scale,\r
+ const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,\r
+ const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,\r
+ const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,\r
+ const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,\r
+ DevMem2Df result)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- float weight = 1.f / (w * h);\r
- float templ_sum_scale = templ_sum * weight;\r
- float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;\r
+ if (x < result.cols && y < result.rows)\r
+ {\r
+ float image_sum_r_ = (float)(\r
+ (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -\r
+ (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));\r
+ float image_sqsum_r_ = (float)(\r
+ (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -\r
+ (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));\r
+ float image_sum_g_ = (float)(\r
+ (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -\r
+ (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));\r
+ float image_sqsum_g_ = (float)(\r
+ (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -\r
+ (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));\r
+ float image_sum_b_ = (float)(\r
+ (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -\r
+ (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));\r
+ float image_sqsum_b_ = (float)(\r
+ (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -\r
+ (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));\r
+ float image_sum_a_ = (float)(\r
+ (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -\r
+ (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));\r
+ float image_sqsum_a_ = (float)(\r
+ (image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) -\r
+ (image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x]));\r
+\r
+ float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g\r
+ - image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a;\r
+ float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_\r
+ + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_\r
+ + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_\r
+ + image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));\r
+ result.ptr(y)[x] = normAcc(num, denum);\r
+ }\r
+ }\r
\r
- matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(\r
- w, h, weight, templ_sum_scale, templ_sqsum_scale, \r
- image_sum, image_sqsum, result);\r
- cudaSafeCall( cudaGetLastError() );\r
+ void matchTemplatePrepared_CCOFF_NORMED_8UC4(\r
+ int w, int h, \r
+ const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
+ const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
+ const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,\r
+ const DevMem2D_<unsigned int> image_sum_a, const DevMem2D_<unsigned long long> image_sqsum_a,\r
+ unsigned int templ_sum_r, unsigned int templ_sqsum_r,\r
+ unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
+ unsigned int templ_sum_b, unsigned int templ_sqsum_b,\r
+ unsigned int templ_sum_a, unsigned int templ_sqsum_a,\r
+ DevMem2Df result, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+\r
+ float weight = 1.f / (w * h);\r
+ float templ_sum_scale_r = templ_sum_r * weight;\r
+ float templ_sum_scale_g = templ_sum_g * weight;\r
+ float templ_sum_scale_b = templ_sum_b * weight;\r
+ float templ_sum_scale_a = templ_sum_a * weight;\r
+ float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r\r
+ + templ_sqsum_g - weight * templ_sum_g * templ_sum_g\r
+ + templ_sqsum_b - weight * templ_sum_b * templ_sum_b\r
+ + templ_sqsum_a - weight * templ_sum_a * templ_sum_a;\r
+\r
+ matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(\r
+ w, h, weight, \r
+ templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a, \r
+ templ_sqsum_scale, \r
+ image_sum_r, image_sqsum_r, \r
+ image_sum_g, image_sqsum_g, \r
+ image_sum_b, image_sqsum_b, \r
+ image_sum_a, image_sqsum_a, \r
+ result);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ //////////////////////////////////////////////////////////////////////\r
+ // normalize\r
\r
+ template <int cn>\r
+ __global__ void normalizeKernel_8U(\r
+ int w, int h, const PtrStep<unsigned long long> image_sqsum, \r
+ unsigned int templ_sqsum, DevMem2Df result)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
+ if (x < result.cols && y < result.rows)\r
+ {\r
+ float image_sqsum_ = (float)(\r
+ (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -\r
+ (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));\r
+ result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));\r
+ }\r
+ }\r
\r
-__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(\r
- int w, int h, float weight, \r
- float templ_sum_scale_r, float templ_sum_scale_g, \r
- float templ_sqsum_scale,\r
- const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,\r
- const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,\r
- DevMem2Df result)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, \r
+ unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
\r
- if (x < result.cols && y < result.rows)\r
- {\r
- float image_sum_r_ = (float)(\r
- (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -\r
- (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));\r
- float image_sqsum_r_ = (float)(\r
- (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -\r
- (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));\r
- float image_sum_g_ = (float)(\r
- (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -\r
- (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));\r
- float image_sqsum_g_ = (float)(\r
- (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -\r
- (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));\r
-\r
- float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r\r
- - image_sum_g_ * templ_sum_scale_g;\r
- float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_\r
- + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));\r
- result.ptr(y)[x] = normAcc(num, denum);\r
- }\r
-}\r
-\r
-void matchTemplatePrepared_CCOFF_NORMED_8UC2(\r
- int w, int h, \r
- const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
- const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
- unsigned int templ_sum_r, unsigned int templ_sqsum_r,\r
- unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
- DevMem2Df result, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
-\r
- float weight = 1.f / (w * h);\r
- float templ_sum_scale_r = templ_sum_r * weight;\r
- float templ_sum_scale_g = templ_sum_g * weight;\r
- float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r \r
- + templ_sqsum_g - weight * templ_sum_g * templ_sum_g;\r
-\r
- matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(\r
- w, h, weight, \r
- templ_sum_scale_r, templ_sum_scale_g,\r
- templ_sqsum_scale,\r
- image_sum_r, image_sqsum_r, \r
- image_sum_g, image_sqsum_g, \r
- result);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
-\r
-\r
-__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(\r
- int w, int h, float weight, \r
- float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, \r
- float templ_sqsum_scale,\r
- const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,\r
- const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,\r
- const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,\r
- DevMem2Df result)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ switch (cn)\r
+ {\r
+ case 1:\r
+ normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
+ break;\r
+ case 2:\r
+ normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
+ break;\r
+ case 3:\r
+ normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
+ break;\r
+ case 4:\r
+ normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
+ break;\r
+ }\r
\r
- if (x < result.cols && y < result.rows)\r
- {\r
- float image_sum_r_ = (float)(\r
- (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -\r
- (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));\r
- float image_sqsum_r_ = (float)(\r
- (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -\r
- (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));\r
- float image_sum_g_ = (float)(\r
- (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -\r
- (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));\r
- float image_sqsum_g_ = (float)(\r
- (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -\r
- (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));\r
- float image_sum_b_ = (float)(\r
- (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -\r
- (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));\r
- float image_sqsum_b_ = (float)(\r
- (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -\r
- (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));\r
-\r
- float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r\r
- - image_sum_g_ * templ_sum_scale_g\r
- - image_sum_b_ * templ_sum_scale_b;\r
- float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_\r
- + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_\r
- + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));\r
- result.ptr(y)[x] = normAcc(num, denum);\r
- }\r
-}\r
-\r
-void matchTemplatePrepared_CCOFF_NORMED_8UC3(\r
- int w, int h, \r
- const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
- const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
- const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,\r
- unsigned int templ_sum_r, unsigned int templ_sqsum_r,\r
- unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
- unsigned int templ_sum_b, unsigned int templ_sqsum_b,\r
- DevMem2Df result, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
-\r
- float weight = 1.f / (w * h);\r
- float templ_sum_scale_r = templ_sum_r * weight;\r
- float templ_sum_scale_g = templ_sum_g * weight;\r
- float templ_sum_scale_b = templ_sum_b * weight;\r
- float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r \r
- + templ_sqsum_g - weight * templ_sum_g * templ_sum_g\r
- + templ_sqsum_b - weight * templ_sum_b * templ_sum_b;\r
-\r
- matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(\r
- w, h, weight, \r
- templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, \r
- templ_sqsum_scale, \r
- image_sum_r, image_sqsum_r, \r
- image_sum_g, image_sqsum_g, \r
- image_sum_b, image_sqsum_b, \r
- result);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
-\r
-\r
-__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(\r
- int w, int h, float weight, \r
- float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, \r
- float templ_sum_scale_a, float templ_sqsum_scale,\r
- const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,\r
- const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,\r
- const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,\r
- const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,\r
- DevMem2Df result)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (x < result.cols && y < result.rows)\r
- {\r
- float image_sum_r_ = (float)(\r
- (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -\r
- (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));\r
- float image_sqsum_r_ = (float)(\r
- (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -\r
- (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));\r
- float image_sum_g_ = (float)(\r
- (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -\r
- (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));\r
- float image_sqsum_g_ = (float)(\r
- (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -\r
- (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));\r
- float image_sum_b_ = (float)(\r
- (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -\r
- (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));\r
- float image_sqsum_b_ = (float)(\r
- (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -\r
- (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));\r
- float image_sum_a_ = (float)(\r
- (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -\r
- (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));\r
- float image_sqsum_a_ = (float)(\r
- (image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) -\r
- (image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x]));\r
-\r
- float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g\r
- - image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a;\r
- float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_\r
- + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_\r
- + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_\r
- + image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));\r
- result.ptr(y)[x] = normAcc(num, denum);\r
- }\r
-}\r
-\r
-void matchTemplatePrepared_CCOFF_NORMED_8UC4(\r
- int w, int h, \r
- const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
- const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
- const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,\r
- const DevMem2D_<unsigned int> image_sum_a, const DevMem2D_<unsigned long long> image_sqsum_a,\r
- unsigned int templ_sum_r, unsigned int templ_sqsum_r,\r
- unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
- unsigned int templ_sum_b, unsigned int templ_sqsum_b,\r
- unsigned int templ_sum_a, unsigned int templ_sqsum_a,\r
- DevMem2Df result, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
-\r
- float weight = 1.f / (w * h);\r
- float templ_sum_scale_r = templ_sum_r * weight;\r
- float templ_sum_scale_g = templ_sum_g * weight;\r
- float templ_sum_scale_b = templ_sum_b * weight;\r
- float templ_sum_scale_a = templ_sum_a * weight;\r
- float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r\r
- + templ_sqsum_g - weight * templ_sum_g * templ_sum_g\r
- + templ_sqsum_b - weight * templ_sum_b * templ_sum_b\r
- + templ_sqsum_a - weight * templ_sum_a * templ_sum_a;\r
-\r
- matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(\r
- w, h, weight, \r
- templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a, \r
- templ_sqsum_scale, \r
- image_sum_r, image_sqsum_r, \r
- image_sum_g, image_sqsum_g, \r
- image_sum_b, image_sqsum_b, \r
- image_sum_a, image_sqsum_a, \r
- result);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
-//////////////////////////////////////////////////////////////////////\r
-// normalize\r
-\r
-template <int cn>\r
-__global__ void normalizeKernel_8U(\r
- int w, int h, const PtrStep<unsigned long long> image_sqsum, \r
- unsigned int templ_sqsum, DevMem2Df result)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- if (x < result.cols && y < result.rows)\r
- {\r
- float image_sqsum_ = (float)(\r
- (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -\r
- (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));\r
- result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));\r
- }\r
-}\r
-\r
-void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, \r
- unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+ //////////////////////////////////////////////////////////////////////\r
+ // extractFirstChannel\r
\r
- switch (cn)\r
- {\r
- case 1:\r
- normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
- break;\r
- case 2:\r
- normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
- break;\r
- case 3:\r
- normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
- break;\r
- case 4:\r
- normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
- break;\r
- }\r
-\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
-//////////////////////////////////////////////////////////////////////\r
-// extractFirstChannel\r
-\r
-template <int cn>\r
-__global__ void extractFirstChannel_32F(const PtrStepb image, DevMem2Df result)\r
-{\r
- typedef typename TypeVec<float, cn>::vec_type Typef;\r
+ template <int cn>\r
+ __global__ void extractFirstChannel_32F(const PtrStepb image, DevMem2Df result)\r
+ {\r
+ typedef typename TypeVec<float, cn>::vec_type Typef;\r
\r
- int x = blockDim.x * blockIdx.x + threadIdx.x;\r
- int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+ int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+ int y = blockDim.y * blockIdx.y + threadIdx.y;\r
\r
- if (x < result.cols && y < result.rows)\r
- {\r
- Typef val = ((const Typef*)image.ptr(y))[x];\r
- result.ptr(y)[x] = first(val);\r
- }\r
-}\r
+ if (x < result.cols && y < result.rows)\r
+ {\r
+ Typef val = ((const Typef*)image.ptr(y))[x];\r
+ result.ptr(y)[x] = first(val);\r
+ }\r
+ }\r
\r
-void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+ void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
\r
- switch (cn)\r
- {\r
- case 1:\r
- extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);\r
- break;\r
- case 2:\r
- extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);\r
- break;\r
- case 3:\r
- extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);\r
- break;\r
- case 4:\r
- extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);\r
- break;\r
- }\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
-} //namespace match_template\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ switch (cn)\r
+ {\r
+ case 1:\r
+ extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);\r
+ break;\r
+ case 2:\r
+ extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);\r
+ break;\r
+ case 3:\r
+ extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);\r
+ break;\r
+ case 4:\r
+ extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);\r
+ break;\r
+ }\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+ } //namespace match_template\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#include "internal_shared.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace mathfunc {\r
-\r
-//////////////////////////////////////////////////////////////////////////////////////\r
-// Cart <-> Polar\r
-\r
-struct Nothing\r
-{\r
- static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)\r
- {\r
- }\r
-};\r
-struct Magnitude\r
-{\r
- static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)\r
- {\r
- dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);\r
- }\r
-};\r
-struct MagnitudeSqr\r
-{\r
- static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)\r
- {\r
- dst[y * dst_step + x] = x_data * x_data + y_data * y_data;\r
- }\r
-};\r
-struct Atan2\r
-{\r
- static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)\r
- {\r
- float angle = ::atan2f(y_data, x_data);\r
- angle += (angle < 0) * 2.0 * CV_PI;\r
- dst[y * dst_step + x] = scale * angle;\r
- }\r
-};\r
-template <typename Mag, typename Angle>\r
-__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step, \r
- float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)\r
-{\r
- const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
- const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
-\r
- if (x < width && y < height)\r
- {\r
- float x_data = xptr[y * x_step + x];\r
- float y_data = yptr[y * y_step + x];\r
-\r
- Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);\r
- Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);\r
- }\r
-}\r
-\r
-struct NonEmptyMag\r
+namespace cv { namespace gpu { namespace device \r
{\r
- static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)\r
+ namespace mathfunc \r
{\r
- return mag[y * mag_step + x];\r
- }\r
-};\r
-struct EmptyMag\r
-{\r
- static __device__ __forceinline__ float get(const float*, size_t, int, int)\r
- {\r
- return 1.0f;\r
- }\r
-};\r
-template <typename Mag>\r
-__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,\r
- float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)\r
-{\r
- const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
- const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
-\r
- if (x < width && y < height)\r
- {\r
- float mag_data = Mag::get(mag, mag_step, x, y);\r
- float angle_data = angle[y * angle_step + x];\r
- float sin_a, cos_a;\r
-\r
- ::sincosf(scale * angle_data, &sin_a, &cos_a);\r
-\r
- xptr[y * x_step + x] = mag_data * cos_a;\r
- yptr[y * y_step + x] = mag_data * sin_a;\r
- }\r
-}\r
-\r
-template <typename Mag, typename Angle>\r
-void cartToPolar_caller(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
+ //////////////////////////////////////////////////////////////////////////////////////\r
+ // Cart <-> Polar\r
\r
- grid.x = divUp(x.cols, threads.x);\r
- grid.y = divUp(x.rows, threads.y);\r
- \r
- const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;\r
+ struct Nothing\r
+ {\r
+ static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)\r
+ {\r
+ }\r
+ };\r
+ struct Magnitude\r
+ {\r
+ static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)\r
+ {\r
+ dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);\r
+ }\r
+ };\r
+ struct MagnitudeSqr\r
+ {\r
+ static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)\r
+ {\r
+ dst[y * dst_step + x] = x_data * x_data + y_data * y_data;\r
+ }\r
+ };\r
+ struct Atan2\r
+ {\r
+ static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)\r
+ {\r
+ float angle = ::atan2f(y_data, x_data);\r
+ angle += (angle < 0) * 2.0 * CV_PI;\r
+ dst[y * dst_step + x] = scale * angle;\r
+ }\r
+ };\r
+ template <typename Mag, typename Angle>\r
+ __global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step, \r
+ float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)\r
+ {\r
+ const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+ const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
\r
- cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(\r
- x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), \r
- mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (x < width && y < height)\r
+ {\r
+ float x_data = xptr[y * x_step + x];\r
+ float y_data = yptr[y * y_step + x];\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);\r
+ Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);\r
+ }\r
+ }\r
\r
-void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)\r
-{\r
- typedef void (*caller_t)(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);\r
- static const caller_t callers[2][2][2] = \r
- {\r
+ struct NonEmptyMag\r
{\r
+ static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)\r
{\r
- cartToPolar_caller<Magnitude, Atan2>,\r
- cartToPolar_caller<Magnitude, Nothing>\r
- },\r
- {\r
- cartToPolar_caller<MagnitudeSqr, Atan2>,\r
- cartToPolar_caller<MagnitudeSqr, Nothing>,\r
+ return mag[y * mag_step + x];\r
}\r
- },\r
+ };\r
+ struct EmptyMag\r
{\r
+ static __device__ __forceinline__ float get(const float*, size_t, int, int)\r
{\r
- cartToPolar_caller<Nothing, Atan2>,\r
- cartToPolar_caller<Nothing, Nothing>\r
- },\r
+ return 1.0f;\r
+ }\r
+ };\r
+ template <typename Mag>\r
+ __global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,\r
+ float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)\r
+ {\r
+ const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+ const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+ if (x < width && y < height)\r
{\r
- cartToPolar_caller<Nothing, Atan2>,\r
- cartToPolar_caller<Nothing, Nothing>,\r
+ float mag_data = Mag::get(mag, mag_step, x, y);\r
+ float angle_data = angle[y * angle_step + x];\r
+ float sin_a, cos_a;\r
+\r
+ ::sincosf(scale * angle_data, &sin_a, &cos_a);\r
+\r
+ xptr[y * x_step + x] = mag_data * cos_a;\r
+ yptr[y * y_step + x] = mag_data * sin_a;\r
}\r
}\r
- };\r
\r
- callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);\r
-}\r
+ template <typename Mag, typename Angle>\r
+ void cartToPolar_caller(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
-template <typename Mag>\r
-void polarToCart_caller(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
+ grid.x = divUp(x.cols, threads.x);\r
+ grid.y = divUp(x.rows, threads.y);\r
+ \r
+ const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;\r
\r
- grid.x = divUp(mag.cols, threads.x);\r
- grid.y = divUp(mag.rows, threads.y);\r
- \r
- const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;\r
+ cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(\r
+ x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), \r
+ mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(), \r
- angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)\r
+ {\r
+ typedef void (*caller_t)(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);\r
+ static const caller_t callers[2][2][2] = \r
+ {\r
+ {\r
+ {\r
+ cartToPolar_caller<Magnitude, Atan2>,\r
+ cartToPolar_caller<Magnitude, Nothing>\r
+ },\r
+ {\r
+ cartToPolar_caller<MagnitudeSqr, Atan2>,\r
+ cartToPolar_caller<MagnitudeSqr, Nothing>,\r
+ }\r
+ },\r
+ {\r
+ {\r
+ cartToPolar_caller<Nothing, Atan2>,\r
+ cartToPolar_caller<Nothing, Nothing>\r
+ },\r
+ {\r
+ cartToPolar_caller<Nothing, Atan2>,\r
+ cartToPolar_caller<Nothing, Nothing>,\r
+ }\r
+ }\r
+ };\r
+\r
+ callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);\r
+ }\r
\r
-void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)\r
-{\r
- typedef void (*caller_t)(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);\r
- static const caller_t callers[2] = \r
- {\r
- polarToCart_caller<NonEmptyMag>,\r
- polarToCart_caller<EmptyMag>\r
- };\r
+ template <typename Mag>\r
+ void polarToCart_caller(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
+\r
+ grid.x = divUp(mag.cols, threads.x);\r
+ grid.y = divUp(mag.rows, threads.y);\r
+ \r
+ const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;\r
+\r
+ polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(), \r
+ angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);\r
-}\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-} // namespace mathfunc\r
+ void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)\r
+ {\r
+ typedef void (*caller_t)(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);\r
+ static const caller_t callers[2] = \r
+ {\r
+ polarToCart_caller<NonEmptyMag>,\r
+ polarToCart_caller<EmptyMag>\r
+ };\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);\r
+ }\r
+ } // namespace mathfunc\r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "opencv2/gpu/device/transform.hpp"\r
#include "opencv2/gpu/device/functional.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template <typename T> struct shift_and_sizeof;\r
-template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };\r
-template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };\r
-template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };\r
-template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };\r
-template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };\r
-template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };\r
-template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };\r
-\r
-///////////////////////////////////////////////////////////////////////////\r
-////////////////////////////////// CopyTo /////////////////////////////////\r
-///////////////////////////////////////////////////////////////////////////\r
-\r
-template<typename T>\r
-__global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)\r
+namespace cv { namespace gpu { namespace device \r
{\r
- size_t x = blockIdx.x * blockDim.x + threadIdx.x;\r
- size_t y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- if ((x < cols * channels ) && (y < rows))\r
- if (mask[y * step_mask + x / channels] != 0)\r
- {\r
- size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;\r
- mat_dst[idx] = mat_src[idx];\r
- }\r
-}\r
-\r
-template<typename T>\r
-void copy_to_with_mask_run(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream)\r
-{\r
- dim3 threadsPerBlock(16,16, 1);\r
- dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);\r
-\r
- copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>\r
- ((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall ( cudaDeviceSynchronize() );\r
-}\r
-\r
-void copy_to_with_mask(const DevMem2Db& mat_src, DevMem2Db mat_dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream)\r
-{\r
- typedef void (*CopyToFunc)(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream);\r
-\r
- static CopyToFunc tab[8] =\r
+ template <typename T> struct shift_and_sizeof;\r
+ template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };\r
+ template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };\r
+ template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };\r
+ template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };\r
+ template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };\r
+ template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };\r
+ template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };\r
+\r
+ ///////////////////////////////////////////////////////////////////////////\r
+ ////////////////////////////////// CopyTo /////////////////////////////////\r
+ ///////////////////////////////////////////////////////////////////////////\r
+\r
+ template<typename T>\r
+ __global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)\r
{\r
- copy_to_with_mask_run<unsigned char>,\r
- copy_to_with_mask_run<signed char>,\r
- copy_to_with_mask_run<unsigned short>,\r
- copy_to_with_mask_run<short>,\r
- copy_to_with_mask_run<int>,\r
- copy_to_with_mask_run<float>,\r
- copy_to_with_mask_run<double>,\r
- 0\r
- };\r
+ size_t x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ size_t y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if ((x < cols * channels ) && (y < rows))\r
+ if (mask[y * step_mask + x / channels] != 0)\r
+ {\r
+ size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;\r
+ mat_dst[idx] = mat_src[idx];\r
+ }\r
+ }\r
\r
- CopyToFunc func = tab[depth];\r
+ template<typename T>\r
+ void copy_to_with_mask_run(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream)\r
+ {\r
+ dim3 threadsPerBlock(16,16, 1);\r
+ dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);\r
\r
- if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);\r
+ copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>\r
+ ((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- func(mat_src, mat_dst, mask, channels, stream);\r
-}\r
+ if (stream == 0)\r
+ cudaSafeCall ( cudaDeviceSynchronize() );\r
+ }\r
\r
-///////////////////////////////////////////////////////////////////////////\r
-////////////////////////////////// SetTo //////////////////////////////////\r
-///////////////////////////////////////////////////////////////////////////\r
+ void copy_to_with_mask(const DevMem2Db& mat_src, DevMem2Db mat_dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream)\r
+ {\r
+ typedef void (*CopyToFunc)(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream);\r
\r
-__constant__ uchar scalar_8u[4];\r
-__constant__ schar scalar_8s[4];\r
-__constant__ ushort scalar_16u[4];\r
-__constant__ short scalar_16s[4];\r
-__constant__ int scalar_32s[4];\r
-__constant__ float scalar_32f[4]; \r
-__constant__ double scalar_64f[4];\r
+ static CopyToFunc tab[8] =\r
+ {\r
+ copy_to_with_mask_run<unsigned char>,\r
+ copy_to_with_mask_run<signed char>,\r
+ copy_to_with_mask_run<unsigned short>,\r
+ copy_to_with_mask_run<short>,\r
+ copy_to_with_mask_run<int>,\r
+ copy_to_with_mask_run<float>,\r
+ copy_to_with_mask_run<double>,\r
+ 0\r
+ };\r
\r
-template <typename T> __device__ __forceinline__ T readScalar(int i);\r
-template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}\r
-template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}\r
-template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}\r
-template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}\r
-template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}\r
-template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}\r
-template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];}\r
+ CopyToFunc func = tab[depth];\r
\r
-void writeScalar(const uchar* vals)\r
-{\r
- cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );\r
-}\r
-void writeScalar(const schar* vals)\r
-{\r
- cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );\r
-}\r
-void writeScalar(const ushort* vals)\r
-{\r
- cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );\r
-}\r
-void writeScalar(const short* vals)\r
-{\r
- cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );\r
-}\r
-void writeScalar(const int* vals)\r
-{\r
- cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );\r
-}\r
-void writeScalar(const float* vals)\r
-{\r
- cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );\r
-}\r
-void writeScalar(const double* vals)\r
-{\r
- cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );\r
-}\r
+ if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);\r
\r
-template<typename T>\r
-__global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)\r
-{\r
- size_t x = blockIdx.x * blockDim.x + threadIdx.x;\r
- size_t y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ func(mat_src, mat_dst, mask, channels, stream);\r
+ }\r
\r
- if ((x < cols * channels ) && (y < rows))\r
+ ///////////////////////////////////////////////////////////////////////////\r
+ ////////////////////////////////// SetTo //////////////////////////////////\r
+ ///////////////////////////////////////////////////////////////////////////\r
+\r
+ __constant__ uchar scalar_8u[4];\r
+ __constant__ schar scalar_8s[4];\r
+ __constant__ ushort scalar_16u[4];\r
+ __constant__ short scalar_16s[4];\r
+ __constant__ int scalar_32s[4];\r
+ __constant__ float scalar_32f[4]; \r
+ __constant__ double scalar_64f[4];\r
+\r
+ template <typename T> __device__ __forceinline__ T readScalar(int i);\r
+ template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}\r
+ template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}\r
+ template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}\r
+ template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}\r
+ template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}\r
+ template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}\r
+ template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];}\r
+\r
+ void writeScalar(const uchar* vals)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );\r
+ }\r
+ void writeScalar(const schar* vals)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );\r
+ }\r
+ void writeScalar(const ushort* vals)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );\r
+ }\r
+ void writeScalar(const short* vals)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );\r
+ }\r
+ void writeScalar(const int* vals)\r
{\r
- size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;\r
- mat[idx] = readScalar<T>(x % channels);\r
+ cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );\r
+ }\r
+ void writeScalar(const float* vals)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );\r
+ }\r
+ void writeScalar(const double* vals)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );\r
}\r
-}\r
\r
-template<typename T>\r
-__global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)\r
-{\r
- size_t x = blockIdx.x * blockDim.x + threadIdx.x;\r
- size_t y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ template<typename T>\r
+ __global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)\r
+ {\r
+ size_t x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ size_t y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- if ((x < cols * channels ) && (y < rows))\r
- if (mask[y * step_mask + x / channels] != 0)\r
+ if ((x < cols * channels ) && (y < rows))\r
{\r
size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;\r
mat[idx] = readScalar<T>(x % channels);\r
}\r
-}\r
-template <typename T>\r
-void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream)\r
-{\r
- writeScalar(scalar);\r
+ }\r
\r
- dim3 threadsPerBlock(32, 8, 1);\r
- dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);\r
+ template<typename T>\r
+ __global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)\r
+ {\r
+ size_t x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ size_t y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if ((x < cols * channels ) && (y < rows))\r
+ if (mask[y * step_mask + x / channels] != 0)\r
+ {\r
+ size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;\r
+ mat[idx] = readScalar<T>(x % channels);\r
+ }\r
+ }\r
+ template <typename T>\r
+ void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream)\r
+ {\r
+ writeScalar(scalar);\r
\r
- set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);\r
- cudaSafeCall( cudaGetLastError() );\r
+ dim3 threadsPerBlock(32, 8, 1);\r
+ dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);\r
\r
- if (stream == 0)\r
- cudaSafeCall ( cudaDeviceSynchronize() );\r
-}\r
+ set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
-template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
-template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
-template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
-template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
-template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
-template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+ if (stream == 0)\r
+ cudaSafeCall ( cudaDeviceSynchronize() );\r
+ }\r
\r
-template <typename T>\r
-void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream)\r
-{\r
- writeScalar(scalar);\r
+ template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
\r
- dim3 threadsPerBlock(32, 8, 1);\r
- dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);\r
+ template <typename T>\r
+ void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream)\r
+ {\r
+ writeScalar(scalar);\r
\r
- set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);\r
- cudaSafeCall( cudaGetLastError() );\r
+ dim3 threadsPerBlock(32, 8, 1);\r
+ dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);\r
\r
- if (stream == 0)\r
- cudaSafeCall ( cudaDeviceSynchronize() );\r
-}\r
+ set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, int channels, cudaStream_t stream);\r
-template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, int channels, cudaStream_t stream);\r
-template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, int channels, cudaStream_t stream);\r
-template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, int channels, cudaStream_t stream);\r
-template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, int channels, cudaStream_t stream);\r
-template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, int channels, cudaStream_t stream);\r
-template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, int channels, cudaStream_t stream);\r
+ if (stream == 0)\r
+ cudaSafeCall ( cudaDeviceSynchronize() );\r
+ }\r
\r
-///////////////////////////////////////////////////////////////////////////\r
-//////////////////////////////// ConvertTo ////////////////////////////////\r
-///////////////////////////////////////////////////////////////////////////\r
+ template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, int channels, cudaStream_t stream);\r
+ template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, int channels, cudaStream_t stream);\r
\r
-template <typename T, typename D> struct Convertor : unary_function<T, D>\r
-{\r
- Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}\r
+ ///////////////////////////////////////////////////////////////////////////\r
+ //////////////////////////////// ConvertTo ////////////////////////////////\r
+ ///////////////////////////////////////////////////////////////////////////\r
\r
- __device__ __forceinline__ D operator()(const T& src) const\r
+ template <typename T, typename D> struct Convertor : unary_function<T, D>\r
{\r
- return saturate_cast<D>(alpha * src + beta);\r
- }\r
+ Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}\r
\r
- const double alpha, beta;\r
-};\r
+ __device__ __forceinline__ D operator()(const T& src) const\r
+ {\r
+ return saturate_cast<D>(alpha * src + beta);\r
+ }\r
\r
-namespace detail\r
-{\r
- template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>\r
- {\r
- };\r
- template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>\r
- {\r
- enum { smart_shift = 8 };\r
- };\r
- template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>\r
- {\r
- enum { smart_shift = 4 };\r
- };\r
- template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>\r
- {\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
+ const double alpha, beta;\r
};\r
\r
- template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>\r
+ namespace detail\r
{\r
- enum { smart_shift = 4 };\r
- };\r
- template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>\r
- {\r
- enum { smart_shift = 2 };\r
- };\r
+ template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>\r
+ {\r
+ };\r
+ template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>\r
+ {\r
+ enum { smart_shift = 8 };\r
+ };\r
+ template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>\r
+ {\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
\r
- template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>\r
- {\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 4 };\r
- };\r
- template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>\r
- {\r
- enum { smart_block_dim_y = 8 };\r
- enum { smart_shift = 2 };\r
- };\r
+ template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>\r
+ {\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>\r
+ {\r
+ enum { smart_shift = 2 };\r
+ };\r
+\r
+ template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 4 };\r
+ };\r
+ template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>\r
+ {\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 2 };\r
+ };\r
+\r
+ template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>\r
+ {\r
+ };\r
+ }\r
\r
- template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>\r
+ template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >\r
{\r
};\r
-}\r
-\r
-template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >\r
-{\r
-};\r
- \r
-template<typename T, typename D>\r
-void cvt_(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, cudaStream_t stream)\r
-{\r
- cudaSafeCall( cudaSetDoubleForDevice(&alpha) );\r
- cudaSafeCall( cudaSetDoubleForDevice(&beta) );\r
- Convertor<T, D> op(alpha, beta);\r
- OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);\r
-}\r
-\r
-void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, \r
- cudaStream_t stream = 0)\r
-{\r
- typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, \r
- cudaStream_t stream);\r
+ \r
+ template<typename T, typename D>\r
+ void cvt_(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, cudaStream_t stream)\r
+ {\r
+ cudaSafeCall( cudaSetDoubleForDevice(&alpha) );\r
+ cudaSafeCall( cudaSetDoubleForDevice(&beta) );\r
+ Convertor<T, D> op(alpha, beta);\r
+ ::cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);\r
+ }\r
\r
- static const caller_t tab[8][8] =\r
+ void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, \r
+ cudaStream_t stream = 0)\r
{\r
- {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,\r
- cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},\r
+ typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, \r
+ cudaStream_t stream);\r
\r
- {cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,\r
- cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},\r
+ static const caller_t tab[8][8] =\r
+ {\r
+ {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,\r
+ cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},\r
\r
- {cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,\r
- cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},\r
+ {cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,\r
+ cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},\r
\r
- {cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,\r
- cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},\r
+ {cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,\r
+ cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},\r
\r
- {cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,\r
- cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},\r
+ {cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,\r
+ cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},\r
\r
- {cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,\r
- cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},\r
+ {cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,\r
+ cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},\r
\r
- {cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,\r
- cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},\r
+ {cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,\r
+ cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},\r
\r
- {0,0,0,0,0,0,0,0}\r
- };\r
+ {cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,\r
+ cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},\r
\r
- caller_t func = tab[sdepth][ddepth];\r
- if (!func)\r
- cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);\r
+ {0,0,0,0,0,0,0,0}\r
+ };\r
\r
- func(src, dst, alpha, beta, stream);\r
-}\r
+ caller_t func = tab[sdepth][ddepth];\r
+ if (!func)\r
+ cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ func(src, dst, alpha, beta, stream);\r
+ }\r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "opencv2/gpu/device/vec_math.hpp"\r
#include "opencv2/gpu/device/transform.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace matrix_reductions {\r
-\r
-// Performs reduction in shared memory\r
-template <int size, typename T>\r
-__device__ void sumInSmem(volatile T* data, const uint tid)\r
-{\r
- T sum = data[tid];\r
-\r
- if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }\r
- if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }\r
- if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }\r
-\r
- if (tid < 32)\r
- {\r
- if (size >= 64) data[tid] = sum = sum + data[tid + 32];\r
- if (size >= 32) data[tid] = sum = sum + data[tid + 16];\r
- if (size >= 16) data[tid] = sum = sum + data[tid + 8];\r
- if (size >= 8) data[tid] = sum = sum + data[tid + 4];\r
- if (size >= 4) data[tid] = sum = sum + data[tid + 2];\r
- if (size >= 2) data[tid] = sum = sum + data[tid + 1];\r
- }\r
-}\r
-\r
-struct Mask8U\r
-{\r
- explicit Mask8U(PtrStepb mask): mask(mask) {}\r
-\r
- __device__ __forceinline__ bool operator()(int y, int x) const \r
- { \r
- return mask.ptr(y)[x]; \r
- }\r
-\r
- PtrStepb mask;\r
-};\r
-\r
-struct MaskTrue \r
-{ \r
- __device__ __forceinline__ bool operator()(int y, int x) const \r
- { \r
- return true; \r
- } \r
-};\r
-\r
-//////////////////////////////////////////////////////////////////////////////\r
-// Min max\r
-\r
-// To avoid shared bank conflicts we convert each value into value of \r
-// appropriate type (32 bits minimum)\r
-template <typename T> struct MinMaxTypeTraits {};\r
-template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };\r
-template <> struct MinMaxTypeTraits<char> { typedef int best_type; };\r
-template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };\r
-template <> struct MinMaxTypeTraits<short> { typedef int best_type; };\r
-template <> struct MinMaxTypeTraits<int> { typedef int best_type; };\r
-template <> struct MinMaxTypeTraits<float> { typedef float best_type; };\r
-template <> struct MinMaxTypeTraits<double> { typedef double best_type; };\r
-\r
-namespace minmax \r
+namespace cv { namespace gpu { namespace device \r
{\r
- __constant__ int ctwidth;\r
- __constant__ int ctheight;\r
-\r
- // Global counter of blocks finished its work\r
- __device__ uint blocks_finished = 0;\r
-\r
-\r
- // Estimates good thread configuration\r
- // - threads variable satisfies to threads.x * threads.y == 256\r
- void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)\r
- {\r
- threads = dim3(32, 8);\r
- grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));\r
- grid.x = std::min(grid.x, threads.x);\r
- grid.y = std::min(grid.y, threads.y);\r
- }\r
-\r
-\r
- // Returns required buffer sizes\r
- void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows)\r
- {\r
- dim3 threads, grid;\r
- estimateThreadCfg(cols, rows, threads, grid);\r
- bufcols = grid.x * grid.y * elem_size; \r
- bufrows = 2;\r
- }\r
-\r
-\r
- // Estimates device constants which are used in the kernels using specified thread configuration\r
- void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)\r
- { \r
- int twidth = divUp(divUp(cols, grid.x), threads.x);\r
- int theight = divUp(divUp(rows, grid.y), threads.y);\r
- cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth))); \r
- cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight))); \r
- } \r
-\r
-\r
- // Does min and max in shared memory\r
- template <typename T>\r
- __device__ __forceinline__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval)\r
+ namespace matrix_reductions \r
{\r
- minval[tid] = ::min(minval[tid], minval[tid + offset]);\r
- maxval[tid] = ::max(maxval[tid], maxval[tid + offset]);\r
- }\r
-\r
-\r
- template <int size, typename T>\r
- __device__ void findMinMaxInSmem(volatile T* minval, volatile T* maxval, const uint tid)\r
- {\r
- if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); }\r
- if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); }\r
- if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval); } __syncthreads(); }\r
-\r
- if (tid < 32)\r
+ // Performs reduction in shared memory\r
+ template <int size, typename T>\r
+ __device__ void sumInSmem(volatile T* data, const uint tid)\r
{\r
- if (size >= 64) merge(tid, 32, minval, maxval);\r
- if (size >= 32) merge(tid, 16, minval, maxval);\r
- if (size >= 16) merge(tid, 8, minval, maxval);\r
- if (size >= 8) merge(tid, 4, minval, maxval);\r
- if (size >= 4) merge(tid, 2, minval, maxval);\r
- if (size >= 2) merge(tid, 1, minval, maxval);\r
- }\r
- }\r
+ T sum = data[tid];\r
\r
+ if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }\r
+ if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }\r
+ if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }\r
\r
- template <int nthreads, typename T, typename Mask>\r
- __global__ void minMaxKernel(const DevMem2Db src, Mask mask, T* minval, T* maxval)\r
- {\r
- typedef typename MinMaxTypeTraits<T>::best_type best_type;\r
- __shared__ best_type sminval[nthreads];\r
- __shared__ best_type smaxval[nthreads];\r
-\r
- uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;\r
- uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;\r
- uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
-\r
- T mymin = numeric_limits<T>::max();\r
- T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min();\r
- uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);\r
- uint x_end = ::min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);\r
- for (uint y = y0; y < y_end; y += blockDim.y)\r
- {\r
- const T* src_row = (const T*)src.ptr(y);\r
- for (uint x = x0; x < x_end; x += blockDim.x)\r
- {\r
- T val = src_row[x];\r
- if (mask(y, x)) \r
- { \r
- mymin = ::min(mymin, val); \r
- mymax = ::max(mymax, val); \r
- }\r
+ if (tid < 32)\r
+ {\r
+ if (size >= 64) data[tid] = sum = sum + data[tid + 32];\r
+ if (size >= 32) data[tid] = sum = sum + data[tid + 16];\r
+ if (size >= 16) data[tid] = sum = sum + data[tid + 8];\r
+ if (size >= 8) data[tid] = sum = sum + data[tid + 4];\r
+ if (size >= 4) data[tid] = sum = sum + data[tid + 2];\r
+ if (size >= 2) data[tid] = sum = sum + data[tid + 1];\r
}\r
}\r
\r
- sminval[tid] = mymin;\r
- smaxval[tid] = mymax;\r
- __syncthreads();\r
-\r
- findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);\r
-\r
- if (tid == 0) \r
+ struct Mask8U\r
{\r
- minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
- maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
- }\r
-\r
-#if __CUDA_ARCH__ >= 110\r
- __shared__ bool is_last;\r
+ explicit Mask8U(PtrStepb mask): mask(mask) {}\r
\r
- if (tid == 0)\r
- {\r
- minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
- maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
- __threadfence();\r
+ __device__ __forceinline__ bool operator()(int y, int x) const \r
+ { \r
+ return mask.ptr(y)[x]; \r
+ }\r
\r
- uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
- is_last = ticket == gridDim.x * gridDim.y - 1;\r
- }\r
+ PtrStepb mask;\r
+ };\r
+\r
+ struct MaskTrue \r
+ { \r
+ __device__ __forceinline__ bool operator()(int y, int x) const \r
+ { \r
+ return true; \r
+ } \r
+ };\r
+\r
+ //////////////////////////////////////////////////////////////////////////////\r
+ // Min max\r
+\r
+ // To avoid shared bank conflicts we convert each value into value of \r
+ // appropriate type (32 bits minimum)\r
+ template <typename T> struct MinMaxTypeTraits {};\r
+ template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };\r
+ template <> struct MinMaxTypeTraits<char> { typedef int best_type; };\r
+ template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };\r
+ template <> struct MinMaxTypeTraits<short> { typedef int best_type; };\r
+ template <> struct MinMaxTypeTraits<int> { typedef int best_type; };\r
+ template <> struct MinMaxTypeTraits<float> { typedef float best_type; };\r
+ template <> struct MinMaxTypeTraits<double> { typedef double best_type; };\r
+\r
+ namespace minmax \r
+ {\r
+ __constant__ int ctwidth;\r
+ __constant__ int ctheight;\r
\r
- __syncthreads();\r
+ // Global counter of blocks finished its work\r
+ __device__ uint blocks_finished = 0;\r
\r
- if (is_last)\r
- {\r
- uint idx = ::min(tid, gridDim.x * gridDim.y - 1);\r
\r
- sminval[tid] = minval[idx];\r
- smaxval[tid] = maxval[idx];\r
- __syncthreads();\r
+ // Estimates good thread configuration\r
+ // - threads variable satisfies to threads.x * threads.y == 256\r
+ void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)\r
+ {\r
+ threads = dim3(32, 8);\r
+ grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));\r
+ grid.x = std::min(grid.x, threads.x);\r
+ grid.y = std::min(grid.y, threads.y);\r
+ }\r
\r
- findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);\r
\r
- if (tid == 0) \r
+ // Returns required buffer sizes\r
+ void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows)\r
{\r
- minval[0] = (T)sminval[0];\r
- maxval[0] = (T)smaxval[0];\r
- blocks_finished = 0;\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(cols, rows, threads, grid);\r
+ bufcols = grid.x * grid.y * elem_size; \r
+ bufrows = 2;\r
}\r
- }\r
-#else\r
- if (tid == 0) \r
- {\r
- minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
- maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
- }\r
-#endif\r
- }\r
\r
- \r
- template <typename T>\r
- void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)\r
- {\r
- dim3 threads, grid;\r
- estimateThreadCfg(src.cols, src.rows, threads, grid);\r
- setKernelConsts(src.cols, src.rows, threads, grid);\r
\r
- T* minval_buf = (T*)buf.ptr(0);\r
- T* maxval_buf = (T*)buf.ptr(1);\r
+ // Estimates device constants which are used in the kernels using specified thread configuration\r
+ void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)\r
+ { \r
+ int twidth = divUp(divUp(cols, grid.x), threads.x);\r
+ int theight = divUp(divUp(rows, grid.y), threads.y);\r
+ cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth))); \r
+ cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight))); \r
+ } \r
\r
- minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);\r
- cudaSafeCall( cudaGetLastError() );\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ // Does min and max in shared memory\r
+ template <typename T>\r
+ __device__ __forceinline__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval)\r
+ {\r
+ minval[tid] = ::min(minval[tid], minval[tid + offset]);\r
+ maxval[tid] = ::max(maxval[tid], maxval[tid + offset]);\r
+ }\r
\r
- T minval_, maxval_;\r
- cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
- cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
- *minval = minval_;\r
- *maxval = maxval_;\r
- } \r
\r
- template void minMaxMaskCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
- template void minMaxMaskCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
- template void minMaxMaskCaller<ushort>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
- template void minMaxMaskCaller<short>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
- template void minMaxMaskCaller<int>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
- template void minMaxMaskCaller<float>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
- template void minMaxMaskCaller<double>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
+ template <int size, typename T>\r
+ __device__ void findMinMaxInSmem(volatile T* minval, volatile T* maxval, const uint tid)\r
+ {\r
+ if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); }\r
+ if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); }\r
+ if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval); } __syncthreads(); }\r
\r
+ if (tid < 32)\r
+ {\r
+ if (size >= 64) merge(tid, 32, minval, maxval);\r
+ if (size >= 32) merge(tid, 16, minval, maxval);\r
+ if (size >= 16) merge(tid, 8, minval, maxval);\r
+ if (size >= 8) merge(tid, 4, minval, maxval);\r
+ if (size >= 4) merge(tid, 2, minval, maxval);\r
+ if (size >= 2) merge(tid, 1, minval, maxval);\r
+ }\r
+ }\r
\r
- template <typename T>\r
- void minMaxCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf)\r
- {\r
- dim3 threads, grid;\r
- estimateThreadCfg(src.cols, src.rows, threads, grid);\r
- setKernelConsts(src.cols, src.rows, threads, grid);\r
\r
- T* minval_buf = (T*)buf.ptr(0);\r
- T* maxval_buf = (T*)buf.ptr(1);\r
+ template <int nthreads, typename T, typename Mask>\r
+ __global__ void minMaxKernel(const DevMem2Db src, Mask mask, T* minval, T* maxval)\r
+ {\r
+ typedef typename MinMaxTypeTraits<T>::best_type best_type;\r
+ __shared__ best_type sminval[nthreads];\r
+ __shared__ best_type smaxval[nthreads];\r
+\r
+ uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;\r
+ uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;\r
+ uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+\r
+ T mymin = numeric_limits<T>::max();\r
+ T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min();\r
+ uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);\r
+ uint x_end = ::min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);\r
+ for (uint y = y0; y < y_end; y += blockDim.y)\r
+ {\r
+ const T* src_row = (const T*)src.ptr(y);\r
+ for (uint x = x0; x < x_end; x += blockDim.x)\r
+ {\r
+ T val = src_row[x];\r
+ if (mask(y, x)) \r
+ { \r
+ mymin = ::min(mymin, val); \r
+ mymax = ::max(mymax, val); \r
+ }\r
+ }\r
+ }\r
\r
- minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);\r
- cudaSafeCall( cudaGetLastError() );\r
+ sminval[tid] = mymin;\r
+ smaxval[tid] = mymax;\r
+ __syncthreads();\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);\r
\r
- T minval_, maxval_;\r
- cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
- cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
- *minval = minval_;\r
- *maxval = maxval_;\r
- } \r
+ if (tid == 0) \r
+ {\r
+ minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
+ maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
+ }\r
\r
- template void minMaxCaller<uchar>(const DevMem2Db, double*, double*, PtrStepb);\r
- template void minMaxCaller<char>(const DevMem2Db, double*, double*, PtrStepb);\r
- template void minMaxCaller<ushort>(const DevMem2Db, double*, double*, PtrStepb);\r
- template void minMaxCaller<short>(const DevMem2Db, double*, double*, PtrStepb);\r
- template void minMaxCaller<int>(const DevMem2Db, double*, double*, PtrStepb);\r
- template void minMaxCaller<float>(const DevMem2Db, double*,double*, PtrStepb);\r
- template void minMaxCaller<double>(const DevMem2Db, double*, double*, PtrStepb);\r
+ #if __CUDA_ARCH__ >= 110\r
+ __shared__ bool is_last;\r
\r
+ if (tid == 0)\r
+ {\r
+ minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
+ maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
+ __threadfence();\r
\r
- template <int nthreads, typename T>\r
- __global__ void minMaxPass2Kernel(T* minval, T* maxval, int size)\r
- {\r
- typedef typename MinMaxTypeTraits<T>::best_type best_type;\r
- __shared__ best_type sminval[nthreads];\r
- __shared__ best_type smaxval[nthreads];\r
- \r
- uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
- uint idx = ::min(tid, size - 1);\r
+ uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
+ is_last = ticket == gridDim.x * gridDim.y - 1;\r
+ }\r
\r
- sminval[tid] = minval[idx];\r
- smaxval[tid] = maxval[idx];\r
- __syncthreads();\r
+ __syncthreads();\r
\r
- findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);\r
+ if (is_last)\r
+ {\r
+ uint idx = ::min(tid, gridDim.x * gridDim.y - 1);\r
\r
- if (tid == 0) \r
- {\r
- minval[0] = (T)sminval[0];\r
- maxval[0] = (T)smaxval[0];\r
- }\r
- }\r
+ sminval[tid] = minval[idx];\r
+ smaxval[tid] = maxval[idx];\r
+ __syncthreads();\r
\r
+ findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);\r
\r
- template <typename T>\r
- void minMaxMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)\r
- {\r
- dim3 threads, grid;\r
- estimateThreadCfg(src.cols, src.rows, threads, grid);\r
- setKernelConsts(src.cols, src.rows, threads, grid);\r
+ if (tid == 0) \r
+ {\r
+ minval[0] = (T)sminval[0];\r
+ maxval[0] = (T)smaxval[0];\r
+ blocks_finished = 0;\r
+ }\r
+ }\r
+ #else\r
+ if (tid == 0) \r
+ {\r
+ minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
+ maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
+ }\r
+ #endif\r
+ }\r
\r
- T* minval_buf = (T*)buf.ptr(0);\r
- T* maxval_buf = (T*)buf.ptr(1);\r
+ \r
+ template <typename T>\r
+ void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)\r
+ {\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
\r
- minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);\r
- cudaSafeCall( cudaGetLastError() );\r
- minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
+ T* minval_buf = (T*)buf.ptr(0);\r
+ T* maxval_buf = (T*)buf.ptr(1);\r
\r
- cudaSafeCall(cudaDeviceSynchronize());\r
+ minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- T minval_, maxval_;\r
- cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
- cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
- *minval = minval_;\r
- *maxval = maxval_;\r
- }\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
- template void minMaxMaskMultipassCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
- template void minMaxMaskMultipassCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
- template void minMaxMaskMultipassCaller<ushort>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
- template void minMaxMaskMultipassCaller<short>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
- template void minMaxMaskMultipassCaller<int>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
- template void minMaxMaskMultipassCaller<float>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
+ T minval_, maxval_;\r
+ cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
+ cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
+ *minval = minval_;\r
+ *maxval = maxval_;\r
+ } \r
\r
+ template void minMaxMaskCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
+ template void minMaxMaskCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
+ template void minMaxMaskCaller<ushort>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
+ template void minMaxMaskCaller<short>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
+ template void minMaxMaskCaller<int>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
+ template void minMaxMaskCaller<float>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
+ template void minMaxMaskCaller<double>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
\r
- template <typename T>\r
- void minMaxMultipassCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf)\r
- {\r
- dim3 threads, grid;\r
- estimateThreadCfg(src.cols, src.rows, threads, grid);\r
- setKernelConsts(src.cols, src.rows, threads, grid);\r
-\r
- T* minval_buf = (T*)buf.ptr(0);\r
- T* maxval_buf = (T*)buf.ptr(1);\r
-\r
- minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);\r
- cudaSafeCall( cudaGetLastError() );\r
- minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-\r
- T minval_, maxval_;\r
- cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
- cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
- *minval = minval_;\r
- *maxval = maxval_;\r
- }\r
-\r
- template void minMaxMultipassCaller<uchar>(const DevMem2Db, double*, double*, PtrStepb);\r
- template void minMaxMultipassCaller<char>(const DevMem2Db, double*, double*, PtrStepb);\r
- template void minMaxMultipassCaller<ushort>(const DevMem2Db, double*, double*, PtrStepb);\r
- template void minMaxMultipassCaller<short>(const DevMem2Db, double*, double*, PtrStepb);\r
- template void minMaxMultipassCaller<int>(const DevMem2Db, double*, double*, PtrStepb);\r
- template void minMaxMultipassCaller<float>(const DevMem2Db, double*, double*, PtrStepb);\r
-} // namespace minmax\r
-\r
-///////////////////////////////////////////////////////////////////////////////\r
-// minMaxLoc\r
-\r
-namespace minmaxloc \r
-{\r
- __constant__ int ctwidth;\r
- __constant__ int ctheight;\r
\r
- // Global counter of blocks finished its work\r
- __device__ uint blocks_finished = 0;\r
+ template <typename T>\r
+ void minMaxCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf)\r
+ {\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
\r
+ T* minval_buf = (T*)buf.ptr(0);\r
+ T* maxval_buf = (T*)buf.ptr(1);\r
\r
- // Estimates good thread configuration\r
- // - threads variable satisfies to threads.x * threads.y == 256\r
- void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)\r
- {\r
- threads = dim3(32, 8);\r
- grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));\r
- grid.x = std::min(grid.x, threads.x);\r
- grid.y = std::min(grid.y, threads.y);\r
- }\r
+ minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
- // Returns required buffer sizes\r
- void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols, \r
- int& b1rows, int& b2cols, int& b2rows)\r
- {\r
- dim3 threads, grid;\r
- estimateThreadCfg(cols, rows, threads, grid);\r
- b1cols = grid.x * grid.y * elem_size; // For values\r
- b1rows = 2;\r
- b2cols = grid.x * grid.y * sizeof(int); // For locations\r
- b2rows = 2;\r
- }\r
-\r
-\r
- // Estimates device constants which are used in the kernels using specified thread configuration\r
- void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)\r
- { \r
- int twidth = divUp(divUp(cols, grid.x), threads.x);\r
- int theight = divUp(divUp(rows, grid.y), threads.y);\r
- cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth))); \r
- cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight))); \r
- } \r
-\r
-\r
- template <typename T>\r
- __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval, \r
- volatile uint* minloc, volatile uint* maxloc)\r
- {\r
- T val = minval[tid + offset];\r
- if (val < minval[tid])\r
- {\r
- minval[tid] = val;\r
- minloc[tid] = minloc[tid + offset];\r
- }\r
- val = maxval[tid + offset];\r
- if (val > maxval[tid])\r
- {\r
- maxval[tid] = val;\r
- maxloc[tid] = maxloc[tid + offset];\r
- }\r
- }\r
+ T minval_, maxval_;\r
+ cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
+ cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
+ *minval = minval_;\r
+ *maxval = maxval_;\r
+ } \r
\r
+ template void minMaxCaller<uchar>(const DevMem2Db, double*, double*, PtrStepb);\r
+ template void minMaxCaller<char>(const DevMem2Db, double*, double*, PtrStepb);\r
+ template void minMaxCaller<ushort>(const DevMem2Db, double*, double*, PtrStepb);\r
+ template void minMaxCaller<short>(const DevMem2Db, double*, double*, PtrStepb);\r
+ template void minMaxCaller<int>(const DevMem2Db, double*, double*, PtrStepb);\r
+ template void minMaxCaller<float>(const DevMem2Db, double*,double*, PtrStepb);\r
+ template void minMaxCaller<double>(const DevMem2Db, double*, double*, PtrStepb);\r
\r
- template <int size, typename T>\r
- __device__ void findMinMaxLocInSmem(volatile T* minval, volatile T* maxval, volatile uint* minloc, \r
- volatile uint* maxloc, const uint tid)\r
- {\r
- if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }\r
- if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); }\r
- if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval, minloc, maxloc); } __syncthreads(); }\r
\r
- if (tid < 32)\r
- {\r
- if (size >= 64) merge(tid, 32, minval, maxval, minloc, maxloc);\r
- if (size >= 32) merge(tid, 16, minval, maxval, minloc, maxloc);\r
- if (size >= 16) merge(tid, 8, minval, maxval, minloc, maxloc);\r
- if (size >= 8) merge(tid, 4, minval, maxval, minloc, maxloc);\r
- if (size >= 4) merge(tid, 2, minval, maxval, minloc, maxloc);\r
- if (size >= 2) merge(tid, 1, minval, maxval, minloc, maxloc);\r
- }\r
- }\r
+ template <int nthreads, typename T>\r
+ __global__ void minMaxPass2Kernel(T* minval, T* maxval, int size)\r
+ {\r
+ typedef typename MinMaxTypeTraits<T>::best_type best_type;\r
+ __shared__ best_type sminval[nthreads];\r
+ __shared__ best_type smaxval[nthreads];\r
+ \r
+ uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ uint idx = ::min(tid, size - 1);\r
\r
+ sminval[tid] = minval[idx];\r
+ smaxval[tid] = maxval[idx];\r
+ __syncthreads();\r
\r
- template <int nthreads, typename T, typename Mask>\r
- __global__ void minMaxLocKernel(const DevMem2Db src, Mask mask, T* minval, T* maxval, \r
- uint* minloc, uint* maxloc)\r
- {\r
- typedef typename MinMaxTypeTraits<T>::best_type best_type;\r
- __shared__ best_type sminval[nthreads];\r
- __shared__ best_type smaxval[nthreads];\r
- __shared__ uint sminloc[nthreads];\r
- __shared__ uint smaxloc[nthreads];\r
-\r
- uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;\r
- uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;\r
- uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
-\r
- T mymin = numeric_limits<T>::max();\r
- T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min(); \r
- uint myminloc = 0;\r
- uint mymaxloc = 0;\r
- uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);\r
- uint x_end = ::min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);\r
-\r
- for (uint y = y0; y < y_end; y += blockDim.y)\r
- {\r
- const T* ptr = (const T*)src.ptr(y);\r
- for (uint x = x0; x < x_end; x += blockDim.x)\r
- {\r
- if (mask(y, x))\r
+ findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);\r
+\r
+ if (tid == 0) \r
{\r
- T val = ptr[x];\r
- if (val <= mymin) { mymin = val; myminloc = y * src.cols + x; }\r
- if (val >= mymax) { mymax = val; mymaxloc = y * src.cols + x; }\r
+ minval[0] = (T)sminval[0];\r
+ maxval[0] = (T)smaxval[0];\r
}\r
}\r
- }\r
\r
- sminval[tid] = mymin; \r
- smaxval[tid] = mymax;\r
- sminloc[tid] = myminloc;\r
- smaxloc[tid] = mymaxloc;\r
- __syncthreads();\r
\r
- findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);\r
-\r
-#if __CUDA_ARCH__ >= 110\r
- __shared__ bool is_last;\r
-\r
- if (tid == 0)\r
- {\r
- minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
- maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
- minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];\r
- maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];\r
- __threadfence();\r
-\r
- uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
- is_last = ticket == gridDim.x * gridDim.y - 1;\r
- }\r
-\r
- __syncthreads();\r
+ template <typename T>\r
+ void minMaxMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)\r
+ {\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
\r
- if (is_last)\r
- {\r
- uint idx = ::min(tid, gridDim.x * gridDim.y - 1);\r
+ T* minval_buf = (T*)buf.ptr(0);\r
+ T* maxval_buf = (T*)buf.ptr(1);\r
\r
- sminval[tid] = minval[idx];\r
- smaxval[tid] = maxval[idx];\r
- sminloc[tid] = minloc[idx];\r
- smaxloc[tid] = maxloc[idx];\r
- __syncthreads();\r
+ minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
+ minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);\r
+ cudaSafeCall(cudaDeviceSynchronize());\r
\r
- if (tid == 0) \r
- {\r
- minval[0] = (T)sminval[0];\r
- maxval[0] = (T)smaxval[0];\r
- minloc[0] = sminloc[0];\r
- maxloc[0] = smaxloc[0];\r
- blocks_finished = 0;\r
+ T minval_, maxval_;\r
+ cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
+ cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
+ *minval = minval_;\r
+ *maxval = maxval_;\r
}\r
- }\r
-#else\r
- if (tid == 0) \r
- {\r
- minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
- maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
- minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];\r
- maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];\r
- }\r
-#endif\r
- }\r
\r
+ template void minMaxMaskMultipassCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
+ template void minMaxMaskMultipassCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
+ template void minMaxMaskMultipassCaller<ushort>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
+ template void minMaxMaskMultipassCaller<short>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
+ template void minMaxMaskMultipassCaller<int>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
+ template void minMaxMaskMultipassCaller<float>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
\r
- template <typename T>\r
- void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, \r
- int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)\r
- {\r
- dim3 threads, grid;\r
- estimateThreadCfg(src.cols, src.rows, threads, grid);\r
- setKernelConsts(src.cols, src.rows, threads, grid);\r
-\r
- T* minval_buf = (T*)valbuf.ptr(0);\r
- T* maxval_buf = (T*)valbuf.ptr(1);\r
- uint* minloc_buf = (uint*)locbuf.ptr(0);\r
- uint* maxloc_buf = (uint*)locbuf.ptr(1);\r
-\r
- minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, \r
- minloc_buf, maxloc_buf);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-\r
- T minval_, maxval_;\r
- cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
- cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
- *minval = minval_;\r
- *maxval = maxval_;\r
-\r
- uint minloc_, maxloc_;\r
- cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost) );\r
- cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost) );\r
- minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;\r
- maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;\r
- }\r
-\r
- template void minMaxLocMaskCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocMaskCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocMaskCaller<ushort>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocMaskCaller<short>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocMaskCaller<int>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocMaskCaller<float>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocMaskCaller<double>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
-\r
-\r
- template <typename T>\r
- void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval, \r
- int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)\r
- {\r
- dim3 threads, grid;\r
- estimateThreadCfg(src.cols, src.rows, threads, grid);\r
- setKernelConsts(src.cols, src.rows, threads, grid);\r
-\r
- T* minval_buf = (T*)valbuf.ptr(0);\r
- T* maxval_buf = (T*)valbuf.ptr(1);\r
- uint* minloc_buf = (uint*)locbuf.ptr(0);\r
- uint* maxloc_buf = (uint*)locbuf.ptr(1);\r
-\r
- minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, \r
- minloc_buf, maxloc_buf);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-\r
- T minval_, maxval_;\r
- cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));\r
- cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));\r
- *minval = minval_;\r
- *maxval = maxval_;\r
-\r
- uint minloc_, maxloc_;\r
- cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
- cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
- minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;\r
- maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;\r
- }\r
-\r
- template void minMaxLocCaller<uchar>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocCaller<char>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocCaller<ushort>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocCaller<short>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocCaller<int>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocCaller<float>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocCaller<double>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
-\r
-\r
- // This kernel will be used only when compute capability is 1.0\r
- template <int nthreads, typename T>\r
- __global__ void minMaxLocPass2Kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size)\r
- {\r
- typedef typename MinMaxTypeTraits<T>::best_type best_type;\r
- __shared__ best_type sminval[nthreads];\r
- __shared__ best_type smaxval[nthreads];\r
- __shared__ uint sminloc[nthreads];\r
- __shared__ uint smaxloc[nthreads];\r
\r
- uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
- uint idx = ::min(tid, size - 1);\r
+ template <typename T>\r
+ void minMaxMultipassCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf)\r
+ {\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
\r
- sminval[tid] = minval[idx];\r
- smaxval[tid] = maxval[idx];\r
- sminloc[tid] = minloc[idx];\r
- smaxloc[tid] = maxloc[idx];\r
- __syncthreads();\r
+ T* minval_buf = (T*)buf.ptr(0);\r
+ T* maxval_buf = (T*)buf.ptr(1);\r
\r
- findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);\r
+ minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
+ minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (tid == 0) \r
- {\r
- minval[0] = (T)sminval[0];\r
- maxval[0] = (T)smaxval[0];\r
- minloc[0] = sminloc[0];\r
- maxloc[0] = smaxloc[0];\r
- }\r
- }\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
+ T minval_, maxval_;\r
+ cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
+ cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
+ *minval = minval_;\r
+ *maxval = maxval_;\r
+ }\r
\r
- template <typename T>\r
- void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, \r
- int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)\r
- {\r
- dim3 threads, grid;\r
- estimateThreadCfg(src.cols, src.rows, threads, grid);\r
- setKernelConsts(src.cols, src.rows, threads, grid);\r
-\r
- T* minval_buf = (T*)valbuf.ptr(0);\r
- T* maxval_buf = (T*)valbuf.ptr(1);\r
- uint* minloc_buf = (uint*)locbuf.ptr(0);\r
- uint* maxloc_buf = (uint*)locbuf.ptr(1);\r
-\r
- minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, \r
- minloc_buf, maxloc_buf);\r
- cudaSafeCall( cudaGetLastError() );\r
- minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-\r
- T minval_, maxval_;\r
- cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));\r
- cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));\r
- *minval = minval_;\r
- *maxval = maxval_;\r
-\r
- uint minloc_, maxloc_;\r
- cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
- cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
- minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;\r
- maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;\r
- }\r
-\r
- template void minMaxLocMaskMultipassCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocMaskMultipassCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocMaskMultipassCaller<ushort>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocMaskMultipassCaller<short>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocMaskMultipassCaller<int>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocMaskMultipassCaller<float>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
-\r
-\r
- template <typename T>\r
- void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval, \r
- int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)\r
- {\r
- dim3 threads, grid;\r
- estimateThreadCfg(src.cols, src.rows, threads, grid);\r
- setKernelConsts(src.cols, src.rows, threads, grid);\r
-\r
- T* minval_buf = (T*)valbuf.ptr(0);\r
- T* maxval_buf = (T*)valbuf.ptr(1);\r
- uint* minloc_buf = (uint*)locbuf.ptr(0);\r
- uint* maxloc_buf = (uint*)locbuf.ptr(1);\r
-\r
- minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, \r
- minloc_buf, maxloc_buf);\r
- cudaSafeCall( cudaGetLastError() );\r
- minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-\r
- T minval_, maxval_;\r
- cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));\r
- cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));\r
- *minval = minval_;\r
- *maxval = maxval_;\r
-\r
- uint minloc_, maxloc_;\r
- cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
- cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
- minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;\r
- maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;\r
- }\r
-\r
- template void minMaxLocMultipassCaller<uchar>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocMultipassCaller<char>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocMultipassCaller<ushort>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocMultipassCaller<short>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocMultipassCaller<int>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
- template void minMaxLocMultipassCaller<float>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
-} // namespace minmaxloc\r
-\r
-//////////////////////////////////////////////////////////////////////////////////////////////////////////\r
-// countNonZero\r
-\r
-namespace countnonzero \r
-{\r
- __constant__ int ctwidth;\r
- __constant__ int ctheight;\r
+ template void minMaxMultipassCaller<uchar>(const DevMem2Db, double*, double*, PtrStepb);\r
+ template void minMaxMultipassCaller<char>(const DevMem2Db, double*, double*, PtrStepb);\r
+ template void minMaxMultipassCaller<ushort>(const DevMem2Db, double*, double*, PtrStepb);\r
+ template void minMaxMultipassCaller<short>(const DevMem2Db, double*, double*, PtrStepb);\r
+ template void minMaxMultipassCaller<int>(const DevMem2Db, double*, double*, PtrStepb);\r
+ template void minMaxMultipassCaller<float>(const DevMem2Db, double*, double*, PtrStepb);\r
+ } // namespace minmax\r
\r
- __device__ uint blocks_finished = 0;\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // minMaxLoc\r
\r
- void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)\r
- {\r
- threads = dim3(32, 8);\r
- grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));\r
- grid.x = std::min(grid.x, threads.x);\r
- grid.y = std::min(grid.y, threads.y);\r
- }\r
+ namespace minmaxloc \r
+ {\r
+ __constant__ int ctwidth;\r
+ __constant__ int ctheight;\r
\r
+ // Global counter of blocks finished its work\r
+ __device__ uint blocks_finished = 0;\r
\r
- void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows)\r
- {\r
- dim3 threads, grid;\r
- estimateThreadCfg(cols, rows, threads, grid);\r
- bufcols = grid.x * grid.y * sizeof(int);\r
- bufrows = 1;\r
- }\r
\r
+ // Estimates good thread configuration\r
+ // - threads variable satisfies to threads.x * threads.y == 256\r
+ void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)\r
+ {\r
+ threads = dim3(32, 8);\r
+ grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));\r
+ grid.x = std::min(grid.x, threads.x);\r
+ grid.y = std::min(grid.y, threads.y);\r
+ }\r
\r
- void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)\r
- { \r
- int twidth = divUp(divUp(cols, grid.x), threads.x);\r
- int theight = divUp(divUp(rows, grid.y), threads.y);\r
- cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth))); \r
- cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); \r
- }\r
\r
+ // Returns required buffer sizes\r
+ void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols, \r
+ int& b1rows, int& b2cols, int& b2rows)\r
+ {\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(cols, rows, threads, grid);\r
+ b1cols = grid.x * grid.y * elem_size; // For values\r
+ b1rows = 2;\r
+ b2cols = grid.x * grid.y * sizeof(int); // For locations\r
+ b2rows = 2;\r
+ }\r
\r
- template <int nthreads, typename T>\r
- __global__ void countNonZeroKernel(const DevMem2Db src, volatile uint* count)\r
- {\r
- __shared__ uint scount[nthreads];\r
\r
- uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;\r
- uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;\r
- uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ // Estimates device constants which are used in the kernels using specified thread configuration\r
+ void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)\r
+ { \r
+ int twidth = divUp(divUp(cols, grid.x), threads.x);\r
+ int theight = divUp(divUp(rows, grid.y), threads.y);\r
+ cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth))); \r
+ cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight))); \r
+ } \r
\r
- uint cnt = 0;\r
- for (uint y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)\r
- {\r
- const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);\r
- for (uint x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)\r
- cnt += ptr[x0 + x * blockDim.x] != 0;\r
- }\r
\r
- scount[tid] = cnt;\r
- __syncthreads();\r
+ template <typename T>\r
+ __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval, \r
+ volatile uint* minloc, volatile uint* maxloc)\r
+ {\r
+ T val = minval[tid + offset];\r
+ if (val < minval[tid])\r
+ {\r
+ minval[tid] = val;\r
+ minloc[tid] = minloc[tid + offset];\r
+ }\r
+ val = maxval[tid + offset];\r
+ if (val > maxval[tid])\r
+ {\r
+ maxval[tid] = val;\r
+ maxloc[tid] = maxloc[tid + offset];\r
+ }\r
+ }\r
\r
- sumInSmem<nthreads, uint>(scount, tid);\r
\r
-#if __CUDA_ARCH__ >= 110\r
- __shared__ bool is_last;\r
+ template <int size, typename T>\r
+ __device__ void findMinMaxLocInSmem(volatile T* minval, volatile T* maxval, volatile uint* minloc, \r
+ volatile uint* maxloc, const uint tid)\r
+ {\r
+ if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }\r
+ if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); }\r
+ if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval, minloc, maxloc); } __syncthreads(); }\r
\r
- if (tid == 0)\r
- {\r
- count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];\r
- __threadfence();\r
+ if (tid < 32)\r
+ {\r
+ if (size >= 64) merge(tid, 32, minval, maxval, minloc, maxloc);\r
+ if (size >= 32) merge(tid, 16, minval, maxval, minloc, maxloc);\r
+ if (size >= 16) merge(tid, 8, minval, maxval, minloc, maxloc);\r
+ if (size >= 8) merge(tid, 4, minval, maxval, minloc, maxloc);\r
+ if (size >= 4) merge(tid, 2, minval, maxval, minloc, maxloc);\r
+ if (size >= 2) merge(tid, 1, minval, maxval, minloc, maxloc);\r
+ }\r
+ }\r
\r
- uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
- is_last = ticket == gridDim.x * gridDim.y - 1;\r
- }\r
\r
- __syncthreads();\r
+ template <int nthreads, typename T, typename Mask>\r
+ __global__ void minMaxLocKernel(const DevMem2Db src, Mask mask, T* minval, T* maxval, \r
+ uint* minloc, uint* maxloc)\r
+ {\r
+ typedef typename MinMaxTypeTraits<T>::best_type best_type;\r
+ __shared__ best_type sminval[nthreads];\r
+ __shared__ best_type smaxval[nthreads];\r
+ __shared__ uint sminloc[nthreads];\r
+ __shared__ uint smaxloc[nthreads];\r
+\r
+ uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;\r
+ uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;\r
+ uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+\r
+ T mymin = numeric_limits<T>::max();\r
+ T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min(); \r
+ uint myminloc = 0;\r
+ uint mymaxloc = 0;\r
+ uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);\r
+ uint x_end = ::min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);\r
+\r
+ for (uint y = y0; y < y_end; y += blockDim.y)\r
+ {\r
+ const T* ptr = (const T*)src.ptr(y);\r
+ for (uint x = x0; x < x_end; x += blockDim.x)\r
+ {\r
+ if (mask(y, x))\r
+ {\r
+ T val = ptr[x];\r
+ if (val <= mymin) { mymin = val; myminloc = y * src.cols + x; }\r
+ if (val >= mymax) { mymax = val; mymaxloc = y * src.cols + x; }\r
+ }\r
+ }\r
+ }\r
\r
- if (is_last)\r
- {\r
- scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;\r
- __syncthreads();\r
+ sminval[tid] = mymin; \r
+ smaxval[tid] = mymax;\r
+ sminloc[tid] = myminloc;\r
+ smaxloc[tid] = mymaxloc;\r
+ __syncthreads();\r
+\r
+ findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);\r
+\r
+ #if __CUDA_ARCH__ >= 110\r
+ __shared__ bool is_last;\r
+\r
+ if (tid == 0)\r
+ {\r
+ minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
+ maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
+ minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];\r
+ maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];\r
+ __threadfence();\r
+\r
+ uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
+ is_last = ticket == gridDim.x * gridDim.y - 1;\r
+ }\r
+\r
+ __syncthreads();\r
+\r
+ if (is_last)\r
+ {\r
+ uint idx = ::min(tid, gridDim.x * gridDim.y - 1);\r
+\r
+ sminval[tid] = minval[idx];\r
+ smaxval[tid] = maxval[idx];\r
+ sminloc[tid] = minloc[idx];\r
+ smaxloc[tid] = maxloc[idx];\r
+ __syncthreads();\r
+\r
+ findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);\r
+\r
+ if (tid == 0) \r
+ {\r
+ minval[0] = (T)sminval[0];\r
+ maxval[0] = (T)smaxval[0];\r
+ minloc[0] = sminloc[0];\r
+ maxloc[0] = smaxloc[0];\r
+ blocks_finished = 0;\r
+ }\r
+ }\r
+ #else\r
+ if (tid == 0) \r
+ {\r
+ minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
+ maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
+ minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];\r
+ maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];\r
+ }\r
+ #endif\r
+ }\r
\r
- sumInSmem<nthreads, uint>(scount, tid);\r
\r
- if (tid == 0) \r
+ template <typename T>\r
+ void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, \r
+ int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)\r
{\r
- count[0] = scount[0];\r
- blocks_finished = 0;\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
+\r
+ T* minval_buf = (T*)valbuf.ptr(0);\r
+ T* maxval_buf = (T*)valbuf.ptr(1);\r
+ uint* minloc_buf = (uint*)locbuf.ptr(0);\r
+ uint* maxloc_buf = (uint*)locbuf.ptr(1);\r
+\r
+ minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, \r
+ minloc_buf, maxloc_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+\r
+ T minval_, maxval_;\r
+ cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
+ cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );\r
+ *minval = minval_;\r
+ *maxval = maxval_;\r
+\r
+ uint minloc_, maxloc_;\r
+ cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost) );\r
+ cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost) );\r
+ minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;\r
+ maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;\r
}\r
- }\r
-#else\r
- if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];\r
-#endif\r
- }\r
\r
- \r
- template <typename T>\r
- int countNonZeroCaller(const DevMem2Db src, PtrStepb buf)\r
- {\r
- dim3 threads, grid;\r
- estimateThreadCfg(src.cols, src.rows, threads, grid);\r
- setKernelConsts(src.cols, src.rows, threads, grid);\r
+ template void minMaxLocMaskCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocMaskCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocMaskCaller<ushort>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocMaskCaller<short>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocMaskCaller<int>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocMaskCaller<float>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocMaskCaller<double>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
\r
- uint* count_buf = (uint*)buf.ptr(0);\r
\r
- countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);\r
- cudaSafeCall( cudaGetLastError() );\r
+ template <typename T>\r
+ void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval, \r
+ int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)\r
+ {\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
+\r
+ T* minval_buf = (T*)valbuf.ptr(0);\r
+ T* maxval_buf = (T*)valbuf.ptr(1);\r
+ uint* minloc_buf = (uint*)locbuf.ptr(0);\r
+ uint* maxloc_buf = (uint*)locbuf.ptr(1);\r
+\r
+ minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, \r
+ minloc_buf, maxloc_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+\r
+ T minval_, maxval_;\r
+ cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));\r
+ cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));\r
+ *minval = minval_;\r
+ *maxval = maxval_;\r
+\r
+ uint minloc_, maxloc_;\r
+ cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
+ cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
+ minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;\r
+ maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;\r
+ }\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ template void minMaxLocCaller<uchar>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocCaller<char>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocCaller<ushort>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocCaller<short>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocCaller<int>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocCaller<float>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocCaller<double>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
\r
- uint count;\r
- cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
- \r
- return count;\r
- } \r
\r
- template int countNonZeroCaller<uchar>(const DevMem2Db, PtrStepb);\r
- template int countNonZeroCaller<char>(const DevMem2Db, PtrStepb);\r
- template int countNonZeroCaller<ushort>(const DevMem2Db, PtrStepb);\r
- template int countNonZeroCaller<short>(const DevMem2Db, PtrStepb);\r
- template int countNonZeroCaller<int>(const DevMem2Db, PtrStepb);\r
- template int countNonZeroCaller<float>(const DevMem2Db, PtrStepb);\r
- template int countNonZeroCaller<double>(const DevMem2Db, PtrStepb);\r
+ // This kernel will be used only when compute capability is 1.0\r
+ template <int nthreads, typename T>\r
+ __global__ void minMaxLocPass2Kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size)\r
+ {\r
+ typedef typename MinMaxTypeTraits<T>::best_type best_type;\r
+ __shared__ best_type sminval[nthreads];\r
+ __shared__ best_type smaxval[nthreads];\r
+ __shared__ uint sminloc[nthreads];\r
+ __shared__ uint smaxloc[nthreads];\r
\r
+ uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ uint idx = ::min(tid, size - 1);\r
\r
- template <int nthreads, typename T>\r
- __global__ void countNonZeroPass2Kernel(uint* count, int size)\r
- {\r
- __shared__ uint scount[nthreads];\r
- uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ sminval[tid] = minval[idx];\r
+ smaxval[tid] = maxval[idx];\r
+ sminloc[tid] = minloc[idx];\r
+ smaxloc[tid] = maxloc[idx];\r
+ __syncthreads();\r
\r
- scount[tid] = tid < size ? count[tid] : 0;\r
- __syncthreads();\r
+ findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);\r
\r
- sumInSmem<nthreads, uint>(scount, tid);\r
-\r
- if (tid == 0) \r
- count[0] = scount[0];\r
- }\r
+ if (tid == 0) \r
+ {\r
+ minval[0] = (T)sminval[0];\r
+ maxval[0] = (T)smaxval[0];\r
+ minloc[0] = sminloc[0];\r
+ maxloc[0] = smaxloc[0];\r
+ }\r
+ }\r
\r
\r
- template <typename T>\r
- int countNonZeroMultipassCaller(const DevMem2Db src, PtrStepb buf)\r
- {\r
- dim3 threads, grid;\r
- estimateThreadCfg(src.cols, src.rows, threads, grid);\r
- setKernelConsts(src.cols, src.rows, threads, grid);\r
+ template <typename T>\r
+ void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, \r
+ int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)\r
+ {\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
+\r
+ T* minval_buf = (T*)valbuf.ptr(0);\r
+ T* maxval_buf = (T*)valbuf.ptr(1);\r
+ uint* minloc_buf = (uint*)locbuf.ptr(0);\r
+ uint* maxloc_buf = (uint*)locbuf.ptr(1);\r
+\r
+ minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, \r
+ minloc_buf, maxloc_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
+ minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+\r
+ T minval_, maxval_;\r
+ cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));\r
+ cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));\r
+ *minval = minval_;\r
+ *maxval = maxval_;\r
+\r
+ uint minloc_, maxloc_;\r
+ cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
+ cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
+ minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;\r
+ maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;\r
+ }\r
\r
- uint* count_buf = (uint*)buf.ptr(0);\r
+ template void minMaxLocMaskMultipassCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocMaskMultipassCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocMaskMultipassCaller<ushort>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocMaskMultipassCaller<short>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocMaskMultipassCaller<int>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocMaskMultipassCaller<float>(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
\r
- countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);\r
- cudaSafeCall( cudaGetLastError() );\r
- countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ template <typename T>\r
+ void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval, \r
+ int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)\r
+ {\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
+\r
+ T* minval_buf = (T*)valbuf.ptr(0);\r
+ T* maxval_buf = (T*)valbuf.ptr(1);\r
+ uint* minloc_buf = (uint*)locbuf.ptr(0);\r
+ uint* maxloc_buf = (uint*)locbuf.ptr(1);\r
+\r
+ minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, \r
+ minloc_buf, maxloc_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
+ minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+\r
+ T minval_, maxval_;\r
+ cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));\r
+ cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));\r
+ *minval = minval_;\r
+ *maxval = maxval_;\r
+\r
+ uint minloc_, maxloc_;\r
+ cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
+ cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
+ minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;\r
+ maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;\r
+ }\r
\r
- uint count;\r
- cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
- \r
- return count;\r
- } \r
+ template void minMaxLocMultipassCaller<uchar>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocMultipassCaller<char>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocMultipassCaller<ushort>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocMultipassCaller<short>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocMultipassCaller<int>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ template void minMaxLocMultipassCaller<float>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
+ } // namespace minmaxloc\r
\r
- template int countNonZeroMultipassCaller<uchar>(const DevMem2Db, PtrStepb);\r
- template int countNonZeroMultipassCaller<char>(const DevMem2Db, PtrStepb);\r
- template int countNonZeroMultipassCaller<ushort>(const DevMem2Db, PtrStepb);\r
- template int countNonZeroMultipassCaller<short>(const DevMem2Db, PtrStepb);\r
- template int countNonZeroMultipassCaller<int>(const DevMem2Db, PtrStepb);\r
- template int countNonZeroMultipassCaller<float>(const DevMem2Db, PtrStepb);\r
+ //////////////////////////////////////////////////////////////////////////////////////////////////////////\r
+ // countNonZero\r
\r
-} // namespace countnonzero\r
+ namespace countnonzero \r
+ {\r
+ __constant__ int ctwidth;\r
+ __constant__ int ctheight;\r
\r
+ __device__ uint blocks_finished = 0;\r
\r
-//////////////////////////////////////////////////////////////////////////\r
-// Sum\r
+ void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)\r
+ {\r
+ threads = dim3(32, 8);\r
+ grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));\r
+ grid.x = std::min(grid.x, threads.x);\r
+ grid.y = std::min(grid.y, threads.y);\r
+ }\r
\r
-namespace sum\r
-{\r
- template <typename T> struct SumType {};\r
- template <> struct SumType<uchar> { typedef uint R; };\r
- template <> struct SumType<char> { typedef int R; };\r
- template <> struct SumType<ushort> { typedef uint R; };\r
- template <> struct SumType<short> { typedef int R; };\r
- template <> struct SumType<int> { typedef int R; };\r
- template <> struct SumType<float> { typedef float R; };\r
- template <> struct SumType<double> { typedef double R; };\r
\r
- template <typename R> \r
- struct IdentityOp { static __device__ __forceinline__ R call(R x) { return x; } };\r
+ void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows)\r
+ {\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(cols, rows, threads, grid);\r
+ bufcols = grid.x * grid.y * sizeof(int);\r
+ bufrows = 1;\r
+ }\r
\r
- template <typename R> \r
- struct AbsOp { static __device__ __forceinline__ R call(R x) { return ::abs(x); } };\r
\r
- template <>\r
- struct AbsOp<uint> { static __device__ __forceinline__ uint call(uint x) { return x; } };\r
+ void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)\r
+ { \r
+ int twidth = divUp(divUp(cols, grid.x), threads.x);\r
+ int theight = divUp(divUp(rows, grid.y), threads.y);\r
+ cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth))); \r
+ cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); \r
+ }\r
\r
- template <typename R> \r
- struct SqrOp { static __device__ __forceinline__ R call(R x) { return x * x; } };\r
\r
- __constant__ int ctwidth;\r
- __constant__ int ctheight;\r
- __device__ uint blocks_finished = 0;\r
+ template <int nthreads, typename T>\r
+ __global__ void countNonZeroKernel(const DevMem2Db src, volatile uint* count)\r
+ {\r
+ __shared__ uint scount[nthreads];\r
\r
- const int threads_x = 32;\r
- const int threads_y = 8;\r
+ uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;\r
+ uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;\r
+ uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
- void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)\r
- {\r
- threads = dim3(threads_x, threads_y);\r
- grid = dim3(divUp(cols, threads.x * threads.y), \r
- divUp(rows, threads.y * threads.x));\r
- grid.x = std::min(grid.x, threads.x);\r
- grid.y = std::min(grid.y, threads.y);\r
- }\r
+ uint cnt = 0;\r
+ for (uint y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)\r
+ {\r
+ const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);\r
+ for (uint x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)\r
+ cnt += ptr[x0 + x * blockDim.x] != 0;\r
+ }\r
+\r
+ scount[tid] = cnt;\r
+ __syncthreads();\r
+\r
+ sumInSmem<nthreads, uint>(scount, tid);\r
+\r
+ #if __CUDA_ARCH__ >= 110\r
+ __shared__ bool is_last;\r
+\r
+ if (tid == 0)\r
+ {\r
+ count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];\r
+ __threadfence();\r
+\r
+ uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
+ is_last = ticket == gridDim.x * gridDim.y - 1;\r
+ }\r
+\r
+ __syncthreads();\r
+\r
+ if (is_last)\r
+ {\r
+ scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;\r
+ __syncthreads();\r
+\r
+ sumInSmem<nthreads, uint>(scount, tid);\r
+\r
+ if (tid == 0) \r
+ {\r
+ count[0] = scount[0];\r
+ blocks_finished = 0;\r
+ }\r
+ }\r
+ #else\r
+ if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];\r
+ #endif\r
+ }\r
\r
+ \r
+ template <typename T>\r
+ int countNonZeroCaller(const DevMem2Db src, PtrStepb buf)\r
+ {\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
\r
- void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows)\r
- {\r
- dim3 threads, grid;\r
- estimateThreadCfg(cols, rows, threads, grid);\r
- bufcols = grid.x * grid.y * sizeof(double) * cn;\r
- bufrows = 1;\r
- }\r
-\r
-\r
- void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)\r
- { \r
- int twidth = divUp(divUp(cols, grid.x), threads.x);\r
- int theight = divUp(divUp(rows, grid.y), threads.y);\r
- cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth))); \r
- cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); \r
- }\r
-\r
- template <typename T, typename R, typename Op, int nthreads>\r
- __global__ void sumKernel(const DevMem2Db src, R* result)\r
- {\r
- __shared__ R smem[nthreads];\r
+ uint* count_buf = (uint*)buf.ptr(0);\r
\r
- const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;\r
- const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;\r
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
- const int bid = blockIdx.y * gridDim.x + blockIdx.x;\r
+ countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- R sum = 0;\r
- for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)\r
- {\r
- const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);\r
- for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)\r
- sum += Op::call(ptr[x0 + x * blockDim.x]);\r
- }\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
- smem[tid] = sum;\r
- __syncthreads();\r
+ uint count;\r
+ cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
+ \r
+ return count;\r
+ } \r
\r
- sumInSmem<nthreads, R>(smem, tid);\r
+ template int countNonZeroCaller<uchar>(const DevMem2Db, PtrStepb);\r
+ template int countNonZeroCaller<char>(const DevMem2Db, PtrStepb);\r
+ template int countNonZeroCaller<ushort>(const DevMem2Db, PtrStepb);\r
+ template int countNonZeroCaller<short>(const DevMem2Db, PtrStepb);\r
+ template int countNonZeroCaller<int>(const DevMem2Db, PtrStepb);\r
+ template int countNonZeroCaller<float>(const DevMem2Db, PtrStepb);\r
+ template int countNonZeroCaller<double>(const DevMem2Db, PtrStepb);\r
\r
-#if __CUDA_ARCH__ >= 110\r
- __shared__ bool is_last;\r
\r
- if (tid == 0)\r
- {\r
- result[bid] = smem[0];\r
- __threadfence();\r
+ template <int nthreads, typename T>\r
+ __global__ void countNonZeroPass2Kernel(uint* count, int size)\r
+ {\r
+ __shared__ uint scount[nthreads];\r
+ uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
- uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
- is_last = (ticket == gridDim.x * gridDim.y - 1);\r
- }\r
+ scount[tid] = tid < size ? count[tid] : 0;\r
+ __syncthreads();\r
\r
- __syncthreads();\r
+ sumInSmem<nthreads, uint>(scount, tid);\r
\r
- if (is_last)\r
- {\r
- smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;\r
- __syncthreads();\r
+ if (tid == 0) \r
+ count[0] = scount[0];\r
+ }\r
\r
- sumInSmem<nthreads, R>(smem, tid);\r
\r
- if (tid == 0) \r
+ template <typename T>\r
+ int countNonZeroMultipassCaller(const DevMem2Db src, PtrStepb buf)\r
{\r
- result[0] = smem[0];\r
- blocks_finished = 0;\r
- }\r
- }\r
-#else\r
- if (tid == 0) result[bid] = smem[0];\r
-#endif\r
- }\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
\r
+ uint* count_buf = (uint*)buf.ptr(0);\r
\r
- template <typename T, typename R, int nthreads>\r
- __global__ void sumPass2Kernel(R* result, int size)\r
- {\r
- __shared__ R smem[nthreads];\r
- int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
-\r
- smem[tid] = tid < size ? result[tid] : 0;\r
- __syncthreads();\r
+ countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);\r
+ cudaSafeCall( cudaGetLastError() );\r
+ countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- sumInSmem<nthreads, R>(smem, tid);\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
- if (tid == 0) \r
- result[0] = smem[0];\r
- }\r
+ uint count;\r
+ cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
+ \r
+ return count;\r
+ } \r
\r
+ template int countNonZeroMultipassCaller<uchar>(const DevMem2Db, PtrStepb);\r
+ template int countNonZeroMultipassCaller<char>(const DevMem2Db, PtrStepb);\r
+ template int countNonZeroMultipassCaller<ushort>(const DevMem2Db, PtrStepb);\r
+ template int countNonZeroMultipassCaller<short>(const DevMem2Db, PtrStepb);\r
+ template int countNonZeroMultipassCaller<int>(const DevMem2Db, PtrStepb);\r
+ template int countNonZeroMultipassCaller<float>(const DevMem2Db, PtrStepb);\r
\r
- template <typename T, typename R, typename Op, int nthreads>\r
- __global__ void sumKernel_C2(const DevMem2Db src, typename TypeVec<R, 2>::vec_type* result)\r
- {\r
- typedef typename TypeVec<T, 2>::vec_type SrcType;\r
- typedef typename TypeVec<R, 2>::vec_type DstType;\r
+ } // namespace countnonzero\r
\r
- __shared__ R smem[nthreads * 2];\r
\r
- const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;\r
- const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;\r
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
- const int bid = blockIdx.y * gridDim.x + blockIdx.x;\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // Sum\r
\r
- SrcType val;\r
- DstType sum = VecTraits<DstType>::all(0);\r
- for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)\r
+ namespace sum\r
{\r
- const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);\r
- for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)\r
- {\r
- val = ptr[x0 + x * blockDim.x];\r
- sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y));\r
- }\r
- }\r
+ template <typename T> struct SumType {};\r
+ template <> struct SumType<uchar> { typedef uint R; };\r
+ template <> struct SumType<char> { typedef int R; };\r
+ template <> struct SumType<ushort> { typedef uint R; };\r
+ template <> struct SumType<short> { typedef int R; };\r
+ template <> struct SumType<int> { typedef int R; };\r
+ template <> struct SumType<float> { typedef float R; };\r
+ template <> struct SumType<double> { typedef double R; };\r
\r
- smem[tid] = sum.x;\r
- smem[tid + nthreads] = sum.y;\r
- __syncthreads();\r
+ template <typename R> \r
+ struct IdentityOp { static __device__ __forceinline__ R call(R x) { return x; } };\r
\r
- sumInSmem<nthreads, R>(smem, tid);\r
- sumInSmem<nthreads, R>(smem + nthreads, tid);\r
+ template <typename R> \r
+ struct AbsOp { static __device__ __forceinline__ R call(R x) { return ::abs(x); } };\r
\r
-#if __CUDA_ARCH__ >= 110\r
- __shared__ bool is_last;\r
+ template <>\r
+ struct AbsOp<uint> { static __device__ __forceinline__ uint call(uint x) { return x; } };\r
\r
- if (tid == 0)\r
- {\r
- DstType res;\r
- res.x = smem[0];\r
- res.y = smem[nthreads];\r
- result[bid] = res;\r
- __threadfence();\r
-\r
- uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
- is_last = (ticket == gridDim.x * gridDim.y - 1);\r
- }\r
-\r
- __syncthreads();\r
+ template <typename R> \r
+ struct SqrOp { static __device__ __forceinline__ R call(R x) { return x * x; } };\r
\r
- if (is_last)\r
- {\r
- DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);\r
- smem[tid] = res.x;\r
- smem[tid + nthreads] = res.y;\r
- __syncthreads();\r
+ __constant__ int ctwidth;\r
+ __constant__ int ctheight;\r
+ __device__ uint blocks_finished = 0;\r
\r
- sumInSmem<nthreads, R>(smem, tid);\r
- sumInSmem<nthreads, R>(smem + nthreads, tid);\r
+ const int threads_x = 32;\r
+ const int threads_y = 8;\r
\r
- if (tid == 0) \r
+ void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)\r
{\r
- res.x = smem[0];\r
- res.y = smem[nthreads];\r
- result[0] = res;\r
- blocks_finished = 0;\r
+ threads = dim3(threads_x, threads_y);\r
+ grid = dim3(divUp(cols, threads.x * threads.y), \r
+ divUp(rows, threads.y * threads.x));\r
+ grid.x = std::min(grid.x, threads.x);\r
+ grid.y = std::min(grid.y, threads.y);\r
}\r
- }\r
-#else\r
- if (tid == 0) \r
- {\r
- DstType res;\r
- res.x = smem[0];\r
- res.y = smem[nthreads];\r
- result[bid] = res;\r
- }\r
-#endif\r
- }\r
\r
\r
- template <typename T, typename R, int nthreads>\r
- __global__ void sumPass2Kernel_C2(typename TypeVec<R, 2>::vec_type* result, int size)\r
- {\r
- typedef typename TypeVec<R, 2>::vec_type DstType;\r
+ void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows)\r
+ {\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(cols, rows, threads, grid);\r
+ bufcols = grid.x * grid.y * sizeof(double) * cn;\r
+ bufrows = 1;\r
+ }\r
\r
- __shared__ R smem[nthreads * 2];\r
\r
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)\r
+ { \r
+ int twidth = divUp(divUp(cols, grid.x), threads.x);\r
+ int theight = divUp(divUp(rows, grid.y), threads.y);\r
+ cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth))); \r
+ cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); \r
+ }\r
\r
- DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);\r
- smem[tid] = res.x;\r
- smem[tid + nthreads] = res.y;\r
- __syncthreads();\r
+ template <typename T, typename R, typename Op, int nthreads>\r
+ __global__ void sumKernel(const DevMem2Db src, R* result)\r
+ {\r
+ __shared__ R smem[nthreads];\r
\r
- sumInSmem<nthreads, R>(smem, tid);\r
- sumInSmem<nthreads, R>(smem + nthreads, tid);\r
+ const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;\r
+ const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ const int bid = blockIdx.y * gridDim.x + blockIdx.x;\r
\r
- if (tid == 0) \r
- {\r
- res.x = smem[0];\r
- res.y = smem[nthreads];\r
- result[0] = res;\r
- }\r
- }\r
+ R sum = 0;\r
+ for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)\r
+ {\r
+ const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);\r
+ for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)\r
+ sum += Op::call(ptr[x0 + x * blockDim.x]);\r
+ }\r
\r
+ smem[tid] = sum;\r
+ __syncthreads();\r
\r
- template <typename T, typename R, typename Op, int nthreads>\r
- __global__ void sumKernel_C3(const DevMem2Db src, typename TypeVec<R, 3>::vec_type* result)\r
- {\r
- typedef typename TypeVec<T, 3>::vec_type SrcType;\r
- typedef typename TypeVec<R, 3>::vec_type DstType;\r
+ sumInSmem<nthreads, R>(smem, tid);\r
\r
- __shared__ R smem[nthreads * 3];\r
+ #if __CUDA_ARCH__ >= 110\r
+ __shared__ bool is_last;\r
\r
- const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;\r
- const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;\r
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
- const int bid = blockIdx.y * gridDim.x + blockIdx.x;\r
+ if (tid == 0)\r
+ {\r
+ result[bid] = smem[0];\r
+ __threadfence();\r
\r
- SrcType val;\r
- DstType sum = VecTraits<DstType>::all(0);\r
- for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)\r
- {\r
- const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);\r
- for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)\r
- {\r
- val = ptr[x0 + x * blockDim.x];\r
- sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y), Op::call(val.z));\r
- }\r
- }\r
+ uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
+ is_last = (ticket == gridDim.x * gridDim.y - 1);\r
+ }\r
\r
- smem[tid] = sum.x;\r
- smem[tid + nthreads] = sum.y;\r
- smem[tid + 2 * nthreads] = sum.z;\r
- __syncthreads();\r
+ __syncthreads();\r
\r
- sumInSmem<nthreads, R>(smem, tid);\r
- sumInSmem<nthreads, R>(smem + nthreads, tid);\r
- sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
+ if (is_last)\r
+ {\r
+ smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;\r
+ __syncthreads();\r
\r
-#if __CUDA_ARCH__ >= 110\r
- __shared__ bool is_last;\r
+ sumInSmem<nthreads, R>(smem, tid);\r
\r
- if (tid == 0)\r
- {\r
- DstType res;\r
- res.x = smem[0];\r
- res.y = smem[nthreads];\r
- res.z = smem[2 * nthreads];\r
- result[bid] = res;\r
- __threadfence();\r
-\r
- uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
- is_last = (ticket == gridDim.x * gridDim.y - 1);\r
- }\r
+ if (tid == 0) \r
+ {\r
+ result[0] = smem[0];\r
+ blocks_finished = 0;\r
+ }\r
+ }\r
+ #else\r
+ if (tid == 0) result[bid] = smem[0];\r
+ #endif\r
+ }\r
\r
- __syncthreads();\r
\r
- if (is_last)\r
- {\r
- DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);\r
- smem[tid] = res.x;\r
- smem[tid + nthreads] = res.y;\r
- smem[tid + 2 * nthreads] = res.z;\r
- __syncthreads();\r
+ template <typename T, typename R, int nthreads>\r
+ __global__ void sumPass2Kernel(R* result, int size)\r
+ {\r
+ __shared__ R smem[nthreads];\r
+ int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
- sumInSmem<nthreads, R>(smem, tid);\r
- sumInSmem<nthreads, R>(smem + nthreads, tid);\r
- sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
+ smem[tid] = tid < size ? result[tid] : 0;\r
+ __syncthreads();\r
\r
- if (tid == 0) \r
- {\r
- res.x = smem[0];\r
- res.y = smem[nthreads];\r
- res.z = smem[2 * nthreads];\r
- result[0] = res;\r
- blocks_finished = 0;\r
+ sumInSmem<nthreads, R>(smem, tid);\r
+\r
+ if (tid == 0) \r
+ result[0] = smem[0];\r
}\r
- }\r
-#else\r
- if (tid == 0) \r
- {\r
- DstType res;\r
- res.x = smem[0];\r
- res.y = smem[nthreads];\r
- res.z = smem[2 * nthreads];\r
- result[bid] = res;\r
- }\r
-#endif\r
- }\r
\r
\r
- template <typename T, typename R, int nthreads>\r
- __global__ void sumPass2Kernel_C3(typename TypeVec<R, 3>::vec_type* result, int size)\r
- {\r
- typedef typename TypeVec<R, 3>::vec_type DstType;\r
+ template <typename T, typename R, typename Op, int nthreads>\r
+ __global__ void sumKernel_C2(const DevMem2Db src, typename TypeVec<R, 2>::vec_type* result)\r
+ {\r
+ typedef typename TypeVec<T, 2>::vec_type SrcType;\r
+ typedef typename TypeVec<R, 2>::vec_type DstType;\r
\r
- __shared__ R smem[nthreads * 3];\r
+ __shared__ R smem[nthreads * 2];\r
\r
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;\r
+ const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ const int bid = blockIdx.y * gridDim.x + blockIdx.x;\r
\r
- DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);\r
- smem[tid] = res.x;\r
- smem[tid + nthreads] = res.y;\r
- smem[tid + 2 * nthreads] = res.z;\r
- __syncthreads();\r
+ SrcType val;\r
+ DstType sum = VecTraits<DstType>::all(0);\r
+ for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)\r
+ {\r
+ const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);\r
+ for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)\r
+ {\r
+ val = ptr[x0 + x * blockDim.x];\r
+ sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y));\r
+ }\r
+ }\r
\r
- sumInSmem<nthreads, R>(smem, tid);\r
- sumInSmem<nthreads, R>(smem + nthreads, tid);\r
- sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
+ smem[tid] = sum.x;\r
+ smem[tid + nthreads] = sum.y;\r
+ __syncthreads();\r
\r
- if (tid == 0) \r
- {\r
- res.x = smem[0];\r
- res.y = smem[nthreads];\r
- res.z = smem[2 * nthreads];\r
- result[0] = res;\r
- }\r
- }\r
+ sumInSmem<nthreads, R>(smem, tid);\r
+ sumInSmem<nthreads, R>(smem + nthreads, tid);\r
\r
- template <typename T, typename R, typename Op, int nthreads>\r
- __global__ void sumKernel_C4(const DevMem2Db src, typename TypeVec<R, 4>::vec_type* result)\r
- {\r
- typedef typename TypeVec<T, 4>::vec_type SrcType;\r
- typedef typename TypeVec<R, 4>::vec_type DstType;\r
+ #if __CUDA_ARCH__ >= 110\r
+ __shared__ bool is_last;\r
\r
- __shared__ R smem[nthreads * 4];\r
+ if (tid == 0)\r
+ {\r
+ DstType res;\r
+ res.x = smem[0];\r
+ res.y = smem[nthreads];\r
+ result[bid] = res;\r
+ __threadfence();\r
+\r
+ uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
+ is_last = (ticket == gridDim.x * gridDim.y - 1);\r
+ }\r
\r
- const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;\r
- const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;\r
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
- const int bid = blockIdx.y * gridDim.x + blockIdx.x;\r
+ __syncthreads();\r
\r
- SrcType val;\r
- DstType sum = VecTraits<DstType>::all(0);\r
- for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)\r
- {\r
- const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);\r
- for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)\r
- {\r
- val = ptr[x0 + x * blockDim.x];\r
- sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y), \r
- Op::call(val.z), Op::call(val.w));\r
+ if (is_last)\r
+ {\r
+ DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);\r
+ smem[tid] = res.x;\r
+ smem[tid + nthreads] = res.y;\r
+ __syncthreads();\r
+\r
+ sumInSmem<nthreads, R>(smem, tid);\r
+ sumInSmem<nthreads, R>(smem + nthreads, tid);\r
+\r
+ if (tid == 0) \r
+ {\r
+ res.x = smem[0];\r
+ res.y = smem[nthreads];\r
+ result[0] = res;\r
+ blocks_finished = 0;\r
+ }\r
+ }\r
+ #else\r
+ if (tid == 0) \r
+ {\r
+ DstType res;\r
+ res.x = smem[0];\r
+ res.y = smem[nthreads];\r
+ result[bid] = res;\r
+ }\r
+ #endif\r
}\r
- }\r
\r
- smem[tid] = sum.x;\r
- smem[tid + nthreads] = sum.y;\r
- smem[tid + 2 * nthreads] = sum.z;\r
- smem[tid + 3 * nthreads] = sum.w;\r
- __syncthreads();\r
\r
- sumInSmem<nthreads, R>(smem, tid);\r
- sumInSmem<nthreads, R>(smem + nthreads, tid);\r
- sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
- sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);\r
+ template <typename T, typename R, int nthreads>\r
+ __global__ void sumPass2Kernel_C2(typename TypeVec<R, 2>::vec_type* result, int size)\r
+ {\r
+ typedef typename TypeVec<R, 2>::vec_type DstType;\r
\r
-#if __CUDA_ARCH__ >= 110\r
- __shared__ bool is_last;\r
+ __shared__ R smem[nthreads * 2];\r
\r
- if (tid == 0)\r
- {\r
- DstType res;\r
- res.x = smem[0];\r
- res.y = smem[nthreads];\r
- res.z = smem[2 * nthreads];\r
- res.w = smem[3 * nthreads];\r
- result[bid] = res;\r
- __threadfence();\r
-\r
- uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
- is_last = (ticket == gridDim.x * gridDim.y - 1);\r
- }\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
- __syncthreads();\r
+ DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);\r
+ smem[tid] = res.x;\r
+ smem[tid + nthreads] = res.y;\r
+ __syncthreads();\r
\r
- if (is_last)\r
- {\r
- DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);\r
- smem[tid] = res.x;\r
- smem[tid + nthreads] = res.y;\r
- smem[tid + 2 * nthreads] = res.z;\r
- smem[tid + 3 * nthreads] = res.w;\r
- __syncthreads();\r
+ sumInSmem<nthreads, R>(smem, tid);\r
+ sumInSmem<nthreads, R>(smem + nthreads, tid);\r
+\r
+ if (tid == 0) \r
+ {\r
+ res.x = smem[0];\r
+ res.y = smem[nthreads];\r
+ result[0] = res;\r
+ }\r
+ }\r
\r
- sumInSmem<nthreads, R>(smem, tid);\r
- sumInSmem<nthreads, R>(smem + nthreads, tid);\r
- sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
- sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);\r
\r
- if (tid == 0) \r
+ template <typename T, typename R, typename Op, int nthreads>\r
+ __global__ void sumKernel_C3(const DevMem2Db src, typename TypeVec<R, 3>::vec_type* result)\r
{\r
- res.x = smem[0];\r
- res.y = smem[nthreads];\r
- res.z = smem[2 * nthreads];\r
- res.w = smem[3 * nthreads];\r
- result[0] = res;\r
- blocks_finished = 0;\r
- }\r
- }\r
-#else\r
- if (tid == 0) \r
- {\r
- DstType res;\r
- res.x = smem[0];\r
- res.y = smem[nthreads];\r
- res.z = smem[2 * nthreads];\r
- res.w = smem[3 * nthreads];\r
- result[bid] = res;\r
- }\r
-#endif\r
- }\r
+ typedef typename TypeVec<T, 3>::vec_type SrcType;\r
+ typedef typename TypeVec<R, 3>::vec_type DstType;\r
\r
+ __shared__ R smem[nthreads * 3];\r
\r
- template <typename T, typename R, int nthreads>\r
- __global__ void sumPass2Kernel_C4(typename TypeVec<R, 4>::vec_type* result, int size)\r
- {\r
- typedef typename TypeVec<R, 4>::vec_type DstType;\r
+ const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;\r
+ const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ const int bid = blockIdx.y * gridDim.x + blockIdx.x;\r
\r
- __shared__ R smem[nthreads * 4];\r
+ SrcType val;\r
+ DstType sum = VecTraits<DstType>::all(0);\r
+ for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)\r
+ {\r
+ const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);\r
+ for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)\r
+ {\r
+ val = ptr[x0 + x * blockDim.x];\r
+ sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y), Op::call(val.z));\r
+ }\r
+ }\r
\r
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ smem[tid] = sum.x;\r
+ smem[tid + nthreads] = sum.y;\r
+ smem[tid + 2 * nthreads] = sum.z;\r
+ __syncthreads();\r
\r
- DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);\r
- smem[tid] = res.x;\r
- smem[tid + nthreads] = res.y;\r
- smem[tid + 2 * nthreads] = res.z;\r
- smem[tid + 3 * nthreads] = res.w;\r
- __syncthreads();\r
+ sumInSmem<nthreads, R>(smem, tid);\r
+ sumInSmem<nthreads, R>(smem + nthreads, tid);\r
+ sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
\r
- sumInSmem<nthreads, R>(smem, tid);\r
- sumInSmem<nthreads, R>(smem + nthreads, tid);\r
- sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
- sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);\r
+ #if __CUDA_ARCH__ >= 110\r
+ __shared__ bool is_last;\r
\r
- if (tid == 0) \r
- {\r
- res.x = smem[0];\r
- res.y = smem[nthreads];\r
- res.z = smem[2 * nthreads];\r
- res.w = smem[3 * nthreads];\r
- result[0] = res;\r
- }\r
- }\r
+ if (tid == 0)\r
+ {\r
+ DstType res;\r
+ res.x = smem[0];\r
+ res.y = smem[nthreads];\r
+ res.z = smem[2 * nthreads];\r
+ result[bid] = res;\r
+ __threadfence();\r
+\r
+ uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
+ is_last = (ticket == gridDim.x * gridDim.y - 1);\r
+ }\r
\r
- template <typename T>\r
- void sumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
- {\r
- typedef typename SumType<T>::R R;\r
+ __syncthreads();\r
\r
- dim3 threads, grid;\r
- estimateThreadCfg(src.cols, src.rows, threads, grid);\r
- setKernelConsts(src.cols, src.rows, threads, grid);\r
+ if (is_last)\r
+ {\r
+ DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);\r
+ smem[tid] = res.x;\r
+ smem[tid + nthreads] = res.y;\r
+ smem[tid + 2 * nthreads] = res.z;\r
+ __syncthreads();\r
+\r
+ sumInSmem<nthreads, R>(smem, tid);\r
+ sumInSmem<nthreads, R>(smem + nthreads, tid);\r
+ sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
+\r
+ if (tid == 0) \r
+ {\r
+ res.x = smem[0];\r
+ res.y = smem[nthreads];\r
+ res.z = smem[2 * nthreads];\r
+ result[0] = res;\r
+ blocks_finished = 0;\r
+ }\r
+ }\r
+ #else\r
+ if (tid == 0) \r
+ {\r
+ DstType res;\r
+ res.x = smem[0];\r
+ res.y = smem[nthreads];\r
+ res.z = smem[2 * nthreads];\r
+ result[bid] = res;\r
+ }\r
+ #endif\r
+ }\r
\r
- switch (cn)\r
- {\r
- case 1:\r
- sumKernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));\r
- cudaSafeCall( cudaGetLastError() );\r
\r
- sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
- (typename TypeVec<R, 1>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
+ template <typename T, typename R, int nthreads>\r
+ __global__ void sumPass2Kernel_C3(typename TypeVec<R, 3>::vec_type* result, int size)\r
+ {\r
+ typedef typename TypeVec<R, 3>::vec_type DstType;\r
\r
- break;\r
- case 2:\r
- sumKernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));\r
- cudaSafeCall( cudaGetLastError() );\r
+ __shared__ R smem[nthreads * 3];\r
\r
- sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
- (typename TypeVec<R, 2>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
- break;\r
- case 3:\r
- sumKernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));\r
- cudaSafeCall( cudaGetLastError() );\r
+ DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);\r
+ smem[tid] = res.x;\r
+ smem[tid + nthreads] = res.y;\r
+ smem[tid + 2 * nthreads] = res.z;\r
+ __syncthreads();\r
\r
- sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
- (typename TypeVec<R, 3>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
+ sumInSmem<nthreads, R>(smem, tid);\r
+ sumInSmem<nthreads, R>(smem + nthreads, tid);\r
+ sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
\r
- break;\r
- case 4:\r
- sumKernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (tid == 0) \r
+ {\r
+ res.x = smem[0];\r
+ res.y = smem[nthreads];\r
+ res.z = smem[2 * nthreads];\r
+ result[0] = res;\r
+ }\r
+ }\r
\r
- sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
- (typename TypeVec<R, 4>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
+ template <typename T, typename R, typename Op, int nthreads>\r
+ __global__ void sumKernel_C4(const DevMem2Db src, typename TypeVec<R, 4>::vec_type* result)\r
+ {\r
+ typedef typename TypeVec<T, 4>::vec_type SrcType;\r
+ typedef typename TypeVec<R, 4>::vec_type DstType;\r
\r
- break;\r
- }\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ __shared__ R smem[nthreads * 4];\r
\r
- R result[4] = {0, 0, 0, 0};\r
- cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));\r
+ const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;\r
+ const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ const int bid = blockIdx.y * gridDim.x + blockIdx.x;\r
\r
- sum[0] = result[0];\r
- sum[1] = result[1];\r
- sum[2] = result[2];\r
- sum[3] = result[3];\r
- } \r
+ SrcType val;\r
+ DstType sum = VecTraits<DstType>::all(0);\r
+ for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)\r
+ {\r
+ const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);\r
+ for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)\r
+ {\r
+ val = ptr[x0 + x * blockDim.x];\r
+ sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y), \r
+ Op::call(val.z), Op::call(val.w));\r
+ }\r
+ }\r
\r
- template void sumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sumMultipassCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sumMultipassCaller<short>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sumMultipassCaller<int>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sumMultipassCaller<float>(const DevMem2Db, PtrStepb, double*, int);\r
+ smem[tid] = sum.x;\r
+ smem[tid + nthreads] = sum.y;\r
+ smem[tid + 2 * nthreads] = sum.z;\r
+ smem[tid + 3 * nthreads] = sum.w;\r
+ __syncthreads();\r
\r
+ sumInSmem<nthreads, R>(smem, tid);\r
+ sumInSmem<nthreads, R>(smem + nthreads, tid);\r
+ sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
+ sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);\r
\r
- template <typename T>\r
- void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
- {\r
- typedef typename SumType<T>::R R;\r
+ #if __CUDA_ARCH__ >= 110\r
+ __shared__ bool is_last;\r
\r
- dim3 threads, grid;\r
- estimateThreadCfg(src.cols, src.rows, threads, grid);\r
- setKernelConsts(src.cols, src.rows, threads, grid);\r
+ if (tid == 0)\r
+ {\r
+ DstType res;\r
+ res.x = smem[0];\r
+ res.y = smem[nthreads];\r
+ res.z = smem[2 * nthreads];\r
+ res.w = smem[3 * nthreads];\r
+ result[bid] = res;\r
+ __threadfence();\r
+\r
+ uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
+ is_last = (ticket == gridDim.x * gridDim.y - 1);\r
+ }\r
\r
- switch (cn)\r
- {\r
- case 1:\r
- sumKernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));\r
- break;\r
- case 2:\r
- sumKernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));\r
- break;\r
- case 3:\r
- sumKernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));\r
- break;\r
- case 4:\r
- sumKernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));\r
- break;\r
- }\r
- cudaSafeCall( cudaGetLastError() );\r
+ __syncthreads();\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ if (is_last)\r
+ {\r
+ DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);\r
+ smem[tid] = res.x;\r
+ smem[tid + nthreads] = res.y;\r
+ smem[tid + 2 * nthreads] = res.z;\r
+ smem[tid + 3 * nthreads] = res.w;\r
+ __syncthreads();\r
+\r
+ sumInSmem<nthreads, R>(smem, tid);\r
+ sumInSmem<nthreads, R>(smem + nthreads, tid);\r
+ sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
+ sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);\r
+\r
+ if (tid == 0) \r
+ {\r
+ res.x = smem[0];\r
+ res.y = smem[nthreads];\r
+ res.z = smem[2 * nthreads];\r
+ res.w = smem[3 * nthreads];\r
+ result[0] = res;\r
+ blocks_finished = 0;\r
+ }\r
+ }\r
+ #else\r
+ if (tid == 0) \r
+ {\r
+ DstType res;\r
+ res.x = smem[0];\r
+ res.y = smem[nthreads];\r
+ res.z = smem[2 * nthreads];\r
+ res.w = smem[3 * nthreads];\r
+ result[bid] = res;\r
+ }\r
+ #endif\r
+ }\r
\r
- R result[4] = {0, 0, 0, 0};\r
- cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));\r
\r
- sum[0] = result[0];\r
- sum[1] = result[1];\r
- sum[2] = result[2];\r
- sum[3] = result[3];\r
- } \r
+ template <typename T, typename R, int nthreads>\r
+ __global__ void sumPass2Kernel_C4(typename TypeVec<R, 4>::vec_type* result, int size)\r
+ {\r
+ typedef typename TypeVec<R, 4>::vec_type DstType;\r
\r
- template void sumCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sumCaller<char>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sumCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sumCaller<short>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sumCaller<int>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sumCaller<float>(const DevMem2Db, PtrStepb, double*, int);\r
+ __shared__ R smem[nthreads * 4];\r
\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
- template <typename T>\r
- void absSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
- {\r
- typedef typename SumType<T>::R R;\r
+ DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);\r
+ smem[tid] = res.x;\r
+ smem[tid + nthreads] = res.y;\r
+ smem[tid + 2 * nthreads] = res.z;\r
+ smem[tid + 3 * nthreads] = res.w;\r
+ __syncthreads();\r
\r
- dim3 threads, grid;\r
- estimateThreadCfg(src.cols, src.rows, threads, grid);\r
- setKernelConsts(src.cols, src.rows, threads, grid);\r
+ sumInSmem<nthreads, R>(smem, tid);\r
+ sumInSmem<nthreads, R>(smem + nthreads, tid);\r
+ sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
+ sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);\r
\r
- switch (cn)\r
- {\r
- case 1:\r
- sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (tid == 0) \r
+ {\r
+ res.x = smem[0];\r
+ res.y = smem[nthreads];\r
+ res.z = smem[2 * nthreads];\r
+ res.w = smem[3 * nthreads];\r
+ result[0] = res;\r
+ }\r
+ }\r
\r
- sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
- (typename TypeVec<R, 1>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
+ template <typename T>\r
+ void sumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
+ {\r
+ typedef typename SumType<T>::R R;\r
\r
- break;\r
- case 2:\r
- sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));\r
- cudaSafeCall( cudaGetLastError() );\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
\r
- sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
- (typename TypeVec<R, 2>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
+ switch (cn)\r
+ {\r
+ case 1:\r
+ sumKernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 1>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ break;\r
+ case 2:\r
+ sumKernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 2>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ break;\r
+ case 3:\r
+ sumKernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 3>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ break;\r
+ case 4:\r
+ sumKernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 4>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ break;\r
+ }\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
- break;\r
- case 3:\r
- sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));\r
- cudaSafeCall( cudaGetLastError() );\r
+ R result[4] = {0, 0, 0, 0};\r
+ cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));\r
\r
- sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
- (typename TypeVec<R, 3>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
+ sum[0] = result[0];\r
+ sum[1] = result[1];\r
+ sum[2] = result[2];\r
+ sum[3] = result[3];\r
+ } \r
\r
- break;\r
- case 4:\r
- sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));\r
- cudaSafeCall( cudaGetLastError() );\r
+ template void sumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sumMultipassCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sumMultipassCaller<short>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sumMultipassCaller<int>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sumMultipassCaller<float>(const DevMem2Db, PtrStepb, double*, int);\r
\r
- sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
- (typename TypeVec<R, 4>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
\r
- break;\r
- }\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ template <typename T>\r
+ void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
+ {\r
+ typedef typename SumType<T>::R R;\r
\r
- R result[4] = {0, 0, 0, 0};\r
- cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
\r
- sum[0] = result[0];\r
- sum[1] = result[1];\r
- sum[2] = result[2];\r
- sum[3] = result[3];\r
- } \r
+ switch (cn)\r
+ {\r
+ case 1:\r
+ sumKernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));\r
+ break;\r
+ case 2:\r
+ sumKernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));\r
+ break;\r
+ case 3:\r
+ sumKernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));\r
+ break;\r
+ case 4:\r
+ sumKernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));\r
+ break;\r
+ }\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- template void absSumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);\r
- template void absSumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);\r
- template void absSumMultipassCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);\r
- template void absSumMultipassCaller<short>(const DevMem2Db, PtrStepb, double*, int);\r
- template void absSumMultipassCaller<int>(const DevMem2Db, PtrStepb, double*, int);\r
- template void absSumMultipassCaller<float>(const DevMem2Db, PtrStepb, double*, int);\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
+ R result[4] = {0, 0, 0, 0};\r
+ cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));\r
\r
- template <typename T>\r
- void absSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
- {\r
- typedef typename SumType<T>::R R;\r
+ sum[0] = result[0];\r
+ sum[1] = result[1];\r
+ sum[2] = result[2];\r
+ sum[3] = result[3];\r
+ } \r
\r
- dim3 threads, grid;\r
- estimateThreadCfg(src.cols, src.rows, threads, grid);\r
- setKernelConsts(src.cols, src.rows, threads, grid);\r
+ template void sumCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sumCaller<char>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sumCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sumCaller<short>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sumCaller<int>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sumCaller<float>(const DevMem2Db, PtrStepb, double*, int);\r
\r
- switch (cn)\r
- {\r
- case 1:\r
- sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));\r
- break;\r
- case 2:\r
- sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));\r
- break;\r
- case 3:\r
- sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));\r
- break;\r
- case 4:\r
- sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));\r
- break;\r
- }\r
- cudaSafeCall( cudaGetLastError() );\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ template <typename T>\r
+ void absSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
+ {\r
+ typedef typename SumType<T>::R R;\r
\r
- R result[4] = {0, 0, 0, 0};\r
- cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
\r
- sum[0] = result[0];\r
- sum[1] = result[1];\r
- sum[2] = result[2];\r
- sum[3] = result[3];\r
- }\r
+ switch (cn)\r
+ {\r
+ case 1:\r
+ sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 1>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ break;\r
+ case 2:\r
+ sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 2>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ break;\r
+ case 3:\r
+ sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 3>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ break;\r
+ case 4:\r
+ sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 4>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ break;\r
+ }\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
- template void absSumCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);\r
- template void absSumCaller<char>(const DevMem2Db, PtrStepb, double*, int);\r
- template void absSumCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);\r
- template void absSumCaller<short>(const DevMem2Db, PtrStepb, double*, int);\r
- template void absSumCaller<int>(const DevMem2Db, PtrStepb, double*, int);\r
- template void absSumCaller<float>(const DevMem2Db, PtrStepb, double*, int);\r
+ R result[4] = {0, 0, 0, 0};\r
+ cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));\r
\r
+ sum[0] = result[0];\r
+ sum[1] = result[1];\r
+ sum[2] = result[2];\r
+ sum[3] = result[3];\r
+ } \r
\r
- template <typename T>\r
- void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
- {\r
- typedef typename SumType<T>::R R;\r
+ template void absSumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void absSumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void absSumMultipassCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void absSumMultipassCaller<short>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void absSumMultipassCaller<int>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void absSumMultipassCaller<float>(const DevMem2Db, PtrStepb, double*, int);\r
\r
- dim3 threads, grid;\r
- estimateThreadCfg(src.cols, src.rows, threads, grid);\r
- setKernelConsts(src.cols, src.rows, threads, grid);\r
\r
- switch (cn)\r
- {\r
- case 1:\r
- sumKernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));\r
- cudaSafeCall( cudaGetLastError() );\r
+ template <typename T>\r
+ void absSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
+ {\r
+ typedef typename SumType<T>::R R;\r
\r
- sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
- (typename TypeVec<R, 1>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
\r
- break;\r
- case 2:\r
- sumKernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));\r
- cudaSafeCall( cudaGetLastError() );\r
+ switch (cn)\r
+ {\r
+ case 1:\r
+ sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));\r
+ break;\r
+ case 2:\r
+ sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));\r
+ break;\r
+ case 3:\r
+ sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));\r
+ break;\r
+ case 4:\r
+ sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));\r
+ break;\r
+ }\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
- (typename TypeVec<R, 2>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
- break;\r
- case 3:\r
- sumKernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));\r
- cudaSafeCall( cudaGetLastError() );\r
+ R result[4] = {0, 0, 0, 0};\r
+ cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));\r
\r
- sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
- (typename TypeVec<R, 3>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
+ sum[0] = result[0];\r
+ sum[1] = result[1];\r
+ sum[2] = result[2];\r
+ sum[3] = result[3];\r
+ }\r
\r
- break;\r
- case 4:\r
- sumKernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));\r
- cudaSafeCall( cudaGetLastError() );\r
+ template void absSumCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void absSumCaller<char>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void absSumCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void absSumCaller<short>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void absSumCaller<int>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void absSumCaller<float>(const DevMem2Db, PtrStepb, double*, int);\r
\r
- sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
- (typename TypeVec<R, 4>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
- cudaSafeCall( cudaGetLastError() );\r
\r
- break;\r
- }\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ template <typename T>\r
+ void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
+ {\r
+ typedef typename SumType<T>::R R;\r
\r
- R result[4] = {0, 0, 0, 0};\r
- cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
\r
- sum[0] = result[0];\r
- sum[1] = result[1];\r
- sum[2] = result[2];\r
- sum[3] = result[3];\r
- } \r
+ switch (cn)\r
+ {\r
+ case 1:\r
+ sumKernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 1>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ break;\r
+ case 2:\r
+ sumKernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 2>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ break;\r
+ case 3:\r
+ sumKernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 3>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ break;\r
+ case 4:\r
+ sumKernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 4>::vec_type*)buf.ptr(0), grid.x * grid.y);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ break;\r
+ }\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
- template void sqrSumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sqrSumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sqrSumMultipassCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sqrSumMultipassCaller<short>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sqrSumMultipassCaller<int>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sqrSumMultipassCaller<float>(const DevMem2Db, PtrStepb, double*, int);\r
+ R result[4] = {0, 0, 0, 0};\r
+ cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));\r
\r
+ sum[0] = result[0];\r
+ sum[1] = result[1];\r
+ sum[2] = result[2];\r
+ sum[3] = result[3];\r
+ } \r
\r
- template <typename T>\r
- void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
- {\r
- typedef typename SumType<T>::R R;\r
+ template void sqrSumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sqrSumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sqrSumMultipassCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sqrSumMultipassCaller<short>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sqrSumMultipassCaller<int>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sqrSumMultipassCaller<float>(const DevMem2Db, PtrStepb, double*, int);\r
\r
- dim3 threads, grid;\r
- estimateThreadCfg(src.cols, src.rows, threads, grid);\r
- setKernelConsts(src.cols, src.rows, threads, grid);\r
\r
- switch (cn)\r
- {\r
- case 1:\r
- sumKernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));\r
- break;\r
- case 2:\r
- sumKernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));\r
- break;\r
- case 3:\r
- sumKernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));\r
- break;\r
- case 4:\r
- sumKernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));\r
- break;\r
- }\r
- cudaSafeCall( cudaGetLastError() );\r
+ template <typename T>\r
+ void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
+ {\r
+ typedef typename SumType<T>::R R;\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
\r
- R result[4] = {0, 0, 0, 0};\r
- cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));\r
+ switch (cn)\r
+ {\r
+ case 1:\r
+ sumKernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));\r
+ break;\r
+ case 2:\r
+ sumKernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));\r
+ break;\r
+ case 3:\r
+ sumKernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));\r
+ break;\r
+ case 4:\r
+ sumKernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));\r
+ break;\r
+ }\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- sum[0] = result[0];\r
- sum[1] = result[1];\r
- sum[2] = result[2];\r
- sum[3] = result[3];\r
- }\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
- template void sqrSumCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sqrSumCaller<char>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sqrSumCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sqrSumCaller<short>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sqrSumCaller<int>(const DevMem2Db, PtrStepb, double*, int);\r
- template void sqrSumCaller<float>(const DevMem2Db, PtrStepb, double*, int);\r
-} // namespace sum\r
+ R result[4] = {0, 0, 0, 0};\r
+ cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));\r
\r
-//////////////////////////////////////////////////////////////////////////////\r
-// reduce\r
+ sum[0] = result[0];\r
+ sum[1] = result[1];\r
+ sum[2] = result[2];\r
+ sum[3] = result[3];\r
+ }\r
\r
-template <typename S> struct SumReductor\r
-{\r
- __device__ __forceinline__ S startValue() const\r
- {\r
- return 0;\r
- }\r
+ template void sqrSumCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sqrSumCaller<char>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sqrSumCaller<ushort>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sqrSumCaller<short>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sqrSumCaller<int>(const DevMem2Db, PtrStepb, double*, int);\r
+ template void sqrSumCaller<float>(const DevMem2Db, PtrStepb, double*, int);\r
+ } // namespace sum\r
\r
- __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const\r
- {\r
- return a + b;\r
- }\r
+ //////////////////////////////////////////////////////////////////////////////\r
+ // reduce\r
\r
- __device__ __forceinline__ S result(S r, double) const\r
- {\r
- return r;\r
- }\r
-};\r
+ template <typename S> struct SumReductor\r
+ {\r
+ __device__ __forceinline__ S startValue() const\r
+ {\r
+ return 0;\r
+ }\r
\r
-template <typename S> struct AvgReductor\r
-{\r
- __device__ __forceinline__ S startValue() const\r
- {\r
- return 0;\r
- }\r
+ __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const\r
+ {\r
+ return a + b;\r
+ }\r
\r
- __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const\r
- {\r
- return a + b;\r
- }\r
+ __device__ __forceinline__ S result(S r, double) const\r
+ {\r
+ return r;\r
+ }\r
+ };\r
\r
- __device__ __forceinline__ double result(S r, double sz) const\r
- {\r
- return r / sz;\r
- }\r
-};\r
+ template <typename S> struct AvgReductor\r
+ {\r
+ __device__ __forceinline__ S startValue() const\r
+ {\r
+ return 0;\r
+ }\r
\r
-template <typename S> struct MinReductor\r
-{\r
- __device__ __forceinline__ S startValue() const\r
- {\r
- return numeric_limits<S>::max();\r
- }\r
+ __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const\r
+ {\r
+ return a + b;\r
+ }\r
\r
- template <typename T> __device__ __forceinline__ T operator ()(volatile T a, volatile T b) const\r
- {\r
- return saturate_cast<T>(::min(a, b));\r
- }\r
- __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const\r
- {\r
- return ::fmin(a, b);\r
- }\r
+ __device__ __forceinline__ double result(S r, double sz) const\r
+ {\r
+ return r / sz;\r
+ }\r
+ };\r
\r
- __device__ __forceinline__ S result(S r, double) const\r
- {\r
- return r;\r
- }\r
-};\r
+ template <typename S> struct MinReductor\r
+ {\r
+ __device__ __forceinline__ S startValue() const\r
+ {\r
+ return numeric_limits<S>::max();\r
+ }\r
\r
-template <typename S> struct MaxReductor\r
-{\r
- __device__ __forceinline__ S startValue() const\r
- {\r
- return numeric_limits<S>::min();\r
- }\r
+ template <typename T> __device__ __forceinline__ T operator ()(volatile T a, volatile T b) const\r
+ {\r
+ return saturate_cast<T>(::min(a, b));\r
+ }\r
+ __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const\r
+ {\r
+ return ::fmin(a, b);\r
+ }\r
\r
- template <typename T> __device__ __forceinline__ int operator ()(volatile T a, volatile T b) const\r
- {\r
- return ::max(a, b);\r
- }\r
- __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const\r
- {\r
- return ::fmax(a, b);\r
- }\r
+ __device__ __forceinline__ S result(S r, double) const\r
+ {\r
+ return r;\r
+ }\r
+ };\r
\r
- __device__ __forceinline__ S result(S r, double) const\r
- {\r
- return r;\r
- }\r
-};\r
+ template <typename S> struct MaxReductor\r
+ {\r
+ __device__ __forceinline__ S startValue() const\r
+ {\r
+ return numeric_limits<S>::min();\r
+ }\r
\r
-template <class Op, typename T, typename S, typename D> __global__ void reduceRows(const DevMem2D_<T> src, D* dst, const Op op)\r
-{\r
- __shared__ S smem[16 * 16];\r
+ template <typename T> __device__ __forceinline__ int operator ()(volatile T a, volatile T b) const\r
+ {\r
+ return ::max(a, b);\r
+ }\r
+ __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const\r
+ {\r
+ return ::fmax(a, b);\r
+ }\r
\r
- const int x = blockIdx.x * 16 + threadIdx.x;\r
+ __device__ __forceinline__ S result(S r, double) const\r
+ {\r
+ return r;\r
+ }\r
+ };\r
\r
- S myVal = op.startValue();\r
+ template <class Op, typename T, typename S, typename D> __global__ void reduceRows(const DevMem2D_<T> src, D* dst, const Op op)\r
+ {\r
+ __shared__ S smem[16 * 16];\r
\r
- if (x < src.cols)\r
- {\r
- for (int y = threadIdx.y; y < src.rows; y += 16)\r
- myVal = op(myVal, src.ptr(y)[x]);\r
- } \r
+ const int x = blockIdx.x * 16 + threadIdx.x;\r
\r
- smem[threadIdx.x * 16 + threadIdx.y] = myVal;\r
- __syncthreads();\r
+ S myVal = op.startValue();\r
\r
- if (threadIdx.x < 8)\r
- {\r
- volatile S* srow = smem + threadIdx.y * 16;\r
- srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 8]);\r
- srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 4]);\r
- srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 2]);\r
- srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 1]);\r
- }\r
- __syncthreads();\r
-\r
- if (threadIdx.y == 0 && x < src.cols)\r
- dst[x] = saturate_cast<D>(op.result(smem[threadIdx.x * 16], src.rows));\r
-}\r
-\r
-template <template <typename> class Op, typename T, typename S, typename D> void reduceRows_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)\r
-{\r
- const dim3 block(16, 16);\r
- const dim3 grid(divUp(src.cols, block.x));\r
+ if (x < src.cols)\r
+ {\r
+ for (int y = threadIdx.y; y < src.rows; y += 16)\r
+ myVal = op(myVal, src.ptr(y)[x]);\r
+ } \r
\r
- Op<S> op;\r
- reduceRows<Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);\r
- cudaSafeCall( cudaGetLastError() );\r
+ smem[threadIdx.x * 16 + threadIdx.y] = myVal;\r
+ __syncthreads();\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ if (threadIdx.x < 8)\r
+ {\r
+ volatile S* srow = smem + threadIdx.y * 16;\r
+ srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 8]);\r
+ srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 4]);\r
+ srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 2]);\r
+ srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 1]);\r
+ }\r
+ __syncthreads();\r
\r
-}\r
+ if (threadIdx.y == 0 && x < src.cols)\r
+ dst[x] = saturate_cast<D>(op.result(smem[threadIdx.x * 16], src.rows));\r
+ }\r
\r
-template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream)\r
-{\r
- typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);\r
+ template <template <typename> class Op, typename T, typename S, typename D> void reduceRows_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)\r
+ {\r
+ const dim3 block(16, 16);\r
+ const dim3 grid(divUp(src.cols, block.x));\r
\r
- static const caller_t callers[] = \r
- {\r
- reduceRows_caller<SumReductor, T, S, D>, \r
- reduceRows_caller<AvgReductor, T, S, D>, \r
- reduceRows_caller<MaxReductor, T, S, D>, \r
- reduceRows_caller<MinReductor, T, S, D>\r
- };\r
+ Op<S> op;\r
+ reduceRows<Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- callers[reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);\r
-}\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
-template void reduceRows_gpu<uchar, int, uchar>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-template void reduceRows_gpu<uchar, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-template void reduceRows_gpu<uchar, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+ }\r
\r
-template void reduceRows_gpu<ushort, int, ushort>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-template void reduceRows_gpu<ushort, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-template void reduceRows_gpu<ushort, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+ template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream)\r
+ {\r
+ typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);\r
\r
-template void reduceRows_gpu<short, int, short>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-template void reduceRows_gpu<short, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-template void reduceRows_gpu<short, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+ static const caller_t callers[] = \r
+ {\r
+ reduceRows_caller<SumReductor, T, S, D>, \r
+ reduceRows_caller<AvgReductor, T, S, D>, \r
+ reduceRows_caller<MaxReductor, T, S, D>, \r
+ reduceRows_caller<MinReductor, T, S, D>\r
+ };\r
\r
-template void reduceRows_gpu<int, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-template void reduceRows_gpu<int, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ callers[reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);\r
+ }\r
\r
-template void reduceRows_gpu<float, float, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ template void reduceRows_gpu<uchar, int, uchar>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ template void reduceRows_gpu<uchar, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ template void reduceRows_gpu<uchar, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
\r
+ template void reduceRows_gpu<ushort, int, ushort>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ template void reduceRows_gpu<ushort, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ template void reduceRows_gpu<ushort, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
\r
+ template void reduceRows_gpu<short, int, short>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ template void reduceRows_gpu<short, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ template void reduceRows_gpu<short, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
\r
-template <int cn, class Op, typename T, typename S, typename D> __global__ void reduceCols(const DevMem2D_<T> src, D* dst, const Op op)\r
-{\r
- __shared__ S smem[256 * cn];\r
+ template void reduceRows_gpu<int, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ template void reduceRows_gpu<int, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
\r
- const int y = blockIdx.x;\r
+ template void reduceRows_gpu<float, float, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
\r
- const T* src_row = src.ptr(y);\r
\r
- S myVal[cn];\r
\r
- #pragma unroll\r
- for (int c = 0; c < cn; ++c)\r
- myVal[c] = op.startValue();\r
+ template <int cn, class Op, typename T, typename S, typename D> __global__ void reduceCols(const DevMem2D_<T> src, D* dst, const Op op)\r
+ {\r
+ __shared__ S smem[256 * cn];\r
\r
-#if __CUDA_ARCH__ >= 200\r
+ const int y = blockIdx.x;\r
\r
- // For cc >= 2.0 prefer L1 cache\r
- for (int x = threadIdx.x; x < src.cols; x += 256)\r
- {\r
- #pragma unroll\r
- for (int c = 0; c < cn; ++c)\r
- myVal[c] = op(myVal[c], src_row[x * cn + c]);\r
- }\r
+ const T* src_row = src.ptr(y);\r
\r
-#else // __CUDA_ARCH__ >= 200\r
+ S myVal[cn];\r
\r
- // For older arch use shared memory for cache\r
- for (int x = 0; x < src.cols; x += 256)\r
- {\r
- #pragma unroll\r
- for (int c = 0; c < cn; ++c)\r
- {\r
- smem[c * 256 + threadIdx.x] = op.startValue();\r
- const int load_x = x * cn + c * 256 + threadIdx.x;\r
- if (load_x < src.cols * cn)\r
- smem[c * 256 + threadIdx.x] = src_row[load_x];\r
- }\r
- __syncthreads();\r
+ #pragma unroll\r
+ for (int c = 0; c < cn; ++c)\r
+ myVal[c] = op.startValue();\r
\r
- #pragma unroll\r
- for (int c = 0; c < cn; ++c)\r
- myVal[c] = op(myVal[c], smem[threadIdx.x * cn + c]);\r
- __syncthreads();\r
- }\r
+ #if __CUDA_ARCH__ >= 200\r
\r
-#endif // __CUDA_ARCH__ >= 200\r
+ // For cc >= 2.0 prefer L1 cache\r
+ for (int x = threadIdx.x; x < src.cols; x += 256)\r
+ {\r
+ #pragma unroll\r
+ for (int c = 0; c < cn; ++c)\r
+ myVal[c] = op(myVal[c], src_row[x * cn + c]);\r
+ }\r
\r
- #pragma unroll\r
- for (int c = 0; c < cn; ++c)\r
- smem[c * 256 + threadIdx.x] = myVal[c];\r
- __syncthreads();\r
+ #else // __CUDA_ARCH__ >= 200\r
\r
- if (threadIdx.x < 128)\r
- {\r
- #pragma unroll\r
- for (int c = 0; c < cn; ++c)\r
- smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 128]);\r
- }\r
- __syncthreads();\r
+ // For older arch use shared memory for cache\r
+ for (int x = 0; x < src.cols; x += 256)\r
+ {\r
+ #pragma unroll\r
+ for (int c = 0; c < cn; ++c)\r
+ {\r
+ smem[c * 256 + threadIdx.x] = op.startValue();\r
+ const int load_x = x * cn + c * 256 + threadIdx.x;\r
+ if (load_x < src.cols * cn)\r
+ smem[c * 256 + threadIdx.x] = src_row[load_x];\r
+ }\r
+ __syncthreads();\r
\r
- if (threadIdx.x < 64)\r
- {\r
- #pragma unroll\r
- for (int c = 0; c < cn; ++c)\r
- smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 64]);\r
- }\r
- __syncthreads();\r
+ #pragma unroll\r
+ for (int c = 0; c < cn; ++c)\r
+ myVal[c] = op(myVal[c], smem[threadIdx.x * cn + c]);\r
+ __syncthreads();\r
+ }\r
\r
- volatile S* sdata = smem;\r
+ #endif // __CUDA_ARCH__ >= 200\r
\r
- if (threadIdx.x < 32)\r
- {\r
- #pragma unroll\r
- for (int c = 0; c < cn; ++c)\r
- {\r
- sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 32]);\r
- sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 16]);\r
- sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 8]);\r
- sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 4]);\r
- sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 2]);\r
- sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 1]);\r
- }\r
- }\r
- __syncthreads();\r
+ #pragma unroll\r
+ for (int c = 0; c < cn; ++c)\r
+ smem[c * 256 + threadIdx.x] = myVal[c];\r
+ __syncthreads();\r
\r
- if (threadIdx.x < cn)\r
- dst[y * cn + threadIdx.x] = saturate_cast<D>(op.result(smem[threadIdx.x * 256], src.cols));\r
-}\r
+ if (threadIdx.x < 128)\r
+ {\r
+ #pragma unroll\r
+ for (int c = 0; c < cn; ++c)\r
+ smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 128]);\r
+ }\r
+ __syncthreads();\r
\r
-template <int cn, template <typename> class Op, typename T, typename S, typename D> void reduceCols_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)\r
-{\r
- const dim3 block(256);\r
- const dim3 grid(src.rows);\r
+ if (threadIdx.x < 64)\r
+ {\r
+ #pragma unroll\r
+ for (int c = 0; c < cn; ++c)\r
+ smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 64]);\r
+ }\r
+ __syncthreads();\r
\r
- Op<S> op;\r
- reduceCols<cn, Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);\r
- cudaSafeCall( cudaGetLastError() );\r
+ volatile S* sdata = smem;\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ if (threadIdx.x < 32)\r
+ {\r
+ #pragma unroll\r
+ for (int c = 0; c < cn; ++c)\r
+ {\r
+ sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 32]);\r
+ sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 16]);\r
+ sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 8]);\r
+ sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 4]);\r
+ sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 2]);\r
+ sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 1]);\r
+ }\r
+ }\r
+ __syncthreads();\r
\r
-}\r
+ if (threadIdx.x < cn)\r
+ dst[y * cn + threadIdx.x] = saturate_cast<D>(op.result(smem[threadIdx.x * 256], src.cols));\r
+ }\r
\r
-template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream)\r
-{\r
- typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);\r
+ template <int cn, template <typename> class Op, typename T, typename S, typename D> void reduceCols_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)\r
+ {\r
+ const dim3 block(256);\r
+ const dim3 grid(src.rows);\r
\r
- static const caller_t callers[4][4] = \r
- {\r
- {reduceCols_caller<1, SumReductor, T, S, D>, reduceCols_caller<1, AvgReductor, T, S, D>, reduceCols_caller<1, MaxReductor, T, S, D>, reduceCols_caller<1, MinReductor, T, S, D>},\r
- {reduceCols_caller<2, SumReductor, T, S, D>, reduceCols_caller<2, AvgReductor, T, S, D>, reduceCols_caller<2, MaxReductor, T, S, D>, reduceCols_caller<2, MinReductor, T, S, D>},\r
- {reduceCols_caller<3, SumReductor, T, S, D>, reduceCols_caller<3, AvgReductor, T, S, D>, reduceCols_caller<3, MaxReductor, T, S, D>, reduceCols_caller<3, MinReductor, T, S, D>},\r
- {reduceCols_caller<4, SumReductor, T, S, D>, reduceCols_caller<4, AvgReductor, T, S, D>, reduceCols_caller<4, MaxReductor, T, S, D>, reduceCols_caller<4, MinReductor, T, S, D>},\r
- };\r
+ Op<S> op;\r
+ reduceCols<cn, Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- callers[cn - 1][reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);\r
-}\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
-template void reduceCols_gpu<uchar, int, uchar>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-template void reduceCols_gpu<uchar, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-template void reduceCols_gpu<uchar, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ }\r
+\r
+ template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream)\r
+ {\r
+ typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);\r
\r
-template void reduceCols_gpu<ushort, int, ushort>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
-template void reduceCols_gpu<ushort, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
-template void reduceCols_gpu<ushort, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ static const caller_t callers[4][4] = \r
+ {\r
+ {reduceCols_caller<1, SumReductor, T, S, D>, reduceCols_caller<1, AvgReductor, T, S, D>, reduceCols_caller<1, MaxReductor, T, S, D>, reduceCols_caller<1, MinReductor, T, S, D>},\r
+ {reduceCols_caller<2, SumReductor, T, S, D>, reduceCols_caller<2, AvgReductor, T, S, D>, reduceCols_caller<2, MaxReductor, T, S, D>, reduceCols_caller<2, MinReductor, T, S, D>},\r
+ {reduceCols_caller<3, SumReductor, T, S, D>, reduceCols_caller<3, AvgReductor, T, S, D>, reduceCols_caller<3, MaxReductor, T, S, D>, reduceCols_caller<3, MinReductor, T, S, D>},\r
+ {reduceCols_caller<4, SumReductor, T, S, D>, reduceCols_caller<4, AvgReductor, T, S, D>, reduceCols_caller<4, MaxReductor, T, S, D>, reduceCols_caller<4, MinReductor, T, S, D>},\r
+ };\r
+\r
+ callers[cn - 1][reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);\r
+ }\r
\r
-template void reduceCols_gpu<short, int, short>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
-template void reduceCols_gpu<short, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
-template void reduceCols_gpu<short, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+ template void reduceCols_gpu<uchar, int, uchar>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ template void reduceCols_gpu<uchar, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ template void reduceCols_gpu<uchar, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
\r
-template void reduceCols_gpu<int, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
-template void reduceCols_gpu<int, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ template void reduceCols_gpu<ushort, int, ushort>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+ template void reduceCols_gpu<ushort, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+ template void reduceCols_gpu<ushort, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
\r
-template void reduceCols_gpu<float, float, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ template void reduceCols_gpu<short, int, short>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+ template void reduceCols_gpu<short, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+ template void reduceCols_gpu<short, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
\r
-} // namespace mattrix_reductions\r
+ template void reduceCols_gpu<int, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+ template void reduceCols_gpu<int, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template void reduceCols_gpu<float, float, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ } // namespace mattrix_reductions\r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "opencv2/gpu/device/vec_math.hpp"\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc {\r
-\r
-template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)\r
-{\r
- typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;\r
-\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y;\r
-\r
- __shared__ value_type smem[256 + 4];\r
-\r
- value_type sum;\r
- \r
- const int src_y = 2*y;\r
-\r
- sum = VecTraits<value_type>::all(0);\r
- \r
- sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);\r
- sum = sum + 0.25f * b.at(src_y - 1, x, src.data, src.step);\r
- sum = sum + 0.375f * b.at(src_y , x, src.data, src.step);\r
- sum = sum + 0.25f * b.at(src_y + 1, x, src.data, src.step);\r
- sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);\r
-\r
- smem[2 + threadIdx.x] = sum;\r
-\r
- if (threadIdx.x < 2)\r
- {\r
- const int left_x = x - 2 + threadIdx.x;\r
-\r
- sum = VecTraits<value_type>::all(0);\r
- \r
- sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);\r
- sum = sum + 0.25f * b.at(src_y - 1, left_x, src.data, src.step);\r
- sum = sum + 0.375f * b.at(src_y , left_x, src.data, src.step);\r
- sum = sum + 0.25f * b.at(src_y + 1, left_x, src.data, src.step);\r
- sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);\r
-\r
- smem[threadIdx.x] = sum;\r
- }\r
-\r
- if (threadIdx.x > 253)\r
- {\r
- const int right_x = x + threadIdx.x + 2;\r
-\r
- sum = VecTraits<value_type>::all(0);\r
- \r
- sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);\r
- sum = sum + 0.25f * b.at(src_y - 1, right_x, src.data, src.step);\r
- sum = sum + 0.375f * b.at(src_y , right_x, src.data, src.step);\r
- sum = sum + 0.25f * b.at(src_y + 1, right_x, src.data, src.step);\r
- sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);\r
-\r
- smem[4 + threadIdx.x] = sum;\r
- }\r
-\r
- __syncthreads();\r
-\r
- if (threadIdx.x < 128)\r
- {\r
- const int tid2 = threadIdx.x * 2;\r
-\r
- sum = VecTraits<value_type>::all(0);\r
-\r
- sum = sum + 0.0625f * smem[2 + tid2 - 2];\r
- sum = sum + 0.25f * smem[2 + tid2 - 1];\r
- sum = sum + 0.375f * smem[2 + tid2 ];\r
- sum = sum + 0.25f * smem[2 + tid2 + 1];\r
- sum = sum + 0.0625f * smem[2 + tid2 + 2];\r
-\r
- const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;\r
-\r
- if (dst_x < dst_cols)\r
- dst.ptr(y)[dst_x] = saturate_cast<T>(sum);\r
- }\r
-}\r
-\r
-template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+namespace cv { namespace gpu { namespace device \r
{\r
- const dim3 block(256);\r
- const dim3 grid(divUp(src.cols, block.x), dst.rows);\r
-\r
- B<T> b(src.rows, src.cols);\r
-\r
- pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
-template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)\r
-{\r
- typedef typename TypeVec<T, cn>::vec_type type;\r
-\r
- typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);\r
-\r
- static const caller_t callers[] = \r
+ namespace imgproc \r
{\r
- pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>\r
- };\r
-\r
- callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);\r
-}\r
-\r
-template void pyrDown_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-\r
-template void pyrDown_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)\r
+ {\r
+ typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;\r
\r
-template void pyrDown_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y;\r
\r
-template void pyrDown_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ __shared__ value_type smem[256 + 4];\r
\r
-template void pyrDown_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ value_type sum;\r
+ \r
+ const int src_y = 2*y;\r
\r
-template void pyrDown_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrDown_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ sum = VecTraits<value_type>::all(0);\r
+ \r
+ sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);\r
+ sum = sum + 0.25f * b.at(src_y - 1, x, src.data, src.step);\r
+ sum = sum + 0.375f * b.at(src_y , x, src.data, src.step);\r
+ sum = sum + 0.25f * b.at(src_y + 1, x, src.data, src.step);\r
+ sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);\r
+\r
+ smem[2 + threadIdx.x] = sum;\r
\r
-} // namespace imgproc\r
+ if (threadIdx.x < 2)\r
+ {\r
+ const int left_x = x - 2 + threadIdx.x;\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ sum = VecTraits<value_type>::all(0);\r
+ \r
+ sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);\r
+ sum = sum + 0.25f * b.at(src_y - 1, left_x, src.data, src.step);\r
+ sum = sum + 0.375f * b.at(src_y , left_x, src.data, src.step);\r
+ sum = sum + 0.25f * b.at(src_y + 1, left_x, src.data, src.step);\r
+ sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);\r
+\r
+ smem[threadIdx.x] = sum;\r
+ }\r
+\r
+ if (threadIdx.x > 253)\r
+ {\r
+ const int right_x = x + threadIdx.x + 2;\r
+\r
+ sum = VecTraits<value_type>::all(0);\r
+ \r
+ sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);\r
+ sum = sum + 0.25f * b.at(src_y - 1, right_x, src.data, src.step);\r
+ sum = sum + 0.375f * b.at(src_y , right_x, src.data, src.step);\r
+ sum = sum + 0.25f * b.at(src_y + 1, right_x, src.data, src.step);\r
+ sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);\r
+\r
+ smem[4 + threadIdx.x] = sum;\r
+ }\r
+\r
+ __syncthreads();\r
+\r
+ if (threadIdx.x < 128)\r
+ {\r
+ const int tid2 = threadIdx.x * 2;\r
+\r
+ sum = VecTraits<value_type>::all(0);\r
+\r
+ sum = sum + 0.0625f * smem[2 + tid2 - 2];\r
+ sum = sum + 0.25f * smem[2 + tid2 - 1];\r
+ sum = sum + 0.375f * smem[2 + tid2 ];\r
+ sum = sum + 0.25f * smem[2 + tid2 + 1];\r
+ sum = sum + 0.0625f * smem[2 + tid2 + 2];\r
+\r
+ const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;\r
+\r
+ if (dst_x < dst_cols)\r
+ dst.ptr(y)[dst_x] = saturate_cast<T>(sum);\r
+ }\r
+ }\r
+\r
+ template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+ {\r
+ const dim3 block(256);\r
+ const dim3 grid(divUp(src.cols, block.x), dst.rows);\r
+\r
+ B<T> b(src.rows, src.cols);\r
+\r
+ pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+\r
+ template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)\r
+ {\r
+ typedef typename TypeVec<T, cn>::vec_type type;\r
+\r
+ typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);\r
+\r
+ static const caller_t callers[] = \r
+ {\r
+ pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>\r
+ };\r
+\r
+ callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);\r
+ }\r
+\r
+ template void pyrDown_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+\r
+ template void pyrDown_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+\r
+ template void pyrDown_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+\r
+ template void pyrDown_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+\r
+ template void pyrDown_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+\r
+ template void pyrDown_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrDown_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ } // namespace imgproc\r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "opencv2/gpu/device/vec_math.hpp"\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc {\r
-\r
-template <typename T, typename B> __global__ void pyrUp(const PtrStep<T> src, DevMem2D_<T> dst, const B b)\r
+namespace cv { namespace gpu { namespace device \r
{\r
- typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;\r
+ namespace imgproc \r
+ {\r
+ template <typename T, typename B> __global__ void pyrUp(const PtrStep<T> src, DevMem2D_<T> dst, const B b)\r
+ {\r
+ typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;\r
\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- __shared__ T smem1[10][10];\r
- __shared__ value_type smem2[20][16];\r
+ __shared__ T smem1[10][10];\r
+ __shared__ value_type smem2[20][16];\r
\r
- value_type sum;\r
+ value_type sum;\r
\r
- if (threadIdx.x < 10 && threadIdx.y < 10)\r
- smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);\r
+ if (threadIdx.x < 10 && threadIdx.y < 10)\r
+ smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);\r
\r
- __syncthreads();\r
+ __syncthreads();\r
\r
- const int tidx = threadIdx.x;\r
+ const int tidx = threadIdx.x;\r
\r
- sum = VecTraits<value_type>::all(0);\r
+ sum = VecTraits<value_type>::all(0);\r
\r
- sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];\r
- sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];\r
- sum = sum + (tidx % 2 == 0) * 0.375f * smem1[1 + threadIdx.y / 2][1 + ((tidx ) >> 1)];\r
- sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];\r
- sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];\r
+ sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];\r
+ sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];\r
+ sum = sum + (tidx % 2 == 0) * 0.375f * smem1[1 + threadIdx.y / 2][1 + ((tidx ) >> 1)];\r
+ sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];\r
+ sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];\r
\r
- smem2[2 + threadIdx.y][tidx] = sum;\r
+ smem2[2 + threadIdx.y][tidx] = sum;\r
\r
- if (threadIdx.y < 2)\r
- {\r
- sum = VecTraits<value_type>::all(0);\r
+ if (threadIdx.y < 2)\r
+ {\r
+ sum = VecTraits<value_type>::all(0);\r
\r
- sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];\r
- sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx - 1) >> 1)];\r
- sum = sum + (tidx % 2 == 0) * 0.375f * smem1[0][1 + ((tidx ) >> 1)];\r
- sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx + 1) >> 1)];\r
- sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];\r
+ sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];\r
+ sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx - 1) >> 1)];\r
+ sum = sum + (tidx % 2 == 0) * 0.375f * smem1[0][1 + ((tidx ) >> 1)];\r
+ sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx + 1) >> 1)];\r
+ sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];\r
\r
- smem2[threadIdx.y][tidx] = sum;\r
- }\r
+ smem2[threadIdx.y][tidx] = sum;\r
+ }\r
\r
- if (threadIdx.y > 13)\r
- {\r
- sum = VecTraits<value_type>::all(0);\r
+ if (threadIdx.y > 13)\r
+ {\r
+ sum = VecTraits<value_type>::all(0);\r
\r
- sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];\r
- sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx - 1) >> 1)];\r
- sum = sum + (tidx % 2 == 0) * 0.375f * smem1[9][1 + ((tidx ) >> 1)];\r
- sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx + 1) >> 1)];\r
- sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];\r
+ sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];\r
+ sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx - 1) >> 1)];\r
+ sum = sum + (tidx % 2 == 0) * 0.375f * smem1[9][1 + ((tidx ) >> 1)];\r
+ sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx + 1) >> 1)];\r
+ sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];\r
\r
- smem2[4 + threadIdx.y][tidx] = sum;\r
- }\r
+ smem2[4 + threadIdx.y][tidx] = sum;\r
+ }\r
\r
- __syncthreads();\r
+ __syncthreads();\r
\r
- sum = VecTraits<value_type>::all(0);\r
+ sum = VecTraits<value_type>::all(0);\r
\r
- sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx];\r
- sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y - 1][tidx];\r
- sum = sum + (tidx % 2 == 0) * 0.375f * smem2[2 + threadIdx.y ][tidx];\r
- sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y + 1][tidx];\r
- sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];\r
+ sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx];\r
+ sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y - 1][tidx];\r
+ sum = sum + (tidx % 2 == 0) * 0.375f * smem2[2 + threadIdx.y ][tidx];\r
+ sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y + 1][tidx];\r
+ sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];\r
\r
- if (x < dst.cols && y < dst.rows)\r
- dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);\r
-}\r
+ if (x < dst.cols && y < dst.rows)\r
+ dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);\r
+ }\r
\r
-template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)\r
-{\r
- const dim3 block(16, 16);\r
- const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+ template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+ {\r
+ const dim3 block(16, 16);\r
+ const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
\r
- B<T> b(src.rows, src.cols);\r
+ B<T> b(src.rows, src.cols);\r
\r
- pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);\r
- cudaSafeCall( cudaGetLastError() );\r
+ pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
-template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)\r
-{\r
- typedef typename TypeVec<T, cn>::vec_type type;\r
-\r
- typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);\r
-\r
- static const caller_t callers[] = \r
- {\r
- pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>\r
- };\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);\r
-}\r
+ template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)\r
+ {\r
+ typedef typename TypeVec<T, cn>::vec_type type;\r
\r
-template void pyrUp_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);\r
\r
-template void pyrUp_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ static const caller_t callers[] = \r
+ {\r
+ pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>\r
+ };\r
\r
-template void pyrUp_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);\r
+ }\r
\r
-template void pyrUp_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
\r
-template void pyrUp_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
\r
-template void pyrUp_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-template void pyrUp_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
\r
-} // namespace imgproc\r
+ template void pyrUp_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template void pyrUp_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+\r
+ template void pyrUp_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ template void pyrUp_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ } // namespace imgproc\r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
#include "opencv2/gpu/device/filters.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc {\r
- \r
-template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)\r
-{\r
- const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
- const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
-\r
- if (x < dst.cols && y < dst.rows)\r
- {\r
- const float xcoo = mapx.ptr(y)[x];\r
- const float ycoo = mapy.ptr(y)[x];\r
-\r
- dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));\r
- }\r
-}\r
-\r
-template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream\r
+namespace cv { namespace gpu { namespace device \r
{\r
- static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, \r
- const float* borderValue, cudaStream_t stream, int)\r
- {\r
- typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; \r
- \r
- dim3 block(32, 8);\r
- dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
-\r
- B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));\r
- BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);\r
- Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);\r
-\r
- remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);\r
- cudaSafeCall( cudaGetLastError() );\r
- }\r
-};\r
-\r
-template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream\r
-{\r
- static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, int)\r
- {\r
- typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; \r
- \r
- dim3 block(32, 8);\r
- dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
-\r
- B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));\r
- BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);\r
- Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);\r
-\r
- remap<<<grid, block>>>(filter_src, mapx, mapy, dst);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
- }\r
-};\r
-\r
-#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \\r
- texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \\r
- struct tex_remap_ ## type ## _reader \\r
- { \\r
- typedef type elem_type; \\r
- typedef int index_type; \\r
- __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \\r
- { \\r
- return tex2D(tex_remap_ ## type , x, y); \\r
- } \\r
- }; \\r
- template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \\r
- { \\r
- static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float* borderValue, int cc) \\r
- { \\r
- typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \\r
- dim3 block(32, cc >= 20 ? 8 : 4); \\r
- dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \\r
- bindTexture(&tex_remap_ ## type , src); \\r
- tex_remap_ ## type ##_reader texSrc; \\r
- B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \\r
- BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \\r
- Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \\r
- remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \\r
- cudaSafeCall( cudaGetLastError() ); \\r
- cudaSafeCall( cudaDeviceSynchronize() ); \\r
- } \\r
- }; \\r
- template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \\r
- { \\r
- static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float*, int) \\r
- { \\r
- dim3 block(32, 8); \\r
- dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \\r
- bindTexture(&tex_remap_ ## type , src); \\r
- tex_remap_ ## type ##_reader texSrc; \\r
- Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \\r
- remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \\r
- cudaSafeCall( cudaGetLastError() ); \\r
- cudaSafeCall( cudaDeviceSynchronize() ); \\r
- } \\r
- };\r
- \r
-OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)\r
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)\r
-OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)\r
-\r
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)\r
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)\r
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)\r
-\r
-OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)\r
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)\r
-OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)\r
-\r
-OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)\r
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)\r
-OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)\r
-\r
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)\r
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)\r
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)\r
-\r
-OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)\r
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)\r
-OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX\r
-\r
-template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher\r
-{ \r
- static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, \r
- const float* borderValue, cudaStream_t stream, int cc)\r
- {\r
- if (stream == 0)\r
- RemapDispatcherNonStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, cc);\r
- else\r
- RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);\r
- }\r
-};\r
-\r
-template <typename T> void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, \r
- int borderMode, const float* borderValue, cudaStream_t stream, int cc)\r
-{\r
- typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst, \r
- const float* borderValue, cudaStream_t stream, int cc);\r
+ namespace imgproc \r
+ { \r
+ template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)\r
+ {\r
+ const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+ const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+ if (x < dst.cols && y < dst.rows)\r
+ {\r
+ const float xcoo = mapx.ptr(y)[x];\r
+ const float ycoo = mapy.ptr(y)[x];\r
+\r
+ dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));\r
+ }\r
+ }\r
\r
- static const caller_t callers[3][5] = \r
- {\r
- { \r
- RemapDispatcher<PointFilter, BrdReflect101, T>::call, \r
- RemapDispatcher<PointFilter, BrdReplicate, T>::call, \r
- RemapDispatcher<PointFilter, BrdConstant, T>::call, \r
- RemapDispatcher<PointFilter, BrdReflect, T>::call, \r
- RemapDispatcher<PointFilter, BrdWrap, T>::call \r
- },\r
+ template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream\r
+ {\r
+ static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, \r
+ const float* borderValue, cudaStream_t stream, int)\r
+ {\r
+ typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; \r
+ \r
+ dim3 block(32, 8);\r
+ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+\r
+ B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));\r
+ BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);\r
+ Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);\r
+\r
+ remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
+ }\r
+ };\r
+\r
+ template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream\r
+ {\r
+ static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, int)\r
+ {\r
+ typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; \r
+ \r
+ dim3 block(32, 8);\r
+ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+\r
+ B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));\r
+ BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);\r
+ Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);\r
+\r
+ remap<<<grid, block>>>(filter_src, mapx, mapy, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+ };\r
+\r
+ #define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \\r
+ texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \\r
+ struct tex_remap_ ## type ## _reader \\r
+ { \\r
+ typedef type elem_type; \\r
+ typedef int index_type; \\r
+ __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \\r
+ { \\r
+ return tex2D(tex_remap_ ## type , x, y); \\r
+ } \\r
+ }; \\r
+ template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \\r
+ { \\r
+ static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float* borderValue, int cc) \\r
+ { \\r
+ typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \\r
+ dim3 block(32, cc >= 20 ? 8 : 4); \\r
+ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \\r
+ bindTexture(&tex_remap_ ## type , src); \\r
+ tex_remap_ ## type ##_reader texSrc; \\r
+ B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \\r
+ BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \\r
+ Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \\r
+ remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \\r
+ cudaSafeCall( cudaGetLastError() ); \\r
+ cudaSafeCall( cudaDeviceSynchronize() ); \\r
+ } \\r
+ }; \\r
+ template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \\r
+ { \\r
+ static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float*, int) \\r
+ { \\r
+ dim3 block(32, 8); \\r
+ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \\r
+ bindTexture(&tex_remap_ ## type , src); \\r
+ tex_remap_ ## type ##_reader texSrc; \\r
+ Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \\r
+ remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \\r
+ cudaSafeCall( cudaGetLastError() ); \\r
+ cudaSafeCall( cudaDeviceSynchronize() ); \\r
+ } \\r
+ };\r
+ \r
+ OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)\r
+ //OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)\r
+ OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)\r
+\r
+ //OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)\r
+ //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)\r
+ //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)\r
+\r
+ OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)\r
+ //OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)\r
+ OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)\r
+\r
+ OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)\r
+ //OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)\r
+ OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)\r
+\r
+ //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)\r
+ //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)\r
+ //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)\r
+\r
+ OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)\r
+ //OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)\r
+ OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_REMAP_TEX\r
+\r
+ template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher\r
{ \r
- RemapDispatcher<LinearFilter, BrdReflect101, T>::call, \r
- RemapDispatcher<LinearFilter, BrdReplicate, T>::call, \r
- RemapDispatcher<LinearFilter, BrdConstant, T>::call, \r
- RemapDispatcher<LinearFilter, BrdReflect, T>::call, \r
- RemapDispatcher<LinearFilter, BrdWrap, T>::call \r
- },\r
- { \r
- RemapDispatcher<CubicFilter, BrdReflect101, T>::call, \r
- RemapDispatcher<CubicFilter, BrdReplicate, T>::call, \r
- RemapDispatcher<CubicFilter, BrdConstant, T>::call, \r
- RemapDispatcher<CubicFilter, BrdReflect, T>::call, \r
- RemapDispatcher<CubicFilter, BrdWrap, T>::call \r
+ static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, \r
+ const float* borderValue, cudaStream_t stream, int cc)\r
+ {\r
+ if (stream == 0)\r
+ RemapDispatcherNonStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, cc);\r
+ else\r
+ RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);\r
+ }\r
+ };\r
+\r
+ template <typename T> void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, \r
+ int borderMode, const float* borderValue, cudaStream_t stream, int cc)\r
+ {\r
+ typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst, \r
+ const float* borderValue, cudaStream_t stream, int cc);\r
+\r
+ static const caller_t callers[3][5] = \r
+ {\r
+ { \r
+ RemapDispatcher<PointFilter, BrdReflect101, T>::call, \r
+ RemapDispatcher<PointFilter, BrdReplicate, T>::call, \r
+ RemapDispatcher<PointFilter, BrdConstant, T>::call, \r
+ RemapDispatcher<PointFilter, BrdReflect, T>::call, \r
+ RemapDispatcher<PointFilter, BrdWrap, T>::call \r
+ },\r
+ { \r
+ RemapDispatcher<LinearFilter, BrdReflect101, T>::call, \r
+ RemapDispatcher<LinearFilter, BrdReplicate, T>::call, \r
+ RemapDispatcher<LinearFilter, BrdConstant, T>::call, \r
+ RemapDispatcher<LinearFilter, BrdReflect, T>::call, \r
+ RemapDispatcher<LinearFilter, BrdWrap, T>::call \r
+ },\r
+ { \r
+ RemapDispatcher<CubicFilter, BrdReflect101, T>::call, \r
+ RemapDispatcher<CubicFilter, BrdReplicate, T>::call, \r
+ RemapDispatcher<CubicFilter, BrdConstant, T>::call, \r
+ RemapDispatcher<CubicFilter, BrdReflect, T>::call, \r
+ RemapDispatcher<CubicFilter, BrdWrap, T>::call \r
+ }\r
+ };\r
+\r
+ callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc);\r
}\r
- };\r
-\r
- callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc);\r
-}\r
-\r
-template void remap_gpu<uchar >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-//template void remap_gpu<uchar2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-template void remap_gpu<uchar3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-template void remap_gpu<uchar4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-\r
-//template void remap_gpu<schar>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-//template void remap_gpu<char2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-//template void remap_gpu<char3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-//template void remap_gpu<char4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-\r
-template void remap_gpu<ushort >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-//template void remap_gpu<ushort2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-template void remap_gpu<ushort3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-template void remap_gpu<ushort4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-\r
-template void remap_gpu<short >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-//template void remap_gpu<short2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-template void remap_gpu<short3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-template void remap_gpu<short4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-\r
-//template void remap_gpu<int >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-//template void remap_gpu<int2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-//template void remap_gpu<int3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-//template void remap_gpu<int4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-\r
-template void remap_gpu<float >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-//template void remap_gpu<float2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-template void remap_gpu<float3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-template void remap_gpu<float4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-\r
-} // namespace imgproc\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template void remap_gpu<uchar >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ //template void remap_gpu<uchar2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ template void remap_gpu<uchar3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ template void remap_gpu<uchar4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+\r
+ //template void remap_gpu<schar>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ //template void remap_gpu<char2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ //template void remap_gpu<char3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ //template void remap_gpu<char4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+\r
+ template void remap_gpu<ushort >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ //template void remap_gpu<ushort2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ template void remap_gpu<ushort3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ template void remap_gpu<ushort4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+\r
+ template void remap_gpu<short >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ //template void remap_gpu<short2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ template void remap_gpu<short3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ template void remap_gpu<short4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+\r
+ //template void remap_gpu<int >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ //template void remap_gpu<int2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ //template void remap_gpu<int3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ //template void remap_gpu<int4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+\r
+ template void remap_gpu<float >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ //template void remap_gpu<float2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ template void remap_gpu<float3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ template void remap_gpu<float4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ } // namespace imgproc\r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
#include "opencv2/gpu/device/filters.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc {\r
- \r
-template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)\r
-{\r
- const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
- const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
-\r
- if (x < dst.cols && y < dst.rows)\r
- {\r
- const float xcoo = x / fx;\r
- const float ycoo = y / fy;\r
-\r
- dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));\r
- }\r
-}\r
-template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)\r
-{\r
- const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
- const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
-\r
- if (x < dst.cols && y < dst.rows)\r
- {\r
- const float xcoo = x / fx;\r
- const float ycoo = y / fy;\r
-\r
- dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo));\r
- }\r
-}\r
-\r
-template <template <typename> class Filter, typename T> struct ResizeDispatcherStream\r
-{\r
- static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)\r
- { \r
- dim3 block(32, 8);\r
- dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
-\r
- BrdReplicate<T> brd(src.rows, src.cols);\r
- BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);\r
- Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);\r
-\r
- resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);\r
- cudaSafeCall( cudaGetLastError() );\r
- }\r
-};\r
-template <typename T> struct ResizeDispatcherStream<PointFilter, T>\r
-{\r
- static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)\r
- { \r
- dim3 block(32, 8);\r
- dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
-\r
- BrdReplicate<T> brd(src.rows, src.cols);\r
- BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);\r
-\r
- resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);\r
- cudaSafeCall( cudaGetLastError() );\r
- }\r
-};\r
-\r
-template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream\r
-{\r
- static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)\r
- { \r
- dim3 block(32, 8);\r
- dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
-\r
- BrdReplicate<T> brd(src.rows, src.cols);\r
- BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);\r
- Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);\r
-\r
- resize<<<grid, block>>>(filter_src, fx, fy, dst);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
- }\r
-};\r
-template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>\r
+namespace cv { namespace gpu { namespace device \r
{\r
- static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)\r
- { \r
- dim3 block(32, 8);\r
- dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
-\r
- BrdReplicate<T> brd(src.rows, src.cols);\r
- BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);\r
-\r
- resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
- }\r
-};\r
-\r
-#define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \\r
- texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \\r
- struct tex_resize_ ## type ## _reader \\r
- { \\r
- typedef type elem_type; \\r
- typedef int index_type; \\r
- __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \\r
- { \\r
- return tex2D(tex_resize_ ## type , x, y); \\r
- } \\r
- }; \\r
- template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type> \\r
- { \\r
- static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \\r
- { \\r
- dim3 block(32, 8); \\r
- dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \\r
- bindTexture(&tex_resize_ ## type , src); \\r
- tex_resize_ ## type ##_reader texSrc; \\r
- Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \\r
- resize<<<grid, block>>>(filter_src, fx, fy, dst); \\r
- cudaSafeCall( cudaGetLastError() ); \\r
- cudaSafeCall( cudaDeviceSynchronize() ); \\r
- } \\r
- }; \\r
- template <> struct ResizeDispatcherNonStream<PointFilter, type> \\r
- { \\r
- static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \\r
- { \\r
- dim3 block(32, 8); \\r
- dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \\r
- bindTexture(&tex_resize_ ## type , src); \\r
- tex_resize_ ## type ##_reader texSrc; \\r
- resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \\r
- cudaSafeCall( cudaGetLastError() ); \\r
- cudaSafeCall( cudaDeviceSynchronize() ); \\r
- } \\r
- };\r
- \r
-OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)\r
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)\r
-OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)\r
-\r
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)\r
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)\r
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)\r
-\r
-OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)\r
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)\r
-OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)\r
-\r
-OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)\r
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)\r
-OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)\r
-\r
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)\r
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)\r
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)\r
-\r
-OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)\r
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)\r
-OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX\r
-\r
-template <template <typename> class Filter, typename T> struct ResizeDispatcher\r
-{ \r
- static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)\r
- {\r
- if (stream == 0)\r
- ResizeDispatcherNonStream<Filter, T>::call(src, fx, fy, dst);\r
- else\r
- ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);\r
- }\r
-};\r
-\r
-template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream)\r
-{\r
- typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);\r
-\r
- static const caller_t callers[3] = \r
- {\r
- ResizeDispatcher<PointFilter, T>::call, ResizeDispatcher<LinearFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call\r
- };\r
-\r
- callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream);\r
-}\r
-\r
-template void resize_gpu<uchar >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-//template void resize_gpu<uchar2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-template void resize_gpu<uchar3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-template void resize_gpu<uchar4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-\r
-//template void resize_gpu<schar>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-//template void resize_gpu<char2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-//template void resize_gpu<char3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-//template void resize_gpu<char4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-\r
-template void resize_gpu<ushort >(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-//template void resize_gpu<ushort2>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-template void resize_gpu<ushort3>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-template void resize_gpu<ushort4>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-\r
-template void resize_gpu<short >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-//template void resize_gpu<short2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-template void resize_gpu<short3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-template void resize_gpu<short4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-\r
-//template void resize_gpu<int >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-//template void resize_gpu<int2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-//template void resize_gpu<int3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-//template void resize_gpu<int4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-\r
-template void resize_gpu<float >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-//template void resize_gpu<float2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-template void resize_gpu<float3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-template void resize_gpu<float4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-\r
-} // namespace imgproc\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace imgproc \r
+ { \r
+ template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)\r
+ {\r
+ const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+ const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+ if (x < dst.cols && y < dst.rows)\r
+ {\r
+ const float xcoo = x / fx;\r
+ const float ycoo = y / fy;\r
+\r
+ dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));\r
+ }\r
+ }\r
+ template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)\r
+ {\r
+ const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+ const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+ if (x < dst.cols && y < dst.rows)\r
+ {\r
+ const float xcoo = x / fx;\r
+ const float ycoo = y / fy;\r
+\r
+ dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo));\r
+ }\r
+ }\r
+\r
+ template <template <typename> class Filter, typename T> struct ResizeDispatcherStream\r
+ {\r
+ static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+ { \r
+ dim3 block(32, 8);\r
+ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+\r
+ BrdReplicate<T> brd(src.rows, src.cols);\r
+ BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);\r
+ Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);\r
+\r
+ resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
+ }\r
+ };\r
+ template <typename T> struct ResizeDispatcherStream<PointFilter, T>\r
+ {\r
+ static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+ { \r
+ dim3 block(32, 8);\r
+ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+\r
+ BrdReplicate<T> brd(src.rows, src.cols);\r
+ BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);\r
+\r
+ resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
+ }\r
+ };\r
+\r
+ template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream\r
+ {\r
+ static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)\r
+ { \r
+ dim3 block(32, 8);\r
+ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+\r
+ BrdReplicate<T> brd(src.rows, src.cols);\r
+ BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);\r
+ Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);\r
+\r
+ resize<<<grid, block>>>(filter_src, fx, fy, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+ };\r
+ template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>\r
+ {\r
+ static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)\r
+ { \r
+ dim3 block(32, 8);\r
+ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+\r
+ BrdReplicate<T> brd(src.rows, src.cols);\r
+ BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);\r
+\r
+ resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+ };\r
+\r
+ #define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \\r
+ texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \\r
+ struct tex_resize_ ## type ## _reader \\r
+ { \\r
+ typedef type elem_type; \\r
+ typedef int index_type; \\r
+ __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \\r
+ { \\r
+ return tex2D(tex_resize_ ## type , x, y); \\r
+ } \\r
+ }; \\r
+ template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type> \\r
+ { \\r
+ static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \\r
+ { \\r
+ dim3 block(32, 8); \\r
+ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \\r
+ bindTexture(&tex_resize_ ## type , src); \\r
+ tex_resize_ ## type ##_reader texSrc; \\r
+ Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \\r
+ resize<<<grid, block>>>(filter_src, fx, fy, dst); \\r
+ cudaSafeCall( cudaGetLastError() ); \\r
+ cudaSafeCall( cudaDeviceSynchronize() ); \\r
+ } \\r
+ }; \\r
+ template <> struct ResizeDispatcherNonStream<PointFilter, type> \\r
+ { \\r
+ static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \\r
+ { \\r
+ dim3 block(32, 8); \\r
+ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \\r
+ bindTexture(&tex_resize_ ## type , src); \\r
+ tex_resize_ ## type ##_reader texSrc; \\r
+ resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \\r
+ cudaSafeCall( cudaGetLastError() ); \\r
+ cudaSafeCall( cudaDeviceSynchronize() ); \\r
+ } \\r
+ };\r
+ \r
+ OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)\r
+ //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)\r
+ OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)\r
+\r
+ //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)\r
+ //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)\r
+ //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)\r
+\r
+ OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)\r
+ //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)\r
+ OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)\r
+\r
+ OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)\r
+ //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)\r
+ OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)\r
+\r
+ //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)\r
+ //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)\r
+ //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)\r
+\r
+ OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)\r
+ //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)\r
+ OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX\r
+\r
+ template <template <typename> class Filter, typename T> struct ResizeDispatcher\r
+ { \r
+ static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+ {\r
+ if (stream == 0)\r
+ ResizeDispatcherNonStream<Filter, T>::call(src, fx, fy, dst);\r
+ else\r
+ ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);\r
+ }\r
+ };\r
+\r
+ template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream)\r
+ {\r
+ typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);\r
+\r
+ static const caller_t callers[3] = \r
+ {\r
+ ResizeDispatcher<PointFilter, T>::call, ResizeDispatcher<LinearFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call\r
+ };\r
+\r
+ callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream);\r
+ }\r
+\r
+ template void resize_gpu<uchar >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ //template void resize_gpu<uchar2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ template void resize_gpu<uchar3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ template void resize_gpu<uchar4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+\r
+ //template void resize_gpu<schar>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ //template void resize_gpu<char2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ //template void resize_gpu<char3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ //template void resize_gpu<char4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+\r
+ template void resize_gpu<ushort >(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ //template void resize_gpu<ushort2>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ template void resize_gpu<ushort3>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ template void resize_gpu<ushort4>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+\r
+ template void resize_gpu<short >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ //template void resize_gpu<short2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ template void resize_gpu<short3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ template void resize_gpu<short4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+\r
+ //template void resize_gpu<int >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ //template void resize_gpu<int2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ //template void resize_gpu<int3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ //template void resize_gpu<int4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+\r
+ template void resize_gpu<float >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ //template void resize_gpu<float2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ template void resize_gpu<float3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ template void resize_gpu<float4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ } // namespace imgproc\r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "opencv2/gpu/device/limits.hpp"\r
#include "opencv2/gpu/device/border_interpolate.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-#define MAX_KERNEL_SIZE 16\r
-#define BLOCK_DIM_X 16\r
-#define BLOCK_DIM_Y 4\r
-#define RESULT_STEPS 8\r
-#define HALO_STEPS 1\r
-\r
-namespace row_filter {\r
-\r
-__constant__ float c_kernel[MAX_KERNEL_SIZE];\r
-\r
-void loadKernel(const float kernel[], int ksize)\r
+namespace cv { namespace gpu { namespace device \r
{\r
- cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );\r
-}\r
+ #define MAX_KERNEL_SIZE 16\r
+ #define BLOCK_DIM_X 16\r
+ #define BLOCK_DIM_Y 4\r
+ #define RESULT_STEPS 8\r
+ #define HALO_STEPS 1\r
\r
-namespace detail\r
-{\r
- template <typename T, size_t size> struct SmemType\r
+ namespace row_filter \r
{\r
- typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;\r
- };\r
+ __constant__ float c_kernel[MAX_KERNEL_SIZE];\r
\r
- template <typename T> struct SmemType<T, 4>\r
- {\r
- typedef T smem_t;\r
- };\r
-}\r
+ void loadKernel(const float kernel[], int ksize)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );\r
+ }\r
\r
-template <typename T> struct SmemType\r
-{\r
- typedef typename detail::SmemType<T, sizeof(T)>::smem_t smem_t;\r
-};\r
+ namespace detail\r
+ {\r
+ template <typename T, size_t size> struct SmemType\r
+ {\r
+ typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;\r
+ };\r
+\r
+ template <typename T> struct SmemType<T, 4>\r
+ {\r
+ typedef T smem_t;\r
+ };\r
+ }\r
\r
-template <int KERNEL_SIZE, typename T, typename D, typename B>\r
-__global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)\r
-{\r
- typedef typename SmemType<T>::smem_t smem_t;\r
- typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;\r
+ template <typename T> struct SmemType\r
+ {\r
+ typedef typename detail::SmemType<T, sizeof(T)>::smem_t smem_t;\r
+ };\r
\r
- __shared__ smem_t smem[BLOCK_DIM_Y][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_X];\r
+ template <int KERNEL_SIZE, typename T, typename D, typename B>\r
+ __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)\r
+ {\r
+ typedef typename SmemType<T>::smem_t smem_t;\r
+ typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;\r
\r
- //Offset to the left halo edge\r
- const int x = (blockIdx.x * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_X + threadIdx.x;\r
- const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;\r
+ __shared__ smem_t smem[BLOCK_DIM_Y][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_X];\r
\r
- if (y < src.rows)\r
- {\r
- const T* src_row = src.ptr(y);\r
+ //Offset to the left halo edge\r
+ const int x = (blockIdx.x * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_X + threadIdx.x;\r
+ const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;\r
\r
- //Load main data\r
- #pragma unroll\r
- for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)\r
- smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);\r
+ if (y < src.rows)\r
+ {\r
+ const T* src_row = src.ptr(y);\r
\r
- //Load left halo\r
- #pragma unroll\r
- for(int i = 0; i < HALO_STEPS; ++i)\r
- smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_low(i * BLOCK_DIM_X + x, src_row);\r
+ //Load main data\r
+ #pragma unroll\r
+ for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)\r
+ smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);\r
\r
- //Load right halo\r
- #pragma unroll\r
- for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)\r
- smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);\r
+ //Load left halo\r
+ #pragma unroll\r
+ for(int i = 0; i < HALO_STEPS; ++i)\r
+ smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_low(i * BLOCK_DIM_X + x, src_row);\r
\r
- __syncthreads();\r
+ //Load right halo\r
+ #pragma unroll\r
+ for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)\r
+ smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);\r
\r
- D* dst_row = dst.ptr(y);\r
+ __syncthreads();\r
\r
- #pragma unroll\r
- for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)\r
- {\r
- sum_t sum = VecTraits<sum_t>::all(0);\r
+ D* dst_row = dst.ptr(y);\r
+\r
+ #pragma unroll\r
+ for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)\r
+ {\r
+ sum_t sum = VecTraits<sum_t>::all(0);\r
\r
- #pragma unroll\r
- for (int j = 0; j < KERNEL_SIZE; ++j)\r
- sum = sum + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X + j - anchor] * c_kernel[j];\r
+ #pragma unroll\r
+ for (int j = 0; j < KERNEL_SIZE; ++j)\r
+ sum = sum + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X + j - anchor] * c_kernel[j];\r
\r
- int dstX = x + i * BLOCK_DIM_X;\r
+ int dstX = x + i * BLOCK_DIM_X;\r
\r
- if (dstX < src.cols)\r
- dst_row[dstX] = saturate_cast<D>(sum);\r
+ if (dstX < src.cols)\r
+ dst_row[dstX] = saturate_cast<D>(sum);\r
+ }\r
+ }\r
}\r
- }\r
-}\r
\r
-template <int ksize, typename T, typename D, template<typename> class B>\r
-void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)\r
-{\r
- typedef typename SmemType<T>::smem_t smem_t;\r
+ template <int ksize, typename T, typename D, template<typename> class B>\r
+ void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)\r
+ {\r
+ typedef typename SmemType<T>::smem_t smem_t;\r
\r
- const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);\r
- const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));\r
+ const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);\r
+ const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));\r
\r
- B<smem_t> b(src.cols);\r
+ B<smem_t> b(src.cols);\r
\r
- linearRowFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);\r
- cudaSafeCall( cudaGetLastError() );\r
+ linearRowFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-template <typename T, typename D>\r
-void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)\r
-{\r
- typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);\r
- static const caller_t callers[5][17] = \r
- {\r
+ template <typename T, typename D>\r
+ void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)\r
{\r
- 0, \r
- linearRowFilter_caller<1 , T, D, BrdRowReflect101>, \r
- linearRowFilter_caller<2 , T, D, BrdRowReflect101>,\r
- linearRowFilter_caller<3 , T, D, BrdRowReflect101>, \r
- linearRowFilter_caller<4 , T, D, BrdRowReflect101>, \r
- linearRowFilter_caller<5 , T, D, BrdRowReflect101>, \r
- linearRowFilter_caller<6 , T, D, BrdRowReflect101>, \r
- linearRowFilter_caller<7 , T, D, BrdRowReflect101>,\r
- linearRowFilter_caller<8 , T, D, BrdRowReflect101>,\r
- linearRowFilter_caller<9 , T, D, BrdRowReflect101>, \r
- linearRowFilter_caller<10, T, D, BrdRowReflect101>, \r
- linearRowFilter_caller<11, T, D, BrdRowReflect101>, \r
- linearRowFilter_caller<12, T, D, BrdRowReflect101>, \r
- linearRowFilter_caller<13, T, D, BrdRowReflect101>, \r
- linearRowFilter_caller<14, T, D, BrdRowReflect101>,\r
- linearRowFilter_caller<15, T, D, BrdRowReflect101>, \r
- linearRowFilter_caller<16, T, D, BrdRowReflect101>\r
- },\r
- {\r
- 0, \r
- linearRowFilter_caller<1 , T, D, BrdRowReplicate>, \r
- linearRowFilter_caller<2 , T, D, BrdRowReplicate>,\r
- linearRowFilter_caller<3 , T, D, BrdRowReplicate>, \r
- linearRowFilter_caller<4 , T, D, BrdRowReplicate>, \r
- linearRowFilter_caller<5 , T, D, BrdRowReplicate>, \r
- linearRowFilter_caller<6 , T, D, BrdRowReplicate>, \r
- linearRowFilter_caller<7 , T, D, BrdRowReplicate>, \r
- linearRowFilter_caller<8 , T, D, BrdRowReplicate>,\r
- linearRowFilter_caller<9 , T, D, BrdRowReplicate>, \r
- linearRowFilter_caller<10, T, D, BrdRowReplicate>, \r
- linearRowFilter_caller<11, T, D, BrdRowReplicate>, \r
- linearRowFilter_caller<12, T, D, BrdRowReplicate>, \r
- linearRowFilter_caller<13, T, D, BrdRowReplicate>, \r
- linearRowFilter_caller<14, T, D, BrdRowReplicate>,\r
- linearRowFilter_caller<15, T, D, BrdRowReplicate>, \r
- linearRowFilter_caller<16, T, D, BrdRowReplicate>\r
- },\r
- {\r
- 0, \r
- linearRowFilter_caller<1 , T, D, BrdRowConstant>, \r
- linearRowFilter_caller<2 , T, D, BrdRowConstant>,\r
- linearRowFilter_caller<3 , T, D, BrdRowConstant>, \r
- linearRowFilter_caller<4 , T, D, BrdRowConstant>, \r
- linearRowFilter_caller<5 , T, D, BrdRowConstant>, \r
- linearRowFilter_caller<6 , T, D, BrdRowConstant>, \r
- linearRowFilter_caller<7 , T, D, BrdRowConstant>, \r
- linearRowFilter_caller<8 , T, D, BrdRowConstant>,\r
- linearRowFilter_caller<9 , T, D, BrdRowConstant>,\r
- linearRowFilter_caller<10, T, D, BrdRowConstant>, \r
- linearRowFilter_caller<11, T, D, BrdRowConstant>, \r
- linearRowFilter_caller<12, T, D, BrdRowConstant>, \r
- linearRowFilter_caller<13, T, D, BrdRowConstant>,\r
- linearRowFilter_caller<14, T, D, BrdRowConstant>,\r
- linearRowFilter_caller<15, T, D, BrdRowConstant>, \r
- linearRowFilter_caller<16, T, D, BrdRowConstant>\r
- },\r
- {\r
- 0, \r
- linearRowFilter_caller<1 , T, D, BrdRowReflect>, \r
- linearRowFilter_caller<2 , T, D, BrdRowReflect>,\r
- linearRowFilter_caller<3 , T, D, BrdRowReflect>, \r
- linearRowFilter_caller<4 , T, D, BrdRowReflect>, \r
- linearRowFilter_caller<5 , T, D, BrdRowReflect>, \r
- linearRowFilter_caller<6 , T, D, BrdRowReflect>, \r
- linearRowFilter_caller<7 , T, D, BrdRowReflect>, \r
- linearRowFilter_caller<8 , T, D, BrdRowReflect>,\r
- linearRowFilter_caller<9 , T, D, BrdRowReflect>,\r
- linearRowFilter_caller<10, T, D, BrdRowReflect>, \r
- linearRowFilter_caller<11, T, D, BrdRowReflect>, \r
- linearRowFilter_caller<12, T, D, BrdRowReflect>, \r
- linearRowFilter_caller<13, T, D, BrdRowReflect>,\r
- linearRowFilter_caller<14, T, D, BrdRowReflect>,\r
- linearRowFilter_caller<15, T, D, BrdRowReflect>, \r
- linearRowFilter_caller<16, T, D, BrdRowReflect>\r
- },\r
- {\r
- 0, \r
- linearRowFilter_caller<1 , T, D, BrdRowWrap>, \r
- linearRowFilter_caller<2 , T, D, BrdRowWrap>,\r
- linearRowFilter_caller<3 , T, D, BrdRowWrap>, \r
- linearRowFilter_caller<4 , T, D, BrdRowWrap>, \r
- linearRowFilter_caller<5 , T, D, BrdRowWrap>, \r
- linearRowFilter_caller<6 , T, D, BrdRowWrap>, \r
- linearRowFilter_caller<7 , T, D, BrdRowWrap>, \r
- linearRowFilter_caller<8 , T, D, BrdRowWrap>,\r
- linearRowFilter_caller<9 , T, D, BrdRowWrap>,\r
- linearRowFilter_caller<10, T, D, BrdRowWrap>, \r
- linearRowFilter_caller<11, T, D, BrdRowWrap>, \r
- linearRowFilter_caller<12, T, D, BrdRowWrap>, \r
- linearRowFilter_caller<13, T, D, BrdRowWrap>,\r
- linearRowFilter_caller<14, T, D, BrdRowWrap>,\r
- linearRowFilter_caller<15, T, D, BrdRowWrap>, \r
- linearRowFilter_caller<16, T, D, BrdRowWrap>\r
+ typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);\r
+ static const caller_t callers[5][17] = \r
+ {\r
+ {\r
+ 0, \r
+ linearRowFilter_caller<1 , T, D, BrdRowReflect101>, \r
+ linearRowFilter_caller<2 , T, D, BrdRowReflect101>,\r
+ linearRowFilter_caller<3 , T, D, BrdRowReflect101>, \r
+ linearRowFilter_caller<4 , T, D, BrdRowReflect101>, \r
+ linearRowFilter_caller<5 , T, D, BrdRowReflect101>, \r
+ linearRowFilter_caller<6 , T, D, BrdRowReflect101>, \r
+ linearRowFilter_caller<7 , T, D, BrdRowReflect101>,\r
+ linearRowFilter_caller<8 , T, D, BrdRowReflect101>,\r
+ linearRowFilter_caller<9 , T, D, BrdRowReflect101>, \r
+ linearRowFilter_caller<10, T, D, BrdRowReflect101>, \r
+ linearRowFilter_caller<11, T, D, BrdRowReflect101>, \r
+ linearRowFilter_caller<12, T, D, BrdRowReflect101>, \r
+ linearRowFilter_caller<13, T, D, BrdRowReflect101>, \r
+ linearRowFilter_caller<14, T, D, BrdRowReflect101>,\r
+ linearRowFilter_caller<15, T, D, BrdRowReflect101>, \r
+ linearRowFilter_caller<16, T, D, BrdRowReflect101>\r
+ },\r
+ {\r
+ 0, \r
+ linearRowFilter_caller<1 , T, D, BrdRowReplicate>, \r
+ linearRowFilter_caller<2 , T, D, BrdRowReplicate>,\r
+ linearRowFilter_caller<3 , T, D, BrdRowReplicate>, \r
+ linearRowFilter_caller<4 , T, D, BrdRowReplicate>, \r
+ linearRowFilter_caller<5 , T, D, BrdRowReplicate>, \r
+ linearRowFilter_caller<6 , T, D, BrdRowReplicate>, \r
+ linearRowFilter_caller<7 , T, D, BrdRowReplicate>, \r
+ linearRowFilter_caller<8 , T, D, BrdRowReplicate>,\r
+ linearRowFilter_caller<9 , T, D, BrdRowReplicate>, \r
+ linearRowFilter_caller<10, T, D, BrdRowReplicate>, \r
+ linearRowFilter_caller<11, T, D, BrdRowReplicate>, \r
+ linearRowFilter_caller<12, T, D, BrdRowReplicate>, \r
+ linearRowFilter_caller<13, T, D, BrdRowReplicate>, \r
+ linearRowFilter_caller<14, T, D, BrdRowReplicate>,\r
+ linearRowFilter_caller<15, T, D, BrdRowReplicate>, \r
+ linearRowFilter_caller<16, T, D, BrdRowReplicate>\r
+ },\r
+ {\r
+ 0, \r
+ linearRowFilter_caller<1 , T, D, BrdRowConstant>, \r
+ linearRowFilter_caller<2 , T, D, BrdRowConstant>,\r
+ linearRowFilter_caller<3 , T, D, BrdRowConstant>, \r
+ linearRowFilter_caller<4 , T, D, BrdRowConstant>, \r
+ linearRowFilter_caller<5 , T, D, BrdRowConstant>, \r
+ linearRowFilter_caller<6 , T, D, BrdRowConstant>, \r
+ linearRowFilter_caller<7 , T, D, BrdRowConstant>, \r
+ linearRowFilter_caller<8 , T, D, BrdRowConstant>,\r
+ linearRowFilter_caller<9 , T, D, BrdRowConstant>,\r
+ linearRowFilter_caller<10, T, D, BrdRowConstant>, \r
+ linearRowFilter_caller<11, T, D, BrdRowConstant>, \r
+ linearRowFilter_caller<12, T, D, BrdRowConstant>, \r
+ linearRowFilter_caller<13, T, D, BrdRowConstant>,\r
+ linearRowFilter_caller<14, T, D, BrdRowConstant>,\r
+ linearRowFilter_caller<15, T, D, BrdRowConstant>, \r
+ linearRowFilter_caller<16, T, D, BrdRowConstant>\r
+ },\r
+ {\r
+ 0, \r
+ linearRowFilter_caller<1 , T, D, BrdRowReflect>, \r
+ linearRowFilter_caller<2 , T, D, BrdRowReflect>,\r
+ linearRowFilter_caller<3 , T, D, BrdRowReflect>, \r
+ linearRowFilter_caller<4 , T, D, BrdRowReflect>, \r
+ linearRowFilter_caller<5 , T, D, BrdRowReflect>, \r
+ linearRowFilter_caller<6 , T, D, BrdRowReflect>, \r
+ linearRowFilter_caller<7 , T, D, BrdRowReflect>, \r
+ linearRowFilter_caller<8 , T, D, BrdRowReflect>,\r
+ linearRowFilter_caller<9 , T, D, BrdRowReflect>,\r
+ linearRowFilter_caller<10, T, D, BrdRowReflect>, \r
+ linearRowFilter_caller<11, T, D, BrdRowReflect>, \r
+ linearRowFilter_caller<12, T, D, BrdRowReflect>, \r
+ linearRowFilter_caller<13, T, D, BrdRowReflect>,\r
+ linearRowFilter_caller<14, T, D, BrdRowReflect>,\r
+ linearRowFilter_caller<15, T, D, BrdRowReflect>, \r
+ linearRowFilter_caller<16, T, D, BrdRowReflect>\r
+ },\r
+ {\r
+ 0, \r
+ linearRowFilter_caller<1 , T, D, BrdRowWrap>, \r
+ linearRowFilter_caller<2 , T, D, BrdRowWrap>,\r
+ linearRowFilter_caller<3 , T, D, BrdRowWrap>, \r
+ linearRowFilter_caller<4 , T, D, BrdRowWrap>, \r
+ linearRowFilter_caller<5 , T, D, BrdRowWrap>, \r
+ linearRowFilter_caller<6 , T, D, BrdRowWrap>, \r
+ linearRowFilter_caller<7 , T, D, BrdRowWrap>, \r
+ linearRowFilter_caller<8 , T, D, BrdRowWrap>,\r
+ linearRowFilter_caller<9 , T, D, BrdRowWrap>,\r
+ linearRowFilter_caller<10, T, D, BrdRowWrap>, \r
+ linearRowFilter_caller<11, T, D, BrdRowWrap>, \r
+ linearRowFilter_caller<12, T, D, BrdRowWrap>, \r
+ linearRowFilter_caller<13, T, D, BrdRowWrap>,\r
+ linearRowFilter_caller<14, T, D, BrdRowWrap>,\r
+ linearRowFilter_caller<15, T, D, BrdRowWrap>, \r
+ linearRowFilter_caller<16, T, D, BrdRowWrap>\r
+ }\r
+ };\r
+ \r
+ loadKernel(kernel, ksize);\r
+\r
+ callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);\r
}\r
- };\r
- \r
- loadKernel(kernel, ksize);\r
-\r
- callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);\r
-}\r
-\r
-template void linearRowFilter_gpu<uchar , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-template void linearRowFilter_gpu<uchar4, float4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-//template void linearRowFilter_gpu<short , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-//template void linearRowFilter_gpu<short2, float2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-template void linearRowFilter_gpu<short3, float3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-template void linearRowFilter_gpu<int , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-template void linearRowFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-\r
-} // namespace row_filter\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template void linearRowFilter_gpu<uchar , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+ template void linearRowFilter_gpu<uchar4, float4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+ //template void linearRowFilter_gpu<short , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+ //template void linearRowFilter_gpu<short2, float2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+ template void linearRowFilter_gpu<short3, float3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+ template void linearRowFilter_gpu<int , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+ template void linearRowFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+ } // namespace row_filter\r
+}}} // namespace cv { namespace gpu { namespace device\r
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__)\r
#endif\r
\r
-namespace cv { namespace gpu {\r
-\r
-void error(const char *error_string, const char *file, const int line, const char *func = "");\r
-void nppError(int err, const char *file, const int line, const char *func = "");\r
-void ncvError(int err, const char *file, const int line, const char *func = "");\r
-void cufftError(int err, const char *file, const int line, const char *func = "");\r
-void cublasError(int err, const char *file, const int line, const char *func = "");\r
-\r
-static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")\r
+namespace cv { namespace gpu \r
{\r
- if (cudaSuccess != err)\r
- cv::gpu::error(cudaGetErrorString(err), file, line, func);\r
-}\r
+ void error(const char *error_string, const char *file, const int line, const char *func = "");\r
+ void nppError(int err, const char *file, const int line, const char *func = "");\r
+ void ncvError(int err, const char *file, const int line, const char *func = "");\r
+ void cufftError(int err, const char *file, const int line, const char *func = "");\r
+ void cublasError(int err, const char *file, const int line, const char *func = "");\r
\r
-static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")\r
-{\r
- if (err < 0)\r
- cv::gpu::nppError(err, file, line, func);\r
-}\r
+ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")\r
+ {\r
+ if (cudaSuccess != err)\r
+ cv::gpu::error(cudaGetErrorString(err), file, line, func);\r
+ }\r
\r
-static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")\r
-{\r
- if (NCV_SUCCESS != err)\r
- cv::gpu::ncvError(err, file, line, func);\r
-}\r
+ static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")\r
+ {\r
+ if (err < 0)\r
+ cv::gpu::nppError(err, file, line, func);\r
+ }\r
\r
-static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")\r
-{\r
- if (CUFFT_SUCCESS != err)\r
- cv::gpu::cufftError(err, file, line, func);\r
-}\r
+ static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")\r
+ {\r
+ if (NCV_SUCCESS != err)\r
+ cv::gpu::ncvError(err, file, line, func);\r
+ }\r
\r
-static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")\r
-{\r
- if (CUBLAS_STATUS_SUCCESS != err)\r
- cv::gpu::cublasError(err, file, line, func);\r
-}\r
+ static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")\r
+ {\r
+ if (CUFFT_SUCCESS != err)\r
+ cv::gpu::cufftError(err, file, line, func);\r
+ }\r
\r
+ static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")\r
+ {\r
+ if (CUBLAS_STATUS_SUCCESS != err)\r
+ cv::gpu::cublasError(err, file, line, func);\r
+ }\r
}}\r
\r
#endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */
\ No newline at end of file
\r
#include "internal_shared.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace split_merge {\r
-\r
-template <typename T, size_t elem_size = sizeof(T)>\r
-struct TypeTraits \r
-{\r
- typedef T type;\r
- typedef T type2;\r
- typedef T type3;\r
- typedef T type4;\r
-};\r
-\r
-template <typename T>\r
-struct TypeTraits<T, 1>\r
-{\r
- typedef char type;\r
- typedef char2 type2;\r
- typedef char3 type3;\r
- typedef char4 type4;\r
-};\r
-\r
-template <typename T>\r
-struct TypeTraits<T, 2>\r
-{\r
- typedef short type;\r
- typedef short2 type2;\r
- typedef short3 type3;\r
- typedef short4 type4;\r
-};\r
-\r
-template <typename T>\r
-struct TypeTraits<T, 4> \r
-{\r
- typedef int type;\r
- typedef int2 type2;\r
- typedef int3 type3;\r
- typedef int4 type4;\r
-};\r
-\r
-template <typename T>\r
-struct TypeTraits<T, 8> \r
-{\r
- typedef double type;\r
- typedef double2 type2;\r
- //typedef double3 type3;\r
- //typedef double4 type3;\r
-};\r
-\r
-typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);\r
-typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);\r
-\r
-//------------------------------------------------------------\r
-// Merge \r
-\r
-template <typename T>\r
-__global__ void mergeC2_(const uchar* src0, size_t src0_step, \r
- const uchar* src1, size_t src1_step, \r
- int rows, int cols, uchar* dst, size_t dst_step)\r
-{\r
- typedef typename TypeTraits<T>::type2 dst_type;\r
-\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- const T* src0_y = (const T*)(src0 + y * src0_step);\r
- const T* src1_y = (const T*)(src1 + y * src1_step);\r
- dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
-\r
- if (x < cols && y < rows) \r
- { \r
- dst_type dst_elem;\r
- dst_elem.x = src0_y[x];\r
- dst_elem.y = src1_y[x];\r
- dst_y[x] = dst_elem;\r
- }\r
-}\r
-\r
-\r
-template <typename T>\r
-__global__ void mergeC3_(const uchar* src0, size_t src0_step, \r
- const uchar* src1, size_t src1_step, \r
- const uchar* src2, size_t src2_step, \r
- int rows, int cols, uchar* dst, size_t dst_step)\r
-{\r
- typedef typename TypeTraits<T>::type3 dst_type;\r
-\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- const T* src0_y = (const T*)(src0 + y * src0_step);\r
- const T* src1_y = (const T*)(src1 + y * src1_step);\r
- const T* src2_y = (const T*)(src2 + y * src2_step);\r
- dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
-\r
- if (x < cols && y < rows) \r
- { \r
- dst_type dst_elem;\r
- dst_elem.x = src0_y[x];\r
- dst_elem.y = src1_y[x];\r
- dst_elem.z = src2_y[x];\r
- dst_y[x] = dst_elem;\r
- }\r
-}\r
-\r
-\r
-template <>\r
-__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step, \r
- const uchar* src1, size_t src1_step, \r
- const uchar* src2, size_t src2_step, \r
- int rows, int cols, uchar* dst, size_t dst_step)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- const double* src0_y = (const double*)(src0 + y * src0_step);\r
- const double* src1_y = (const double*)(src1 + y * src1_step);\r
- const double* src2_y = (const double*)(src2 + y * src2_step);\r
- double* dst_y = (double*)(dst + y * dst_step);\r
-\r
- if (x < cols && y < rows) \r
- { \r
- dst_y[3 * x] = src0_y[x];\r
- dst_y[3 * x + 1] = src1_y[x];\r
- dst_y[3 * x + 2] = src2_y[x];\r
- }\r
-}\r
-\r
-\r
-template <typename T>\r
-__global__ void mergeC4_(const uchar* src0, size_t src0_step, \r
- const uchar* src1, size_t src1_step, \r
- const uchar* src2, size_t src2_step, \r
- const uchar* src3, size_t src3_step, \r
- int rows, int cols, uchar* dst, size_t dst_step)\r
-{\r
- typedef typename TypeTraits<T>::type4 dst_type;\r
-\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- const T* src0_y = (const T*)(src0 + y * src0_step);\r
- const T* src1_y = (const T*)(src1 + y * src1_step);\r
- const T* src2_y = (const T*)(src2 + y * src2_step);\r
- const T* src3_y = (const T*)(src3 + y * src3_step);\r
- dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
-\r
- if (x < cols && y < rows) \r
- { \r
- dst_type dst_elem;\r
- dst_elem.x = src0_y[x];\r
- dst_elem.y = src1_y[x];\r
- dst_elem.z = src2_y[x];\r
- dst_elem.w = src3_y[x];\r
- dst_y[x] = dst_elem;\r
- }\r
-}\r
-\r
-\r
-template <>\r
-__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step, \r
- const uchar* src1, size_t src1_step, \r
- const uchar* src2, size_t src2_step, \r
- const uchar* src3, size_t src3_step, \r
- int rows, int cols, uchar* dst, size_t dst_step)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- const double* src0_y = (const double*)(src0 + y * src0_step);\r
- const double* src1_y = (const double*)(src1 + y * src1_step);\r
- const double* src2_y = (const double*)(src2 + y * src2_step);\r
- const double* src3_y = (const double*)(src3 + y * src3_step);\r
- double2* dst_y = (double2*)(dst + y * dst_step);\r
-\r
- if (x < cols && y < rows) \r
- { \r
- dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);\r
- dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);\r
- }\r
-}\r
-\r
-\r
-template <typename T>\r
-static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
-{\r
- dim3 blockDim(32, 8);\r
- dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
- mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(\r
- src[0].data, src[0].step,\r
- src[1].data, src[1].step,\r
- dst.rows, dst.cols, dst.data, dst.step);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall(cudaDeviceSynchronize());\r
-}\r
-\r
-\r
-template <typename T>\r
-static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
-{\r
- dim3 blockDim(32, 8);\r
- dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
- mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(\r
- src[0].data, src[0].step,\r
- src[1].data, src[1].step,\r
- src[2].data, src[2].step,\r
- dst.rows, dst.cols, dst.data, dst.step);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall(cudaDeviceSynchronize());\r
-}\r
-\r
-\r
-template <typename T>\r
-static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
-{\r
- dim3 blockDim(32, 8);\r
- dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
- mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(\r
- src[0].data, src[0].step,\r
- src[1].data, src[1].step,\r
- src[2].data, src[2].step,\r
- src[3].data, src[3].step,\r
- dst.rows, dst.cols, dst.data, dst.step);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall(cudaDeviceSynchronize());\r
-}\r
-\r
-\r
-void merge_caller(const DevMem2Db* src, DevMem2Db& dst,\r
- int total_channels, size_t elem_size,\r
- const cudaStream_t& stream)\r
-{\r
- static MergeFunction merge_func_tbl[] =\r
- {\r
- mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,\r
- mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,\r
- mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,\r
- };\r
-\r
- size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);\r
- MergeFunction merge_func = merge_func_tbl[merge_func_id];\r
-\r
- if (merge_func == 0)\r
- cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);\r
-\r
- merge_func(src, dst, stream);\r
-}\r
-\r
-\r
-\r
-//------------------------------------------------------------\r
-// Split\r
-\r
-\r
-template <typename T>\r
-__global__ void splitC2_(const uchar* src, size_t src_step, \r
- int rows, int cols,\r
- uchar* dst0, size_t dst0_step,\r
- uchar* dst1, size_t dst1_step)\r
-{\r
- typedef typename TypeTraits<T>::type2 src_type;\r
-\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- const src_type* src_y = (const src_type*)(src + y * src_step);\r
- T* dst0_y = (T*)(dst0 + y * dst0_step);\r
- T* dst1_y = (T*)(dst1 + y * dst1_step);\r
-\r
- if (x < cols && y < rows) \r
- {\r
- src_type src_elem = src_y[x];\r
- dst0_y[x] = src_elem.x;\r
- dst1_y[x] = src_elem.y;\r
- }\r
-}\r
-\r
-\r
-template <typename T>\r
-__global__ void splitC3_(const uchar* src, size_t src_step, \r
- int rows, int cols,\r
- uchar* dst0, size_t dst0_step,\r
- uchar* dst1, size_t dst1_step,\r
- uchar* dst2, size_t dst2_step)\r
+namespace cv { namespace gpu { namespace device \r
{\r
- typedef typename TypeTraits<T>::type3 src_type;\r
-\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- const src_type* src_y = (const src_type*)(src + y * src_step);\r
- T* dst0_y = (T*)(dst0 + y * dst0_step);\r
- T* dst1_y = (T*)(dst1 + y * dst1_step);\r
- T* dst2_y = (T*)(dst2 + y * dst2_step);\r
-\r
- if (x < cols && y < rows) \r
+ namespace split_merge \r
{\r
- src_type src_elem = src_y[x];\r
- dst0_y[x] = src_elem.x;\r
- dst1_y[x] = src_elem.y;\r
- dst2_y[x] = src_elem.z;\r
- }\r
-}\r
-\r
-\r
-template <>\r
-__global__ void splitC3_<double>(\r
- const uchar* src, size_t src_step, int rows, int cols,\r
- uchar* dst0, size_t dst0_step,\r
- uchar* dst1, size_t dst1_step,\r
- uchar* dst2, size_t dst2_step)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- const double* src_y = (const double*)(src + y * src_step);\r
- double* dst0_y = (double*)(dst0 + y * dst0_step);\r
- double* dst1_y = (double*)(dst1 + y * dst1_step);\r
- double* dst2_y = (double*)(dst2 + y * dst2_step);\r
-\r
- if (x < cols && y < rows) \r
- {\r
- dst0_y[x] = src_y[3 * x];\r
- dst1_y[x] = src_y[3 * x + 1];\r
- dst2_y[x] = src_y[3 * x + 2];\r
- }\r
-}\r
-\r
-\r
-template <typename T>\r
-__global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,\r
- uchar* dst0, size_t dst0_step,\r
- uchar* dst1, size_t dst1_step,\r
- uchar* dst2, size_t dst2_step,\r
- uchar* dst3, size_t dst3_step)\r
-{\r
- typedef typename TypeTraits<T>::type4 src_type;\r
-\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- const src_type* src_y = (const src_type*)(src + y * src_step);\r
- T* dst0_y = (T*)(dst0 + y * dst0_step);\r
- T* dst1_y = (T*)(dst1 + y * dst1_step);\r
- T* dst2_y = (T*)(dst2 + y * dst2_step);\r
- T* dst3_y = (T*)(dst3 + y * dst3_step);\r
-\r
- if (x < cols && y < rows) \r
- {\r
- src_type src_elem = src_y[x];\r
- dst0_y[x] = src_elem.x;\r
- dst1_y[x] = src_elem.y;\r
- dst2_y[x] = src_elem.z;\r
- dst3_y[x] = src_elem.w;\r
- }\r
-}\r
-\r
-\r
-template <>\r
-__global__ void splitC4_<double>(\r
- const uchar* src, size_t src_step, int rows, int cols,\r
- uchar* dst0, size_t dst0_step,\r
- uchar* dst1, size_t dst1_step,\r
- uchar* dst2, size_t dst2_step,\r
- uchar* dst3, size_t dst3_step)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- const double2* src_y = (const double2*)(src + y * src_step);\r
- double* dst0_y = (double*)(dst0 + y * dst0_step);\r
- double* dst1_y = (double*)(dst1 + y * dst1_step);\r
- double* dst2_y = (double*)(dst2 + y * dst2_step);\r
- double* dst3_y = (double*)(dst3 + y * dst3_step);\r
-\r
- if (x < cols && y < rows) \r
- {\r
- double2 src_elem1 = src_y[2 * x];\r
- double2 src_elem2 = src_y[2 * x + 1];\r
- dst0_y[x] = src_elem1.x;\r
- dst1_y[x] = src_elem1.y;\r
- dst2_y[x] = src_elem2.x;\r
- dst3_y[x] = src_elem2.y;\r
- }\r
-}\r
-\r
-template <typename T>\r
-static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
-{\r
- dim3 blockDim(32, 8);\r
- dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
- splitC2_<T><<<gridDim, blockDim, 0, stream>>>(\r
- src.data, src.step, src.rows, src.cols,\r
- dst[0].data, dst[0].step,\r
- dst[1].data, dst[1].step);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall(cudaDeviceSynchronize());\r
-}\r
-\r
-\r
-template <typename T>\r
-static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
-{\r
- dim3 blockDim(32, 8);\r
- dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
- splitC3_<T><<<gridDim, blockDim, 0, stream>>>(\r
- src.data, src.step, src.rows, src.cols,\r
- dst[0].data, dst[0].step,\r
- dst[1].data, dst[1].step,\r
- dst[2].data, dst[2].step);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall(cudaDeviceSynchronize());\r
-}\r
-\r
-\r
-template <typename T>\r
-static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
-{\r
- dim3 blockDim(32, 8);\r
- dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
- splitC4_<T><<<gridDim, blockDim, 0, stream>>>(\r
- src.data, src.step, src.rows, src.cols,\r
- dst[0].data, dst[0].step,\r
- dst[1].data, dst[1].step,\r
- dst[2].data, dst[2].step,\r
- dst[3].data, dst[3].step);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall(cudaDeviceSynchronize());\r
-}\r
-\r
-\r
-void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)\r
-{\r
- static SplitFunction split_func_tbl[] =\r
- {\r
- splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,\r
- splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,\r
- splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,\r
- };\r
-\r
- size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);\r
- SplitFunction split_func = split_func_tbl[split_func_id];\r
-\r
- if (split_func == 0)\r
- cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);\r
-\r
- split_func(src, dst, stream);\r
-}\r
-\r
-} // namespace split_merge\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template <typename T, size_t elem_size = sizeof(T)>\r
+ struct TypeTraits \r
+ {\r
+ typedef T type;\r
+ typedef T type2;\r
+ typedef T type3;\r
+ typedef T type4;\r
+ };\r
+\r
+ template <typename T>\r
+ struct TypeTraits<T, 1>\r
+ {\r
+ typedef char type;\r
+ typedef char2 type2;\r
+ typedef char3 type3;\r
+ typedef char4 type4;\r
+ };\r
+\r
+ template <typename T>\r
+ struct TypeTraits<T, 2>\r
+ {\r
+ typedef short type;\r
+ typedef short2 type2;\r
+ typedef short3 type3;\r
+ typedef short4 type4;\r
+ };\r
+\r
+ template <typename T>\r
+ struct TypeTraits<T, 4> \r
+ {\r
+ typedef int type;\r
+ typedef int2 type2;\r
+ typedef int3 type3;\r
+ typedef int4 type4;\r
+ };\r
+\r
+ template <typename T>\r
+ struct TypeTraits<T, 8> \r
+ {\r
+ typedef double type;\r
+ typedef double2 type2;\r
+ //typedef double3 type3;\r
+ //typedef double4 type3;\r
+ };\r
+\r
+ typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);\r
+ typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);\r
+\r
+ //------------------------------------------------------------\r
+ // Merge \r
+\r
+ template <typename T>\r
+ __global__ void mergeC2_(const uchar* src0, size_t src0_step, \r
+ const uchar* src1, size_t src1_step, \r
+ int rows, int cols, uchar* dst, size_t dst_step)\r
+ {\r
+ typedef typename TypeTraits<T>::type2 dst_type;\r
+\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const T* src0_y = (const T*)(src0 + y * src0_step);\r
+ const T* src1_y = (const T*)(src1 + y * src1_step);\r
+ dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
+\r
+ if (x < cols && y < rows) \r
+ { \r
+ dst_type dst_elem;\r
+ dst_elem.x = src0_y[x];\r
+ dst_elem.y = src1_y[x];\r
+ dst_y[x] = dst_elem;\r
+ }\r
+ }\r
+\r
+\r
+ template <typename T>\r
+ __global__ void mergeC3_(const uchar* src0, size_t src0_step, \r
+ const uchar* src1, size_t src1_step, \r
+ const uchar* src2, size_t src2_step, \r
+ int rows, int cols, uchar* dst, size_t dst_step)\r
+ {\r
+ typedef typename TypeTraits<T>::type3 dst_type;\r
+\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const T* src0_y = (const T*)(src0 + y * src0_step);\r
+ const T* src1_y = (const T*)(src1 + y * src1_step);\r
+ const T* src2_y = (const T*)(src2 + y * src2_step);\r
+ dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
+\r
+ if (x < cols && y < rows) \r
+ { \r
+ dst_type dst_elem;\r
+ dst_elem.x = src0_y[x];\r
+ dst_elem.y = src1_y[x];\r
+ dst_elem.z = src2_y[x];\r
+ dst_y[x] = dst_elem;\r
+ }\r
+ }\r
+\r
+\r
+ template <>\r
+ __global__ void mergeC3_<double>(const uchar* src0, size_t src0_step, \r
+ const uchar* src1, size_t src1_step, \r
+ const uchar* src2, size_t src2_step, \r
+ int rows, int cols, uchar* dst, size_t dst_step)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const double* src0_y = (const double*)(src0 + y * src0_step);\r
+ const double* src1_y = (const double*)(src1 + y * src1_step);\r
+ const double* src2_y = (const double*)(src2 + y * src2_step);\r
+ double* dst_y = (double*)(dst + y * dst_step);\r
+\r
+ if (x < cols && y < rows) \r
+ { \r
+ dst_y[3 * x] = src0_y[x];\r
+ dst_y[3 * x + 1] = src1_y[x];\r
+ dst_y[3 * x + 2] = src2_y[x];\r
+ }\r
+ }\r
+\r
+\r
+ template <typename T>\r
+ __global__ void mergeC4_(const uchar* src0, size_t src0_step, \r
+ const uchar* src1, size_t src1_step, \r
+ const uchar* src2, size_t src2_step, \r
+ const uchar* src3, size_t src3_step, \r
+ int rows, int cols, uchar* dst, size_t dst_step)\r
+ {\r
+ typedef typename TypeTraits<T>::type4 dst_type;\r
+\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const T* src0_y = (const T*)(src0 + y * src0_step);\r
+ const T* src1_y = (const T*)(src1 + y * src1_step);\r
+ const T* src2_y = (const T*)(src2 + y * src2_step);\r
+ const T* src3_y = (const T*)(src3 + y * src3_step);\r
+ dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
+\r
+ if (x < cols && y < rows) \r
+ { \r
+ dst_type dst_elem;\r
+ dst_elem.x = src0_y[x];\r
+ dst_elem.y = src1_y[x];\r
+ dst_elem.z = src2_y[x];\r
+ dst_elem.w = src3_y[x];\r
+ dst_y[x] = dst_elem;\r
+ }\r
+ }\r
+\r
+\r
+ template <>\r
+ __global__ void mergeC4_<double>(const uchar* src0, size_t src0_step, \r
+ const uchar* src1, size_t src1_step, \r
+ const uchar* src2, size_t src2_step, \r
+ const uchar* src3, size_t src3_step, \r
+ int rows, int cols, uchar* dst, size_t dst_step)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const double* src0_y = (const double*)(src0 + y * src0_step);\r
+ const double* src1_y = (const double*)(src1 + y * src1_step);\r
+ const double* src2_y = (const double*)(src2 + y * src2_step);\r
+ const double* src3_y = (const double*)(src3 + y * src3_step);\r
+ double2* dst_y = (double2*)(dst + y * dst_step);\r
+\r
+ if (x < cols && y < rows) \r
+ { \r
+ dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);\r
+ dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);\r
+ }\r
+ }\r
+\r
+\r
+ template <typename T>\r
+ static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
+ {\r
+ dim3 blockDim(32, 8);\r
+ dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
+ mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ src[0].data, src[0].step,\r
+ src[1].data, src[1].step,\r
+ dst.rows, dst.cols, dst.data, dst.step);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaDeviceSynchronize());\r
+ }\r
+\r
+\r
+ template <typename T>\r
+ static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
+ {\r
+ dim3 blockDim(32, 8);\r
+ dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
+ mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ src[0].data, src[0].step,\r
+ src[1].data, src[1].step,\r
+ src[2].data, src[2].step,\r
+ dst.rows, dst.cols, dst.data, dst.step);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaDeviceSynchronize());\r
+ }\r
+\r
+\r
+ template <typename T>\r
+ static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
+ {\r
+ dim3 blockDim(32, 8);\r
+ dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
+ mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ src[0].data, src[0].step,\r
+ src[1].data, src[1].step,\r
+ src[2].data, src[2].step,\r
+ src[3].data, src[3].step,\r
+ dst.rows, dst.cols, dst.data, dst.step);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaDeviceSynchronize());\r
+ }\r
+\r
+\r
+ void merge_caller(const DevMem2Db* src, DevMem2Db& dst,\r
+ int total_channels, size_t elem_size,\r
+ const cudaStream_t& stream)\r
+ {\r
+ static MergeFunction merge_func_tbl[] =\r
+ {\r
+ mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,\r
+ mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,\r
+ mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,\r
+ };\r
+\r
+ size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);\r
+ MergeFunction merge_func = merge_func_tbl[merge_func_id];\r
+\r
+ if (merge_func == 0)\r
+ cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);\r
+\r
+ merge_func(src, dst, stream);\r
+ }\r
+\r
+\r
+\r
+ //------------------------------------------------------------\r
+ // Split\r
+\r
+\r
+ template <typename T>\r
+ __global__ void splitC2_(const uchar* src, size_t src_step, \r
+ int rows, int cols,\r
+ uchar* dst0, size_t dst0_step,\r
+ uchar* dst1, size_t dst1_step)\r
+ {\r
+ typedef typename TypeTraits<T>::type2 src_type;\r
+\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const src_type* src_y = (const src_type*)(src + y * src_step);\r
+ T* dst0_y = (T*)(dst0 + y * dst0_step);\r
+ T* dst1_y = (T*)(dst1 + y * dst1_step);\r
+\r
+ if (x < cols && y < rows) \r
+ {\r
+ src_type src_elem = src_y[x];\r
+ dst0_y[x] = src_elem.x;\r
+ dst1_y[x] = src_elem.y;\r
+ }\r
+ }\r
+\r
+\r
+ template <typename T>\r
+ __global__ void splitC3_(const uchar* src, size_t src_step, \r
+ int rows, int cols,\r
+ uchar* dst0, size_t dst0_step,\r
+ uchar* dst1, size_t dst1_step,\r
+ uchar* dst2, size_t dst2_step)\r
+ {\r
+ typedef typename TypeTraits<T>::type3 src_type;\r
+\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const src_type* src_y = (const src_type*)(src + y * src_step);\r
+ T* dst0_y = (T*)(dst0 + y * dst0_step);\r
+ T* dst1_y = (T*)(dst1 + y * dst1_step);\r
+ T* dst2_y = (T*)(dst2 + y * dst2_step);\r
+\r
+ if (x < cols && y < rows) \r
+ {\r
+ src_type src_elem = src_y[x];\r
+ dst0_y[x] = src_elem.x;\r
+ dst1_y[x] = src_elem.y;\r
+ dst2_y[x] = src_elem.z;\r
+ }\r
+ }\r
+\r
+\r
+ template <>\r
+ __global__ void splitC3_<double>(\r
+ const uchar* src, size_t src_step, int rows, int cols,\r
+ uchar* dst0, size_t dst0_step,\r
+ uchar* dst1, size_t dst1_step,\r
+ uchar* dst2, size_t dst2_step)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const double* src_y = (const double*)(src + y * src_step);\r
+ double* dst0_y = (double*)(dst0 + y * dst0_step);\r
+ double* dst1_y = (double*)(dst1 + y * dst1_step);\r
+ double* dst2_y = (double*)(dst2 + y * dst2_step);\r
+\r
+ if (x < cols && y < rows) \r
+ {\r
+ dst0_y[x] = src_y[3 * x];\r
+ dst1_y[x] = src_y[3 * x + 1];\r
+ dst2_y[x] = src_y[3 * x + 2];\r
+ }\r
+ }\r
+\r
+\r
+ template <typename T>\r
+ __global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,\r
+ uchar* dst0, size_t dst0_step,\r
+ uchar* dst1, size_t dst1_step,\r
+ uchar* dst2, size_t dst2_step,\r
+ uchar* dst3, size_t dst3_step)\r
+ {\r
+ typedef typename TypeTraits<T>::type4 src_type;\r
+\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const src_type* src_y = (const src_type*)(src + y * src_step);\r
+ T* dst0_y = (T*)(dst0 + y * dst0_step);\r
+ T* dst1_y = (T*)(dst1 + y * dst1_step);\r
+ T* dst2_y = (T*)(dst2 + y * dst2_step);\r
+ T* dst3_y = (T*)(dst3 + y * dst3_step);\r
+\r
+ if (x < cols && y < rows) \r
+ {\r
+ src_type src_elem = src_y[x];\r
+ dst0_y[x] = src_elem.x;\r
+ dst1_y[x] = src_elem.y;\r
+ dst2_y[x] = src_elem.z;\r
+ dst3_y[x] = src_elem.w;\r
+ }\r
+ }\r
+\r
+\r
+ template <>\r
+ __global__ void splitC4_<double>(\r
+ const uchar* src, size_t src_step, int rows, int cols,\r
+ uchar* dst0, size_t dst0_step,\r
+ uchar* dst1, size_t dst1_step,\r
+ uchar* dst2, size_t dst2_step,\r
+ uchar* dst3, size_t dst3_step)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ const double2* src_y = (const double2*)(src + y * src_step);\r
+ double* dst0_y = (double*)(dst0 + y * dst0_step);\r
+ double* dst1_y = (double*)(dst1 + y * dst1_step);\r
+ double* dst2_y = (double*)(dst2 + y * dst2_step);\r
+ double* dst3_y = (double*)(dst3 + y * dst3_step);\r
+\r
+ if (x < cols && y < rows) \r
+ {\r
+ double2 src_elem1 = src_y[2 * x];\r
+ double2 src_elem2 = src_y[2 * x + 1];\r
+ dst0_y[x] = src_elem1.x;\r
+ dst1_y[x] = src_elem1.y;\r
+ dst2_y[x] = src_elem2.x;\r
+ dst3_y[x] = src_elem2.y;\r
+ }\r
+ }\r
+\r
+ template <typename T>\r
+ static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
+ {\r
+ dim3 blockDim(32, 8);\r
+ dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
+ splitC2_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ src.data, src.step, src.rows, src.cols,\r
+ dst[0].data, dst[0].step,\r
+ dst[1].data, dst[1].step);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaDeviceSynchronize());\r
+ }\r
+\r
+\r
+ template <typename T>\r
+ static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
+ {\r
+ dim3 blockDim(32, 8);\r
+ dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
+ splitC3_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ src.data, src.step, src.rows, src.cols,\r
+ dst[0].data, dst[0].step,\r
+ dst[1].data, dst[1].step,\r
+ dst[2].data, dst[2].step);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaDeviceSynchronize());\r
+ }\r
+\r
+\r
+ template <typename T>\r
+ static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
+ {\r
+ dim3 blockDim(32, 8);\r
+ dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
+ splitC4_<T><<<gridDim, blockDim, 0, stream>>>(\r
+ src.data, src.step, src.rows, src.cols,\r
+ dst[0].data, dst[0].step,\r
+ dst[1].data, dst[1].step,\r
+ dst[2].data, dst[2].step,\r
+ dst[3].data, dst[3].step);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaDeviceSynchronize());\r
+ }\r
+\r
+\r
+ void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)\r
+ {\r
+ static SplitFunction split_func_tbl[] =\r
+ {\r
+ splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,\r
+ splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,\r
+ splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,\r
+ };\r
+\r
+ size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);\r
+ SplitFunction split_func = split_func_tbl[split_func_id];\r
+\r
+ if (split_func == 0)\r
+ cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);\r
+\r
+ split_func(src, dst, stream);\r
+ }\r
+ } // namespace split_merge\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#include "internal_shared.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace stereobm {\r
-\r
-//////////////////////////////////////////////////////////////////////////////////////////////////\r
-/////////////////////////////////////// Stereo BM ////////////////////////////////////////////////\r
-//////////////////////////////////////////////////////////////////////////////////////////////////\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ namespace stereobm \r
+ {\r
+ //////////////////////////////////////////////////////////////////////////////////////////////////\r
+ /////////////////////////////////////// Stereo BM ////////////////////////////////////////////////\r
+ //////////////////////////////////////////////////////////////////////////////////////////////////\r
\r
-#define ROWSperTHREAD 21 // the number of rows a thread will process\r
+ #define ROWSperTHREAD 21 // the number of rows a thread will process\r
\r
-#define BLOCK_W 128 // the thread block width (464)\r
-#define N_DISPARITIES 8\r
+ #define BLOCK_W 128 // the thread block width (464)\r
+ #define N_DISPARITIES 8\r
\r
-#define STEREO_MIND 0 // The minimum d range to check\r
-#define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing\r
+ #define STEREO_MIND 0 // The minimum d range to check\r
+ #define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing\r
\r
-__constant__ unsigned int* cminSSDImage;\r
-__constant__ size_t cminSSD_step;\r
-__constant__ int cwidth;\r
-__constant__ int cheight;\r
+ __constant__ unsigned int* cminSSDImage;\r
+ __constant__ size_t cminSSD_step;\r
+ __constant__ int cwidth;\r
+ __constant__ int cheight;\r
\r
-__device__ __forceinline__ int SQ(int a)\r
-{\r
- return a * a;\r
-}\r
+ __device__ __forceinline__ int SQ(int a)\r
+ {\r
+ return a * a;\r
+ }\r
\r
-template<int RADIUS>\r
-__device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)\r
-{ \r
- unsigned int cache = 0;\r
- unsigned int cache2 = 0;\r
+ template<int RADIUS>\r
+ __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)\r
+ { \r
+ unsigned int cache = 0;\r
+ unsigned int cache2 = 0;\r
\r
- for(int i = 1; i <= RADIUS; i++)\r
- cache += col_ssd[i];\r
+ for(int i = 1; i <= RADIUS; i++)\r
+ cache += col_ssd[i];\r
\r
- col_ssd_cache[0] = cache;\r
+ col_ssd_cache[0] = cache;\r
\r
- __syncthreads();\r
+ __syncthreads();\r
\r
- if (threadIdx.x < BLOCK_W - RADIUS)\r
- cache2 = col_ssd_cache[RADIUS];\r
- else\r
- for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++)\r
- cache2 += col_ssd[i];\r
+ if (threadIdx.x < BLOCK_W - RADIUS)\r
+ cache2 = col_ssd_cache[RADIUS];\r
+ else\r
+ for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++)\r
+ cache2 += col_ssd[i];\r
\r
- return col_ssd[0] + cache + cache2;\r
-}\r
+ return col_ssd[0] + cache + cache2;\r
+ }\r
\r
-template<int RADIUS>\r
-__device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)\r
-{\r
- unsigned int ssd[N_DISPARITIES];\r
-\r
- //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)\r
- ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));\r
- __syncthreads();\r
- ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));\r
- __syncthreads();\r
- ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));\r
- __syncthreads();\r
- ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));\r
- __syncthreads();\r
- ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));\r
- __syncthreads();\r
- ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));\r
- __syncthreads();\r
- ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));\r
- __syncthreads();\r
- ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));\r
-\r
- int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));\r
-\r
- int bestIdx = 0;\r
- for (int i = 0; i < N_DISPARITIES; i++)\r
- {\r
- if (mssd == ssd[i])\r
- bestIdx = i;\r
- }\r
+ template<int RADIUS>\r
+ __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)\r
+ {\r
+ unsigned int ssd[N_DISPARITIES];\r
\r
- return make_uint2(mssd, bestIdx);\r
-}\r
+ //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)\r
+ ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));\r
+ __syncthreads();\r
+ ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));\r
+ __syncthreads();\r
+ ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));\r
+ __syncthreads();\r
+ ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));\r
+ __syncthreads();\r
+ ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));\r
+ __syncthreads();\r
+ ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));\r
+ __syncthreads();\r
+ ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));\r
+ __syncthreads();\r
+ ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));\r
\r
-template<int RADIUS>\r
-__device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)\r
-{\r
- unsigned char leftPixel1;\r
- unsigned char leftPixel2;\r
- unsigned char rightPixel1[8];\r
- unsigned char rightPixel2[8];\r
- unsigned int diff1, diff2;\r
-\r
- leftPixel1 = imageL[idx1];\r
- leftPixel2 = imageL[idx2];\r
-\r
- idx1 = idx1 - d;\r
- idx2 = idx2 - d;\r
-\r
- rightPixel1[7] = imageR[idx1 - 7];\r
- rightPixel1[0] = imageR[idx1 - 0];\r
- rightPixel1[1] = imageR[idx1 - 1];\r
- rightPixel1[2] = imageR[idx1 - 2];\r
- rightPixel1[3] = imageR[idx1 - 3];\r
- rightPixel1[4] = imageR[idx1 - 4];\r
- rightPixel1[5] = imageR[idx1 - 5];\r
- rightPixel1[6] = imageR[idx1 - 6];\r
-\r
- rightPixel2[7] = imageR[idx2 - 7];\r
- rightPixel2[0] = imageR[idx2 - 0];\r
- rightPixel2[1] = imageR[idx2 - 1];\r
- rightPixel2[2] = imageR[idx2 - 2];\r
- rightPixel2[3] = imageR[idx2 - 3];\r
- rightPixel2[4] = imageR[idx2 - 4];\r
- rightPixel2[5] = imageR[idx2 - 5];\r
- rightPixel2[6] = imageR[idx2 - 6];\r
-\r
- //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)\r
- diff1 = leftPixel1 - rightPixel1[0];\r
- diff2 = leftPixel2 - rightPixel2[0];\r
- col_ssd[0 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);\r
-\r
- diff1 = leftPixel1 - rightPixel1[1];\r
- diff2 = leftPixel2 - rightPixel2[1];\r
- col_ssd[1 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);\r
-\r
- diff1 = leftPixel1 - rightPixel1[2];\r
- diff2 = leftPixel2 - rightPixel2[2];\r
- col_ssd[2 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);\r
-\r
- diff1 = leftPixel1 - rightPixel1[3];\r
- diff2 = leftPixel2 - rightPixel2[3];\r
- col_ssd[3 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);\r
-\r
- diff1 = leftPixel1 - rightPixel1[4];\r
- diff2 = leftPixel2 - rightPixel2[4];\r
- col_ssd[4 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);\r
-\r
- diff1 = leftPixel1 - rightPixel1[5];\r
- diff2 = leftPixel2 - rightPixel2[5];\r
- col_ssd[5 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);\r
-\r
- diff1 = leftPixel1 - rightPixel1[6];\r
- diff2 = leftPixel2 - rightPixel2[6];\r
- col_ssd[6 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);\r
-\r
- diff1 = leftPixel1 - rightPixel1[7];\r
- diff2 = leftPixel2 - rightPixel2[7];\r
- col_ssd[7 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);\r
-}\r
-\r
-template<int RADIUS>\r
-__device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)\r
-{\r
- unsigned char leftPixel1;\r
- int idx;\r
- unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0};\r
+ int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));\r
\r
- for(int i = 0; i < (2 * RADIUS + 1); i++)\r
- {\r
- idx = y_tex * im_pitch + x_tex;\r
- leftPixel1 = imageL[idx];\r
- idx = idx - d;\r
-\r
- diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);\r
- diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);\r
- diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);\r
- diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);\r
- diffa[4] += SQ(leftPixel1 - imageR[idx - 4]);\r
- diffa[5] += SQ(leftPixel1 - imageR[idx - 5]);\r
- diffa[6] += SQ(leftPixel1 - imageR[idx - 6]);\r
- diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);\r
-\r
- y_tex += 1;\r
- }\r
- //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)\r
- col_ssd[0 * (BLOCK_W + 2 * RADIUS)] = diffa[0];\r
- col_ssd[1 * (BLOCK_W + 2 * RADIUS)] = diffa[1];\r
- col_ssd[2 * (BLOCK_W + 2 * RADIUS)] = diffa[2];\r
- col_ssd[3 * (BLOCK_W + 2 * RADIUS)] = diffa[3];\r
- col_ssd[4 * (BLOCK_W + 2 * RADIUS)] = diffa[4];\r
- col_ssd[5 * (BLOCK_W + 2 * RADIUS)] = diffa[5];\r
- col_ssd[6 * (BLOCK_W + 2 * RADIUS)] = diffa[6];\r
- col_ssd[7 * (BLOCK_W + 2 * RADIUS)] = diffa[7];\r
-}\r
-\r
-template<int RADIUS>\r
-__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp)\r
-{\r
- extern __shared__ unsigned int col_ssd_cache[];\r
- volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;\r
- volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; //#define N_DIRTY_PIXELS (2 * RADIUS)\r
-\r
- //#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)\r
- int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);\r
- //#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS)\r
- #define Y (blockIdx.y * ROWSperTHREAD + RADIUS)\r
- //int Y = blockIdx.y * ROWSperTHREAD + RADIUS;\r
-\r
- unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;\r
- unsigned char* disparImage = disp.data + X + Y * disp.step;\r
- /* if (X < cwidth)\r
- {\r
- unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;\r
- for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )\r
- *ptr = 0xFFFFFFFF;\r
- }*/\r
- int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);\r
- int y_tex;\r
- int x_tex = X - RADIUS;\r
-\r
- if (x_tex >= cwidth)\r
- return;\r
-\r
- for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)\r
- {\r
- y_tex = Y - RADIUS;\r
-\r
- InitColSSD<RADIUS>(x_tex, y_tex, img_step, left, right, d, col_ssd);\r
+ int bestIdx = 0;\r
+ for (int i = 0; i < N_DISPARITIES; i++)\r
+ {\r
+ if (mssd == ssd[i])\r
+ bestIdx = i;\r
+ }\r
\r
- if (col_ssd_extra > 0)\r
- if (x_tex + BLOCK_W < cwidth)\r
- InitColSSD<RADIUS>(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);\r
+ return make_uint2(mssd, bestIdx);\r
+ }\r
\r
- __syncthreads(); //before MinSSD function\r
+ template<int RADIUS>\r
+ __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)\r
+ {\r
+ unsigned char leftPixel1;\r
+ unsigned char leftPixel2;\r
+ unsigned char rightPixel1[8];\r
+ unsigned char rightPixel2[8];\r
+ unsigned int diff1, diff2;\r
+\r
+ leftPixel1 = imageL[idx1];\r
+ leftPixel2 = imageL[idx2];\r
+\r
+ idx1 = idx1 - d;\r
+ idx2 = idx2 - d;\r
+\r
+ rightPixel1[7] = imageR[idx1 - 7];\r
+ rightPixel1[0] = imageR[idx1 - 0];\r
+ rightPixel1[1] = imageR[idx1 - 1];\r
+ rightPixel1[2] = imageR[idx1 - 2];\r
+ rightPixel1[3] = imageR[idx1 - 3];\r
+ rightPixel1[4] = imageR[idx1 - 4];\r
+ rightPixel1[5] = imageR[idx1 - 5];\r
+ rightPixel1[6] = imageR[idx1 - 6];\r
+\r
+ rightPixel2[7] = imageR[idx2 - 7];\r
+ rightPixel2[0] = imageR[idx2 - 0];\r
+ rightPixel2[1] = imageR[idx2 - 1];\r
+ rightPixel2[2] = imageR[idx2 - 2];\r
+ rightPixel2[3] = imageR[idx2 - 3];\r
+ rightPixel2[4] = imageR[idx2 - 4];\r
+ rightPixel2[5] = imageR[idx2 - 5];\r
+ rightPixel2[6] = imageR[idx2 - 6];\r
+\r
+ //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)\r
+ diff1 = leftPixel1 - rightPixel1[0];\r
+ diff2 = leftPixel2 - rightPixel2[0];\r
+ col_ssd[0 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);\r
+\r
+ diff1 = leftPixel1 - rightPixel1[1];\r
+ diff2 = leftPixel2 - rightPixel2[1];\r
+ col_ssd[1 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);\r
+\r
+ diff1 = leftPixel1 - rightPixel1[2];\r
+ diff2 = leftPixel2 - rightPixel2[2];\r
+ col_ssd[2 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);\r
+\r
+ diff1 = leftPixel1 - rightPixel1[3];\r
+ diff2 = leftPixel2 - rightPixel2[3];\r
+ col_ssd[3 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);\r
+\r
+ diff1 = leftPixel1 - rightPixel1[4];\r
+ diff2 = leftPixel2 - rightPixel2[4];\r
+ col_ssd[4 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);\r
+\r
+ diff1 = leftPixel1 - rightPixel1[5];\r
+ diff2 = leftPixel2 - rightPixel2[5];\r
+ col_ssd[5 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);\r
+\r
+ diff1 = leftPixel1 - rightPixel1[6];\r
+ diff2 = leftPixel2 - rightPixel2[6];\r
+ col_ssd[6 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);\r
+\r
+ diff1 = leftPixel1 - rightPixel1[7];\r
+ diff2 = leftPixel2 - rightPixel2[7];\r
+ col_ssd[7 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);\r
+ }\r
\r
- if (X < cwidth - RADIUS && Y < cheight - RADIUS)\r
+ template<int RADIUS>\r
+ __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)\r
{\r
- uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);\r
- if (minSSD.x < minSSDImage[0])\r
+ unsigned char leftPixel1;\r
+ int idx;\r
+ unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0};\r
+\r
+ for(int i = 0; i < (2 * RADIUS + 1); i++)\r
{\r
- disparImage[0] = (unsigned char)(d + minSSD.y);\r
- minSSDImage[0] = minSSD.x;\r
+ idx = y_tex * im_pitch + x_tex;\r
+ leftPixel1 = imageL[idx];\r
+ idx = idx - d;\r
+\r
+ diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);\r
+ diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);\r
+ diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);\r
+ diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);\r
+ diffa[4] += SQ(leftPixel1 - imageR[idx - 4]);\r
+ diffa[5] += SQ(leftPixel1 - imageR[idx - 5]);\r
+ diffa[6] += SQ(leftPixel1 - imageR[idx - 6]);\r
+ diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);\r
+\r
+ y_tex += 1;\r
}\r
+ //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)\r
+ col_ssd[0 * (BLOCK_W + 2 * RADIUS)] = diffa[0];\r
+ col_ssd[1 * (BLOCK_W + 2 * RADIUS)] = diffa[1];\r
+ col_ssd[2 * (BLOCK_W + 2 * RADIUS)] = diffa[2];\r
+ col_ssd[3 * (BLOCK_W + 2 * RADIUS)] = diffa[3];\r
+ col_ssd[4 * (BLOCK_W + 2 * RADIUS)] = diffa[4];\r
+ col_ssd[5 * (BLOCK_W + 2 * RADIUS)] = diffa[5];\r
+ col_ssd[6 * (BLOCK_W + 2 * RADIUS)] = diffa[6];\r
+ col_ssd[7 * (BLOCK_W + 2 * RADIUS)] = diffa[7];\r
}\r
\r
- for(int row = 1; row < end_row; row++)\r
+ template<int RADIUS>\r
+ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp)\r
{\r
- int idx1 = y_tex * img_step + x_tex;\r
- int idx2 = (y_tex + (2 * RADIUS + 1)) * img_step + x_tex;\r
-\r
- __syncthreads();\r
-\r
- StepDown<RADIUS>(idx1, idx2, left, right, d, col_ssd);\r
+ extern __shared__ unsigned int col_ssd_cache[];\r
+ volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;\r
+ volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; //#define N_DIRTY_PIXELS (2 * RADIUS)\r
+\r
+ //#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)\r
+ int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);\r
+ //#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS)\r
+ #define Y (blockIdx.y * ROWSperTHREAD + RADIUS)\r
+ //int Y = blockIdx.y * ROWSperTHREAD + RADIUS;\r
+\r
+ unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;\r
+ unsigned char* disparImage = disp.data + X + Y * disp.step;\r
+ /* if (X < cwidth)\r
+ {\r
+ unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;\r
+ for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )\r
+ *ptr = 0xFFFFFFFF;\r
+ }*/\r
+ int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);\r
+ int y_tex;\r
+ int x_tex = X - RADIUS;\r
+\r
+ if (x_tex >= cwidth)\r
+ return;\r
+\r
+ for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)\r
+ {\r
+ y_tex = Y - RADIUS;\r
\r
- if (col_ssd_extra)\r
- if (x_tex + BLOCK_W < cwidth)\r
- StepDown<RADIUS>(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);\r
+ InitColSSD<RADIUS>(x_tex, y_tex, img_step, left, right, d, col_ssd);\r
\r
- y_tex += 1;\r
+ if (col_ssd_extra > 0)\r
+ if (x_tex + BLOCK_W < cwidth)\r
+ InitColSSD<RADIUS>(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);\r
\r
- __syncthreads(); //before MinSSD function\r
+ __syncthreads(); //before MinSSD function\r
\r
- if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)\r
- {\r
- int idx = row * cminSSD_step;\r
- uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);\r
- if (minSSD.x < minSSDImage[idx])\r
+ if (X < cwidth - RADIUS && Y < cheight - RADIUS)\r
{\r
- disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);\r
- minSSDImage[idx] = minSSD.x;\r
+ uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);\r
+ if (minSSD.x < minSSDImage[0])\r
+ {\r
+ disparImage[0] = (unsigned char)(d + minSSD.y);\r
+ minSSDImage[0] = minSSD.x;\r
+ }\r
}\r
- }\r
- } // for row loop\r
- } // for d loop\r
-}\r
\r
+ for(int row = 1; row < end_row; row++)\r
+ {\r
+ int idx1 = y_tex * img_step + x_tex;\r
+ int idx2 = (y_tex + (2 * RADIUS + 1)) * img_step + x_tex;\r
\r
-template<int RADIUS> void kernel_caller(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream)\r
-{\r
- dim3 grid(1,1,1);\r
- dim3 threads(BLOCK_W, 1, 1);\r
+ __syncthreads();\r
\r
- grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);\r
- grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);\r
+ StepDown<RADIUS>(idx1, idx2, left, right, d, col_ssd);\r
\r
- //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)\r
- size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);\r
+ if (col_ssd_extra)\r
+ if (x_tex + BLOCK_W < cwidth)\r
+ StepDown<RADIUS>(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);\r
\r
- stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);\r
- cudaSafeCall( cudaGetLastError() );\r
+ y_tex += 1;\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-};\r
+ __syncthreads(); //before MinSSD function\r
\r
-typedef void (*kernel_caller_t)(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream);\r
+ if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)\r
+ {\r
+ int idx = row * cminSSD_step;\r
+ uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);\r
+ if (minSSD.x < minSSDImage[idx])\r
+ {\r
+ disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);\r
+ minSSDImage[idx] = minSSD.x;\r
+ }\r
+ }\r
+ } // for row loop\r
+ } // for d loop\r
+ }\r
\r
-const static kernel_caller_t callers[] =\r
-{\r
- 0,\r
- kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,\r
- kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,\r
- kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,\r
- kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,\r
- kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>\r
-\r
- //0,0,0, 0,0,0, 0,0,kernel_caller<9>\r
-};\r
-const int calles_num = sizeof(callers)/sizeof(callers[0]);\r
-\r
-void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream)\r
-{\r
- int winsz2 = winsz >> 1;\r
\r
- if (winsz2 == 0 || winsz2 >= calles_num)\r
- cv::gpu::error("Unsupported window size", __FILE__, __LINE__);\r
+ template<int RADIUS> void kernel_caller(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream)\r
+ {\r
+ dim3 grid(1,1,1);\r
+ dim3 threads(BLOCK_W, 1, 1);\r
\r
- //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );\r
- //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );\r
+ grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);\r
+ grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);\r
\r
- cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );\r
- cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );\r
+ //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)\r
+ size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);\r
\r
- cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );\r
- cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );\r
- cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );\r
+ stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();\r
- cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step, &minssd_step, sizeof(minssd_step) ) );\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ };\r
\r
- callers[winsz2](left, right, disp, maxdisp, stream);\r
-}\r
+ typedef void (*kernel_caller_t)(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream);\r
\r
-//////////////////////////////////////////////////////////////////////////////////////////////////\r
-/////////////////////////////////////// Sobel Prefiler ///////////////////////////////////////////\r
-//////////////////////////////////////////////////////////////////////////////////////////////////\r
+ const static kernel_caller_t callers[] =\r
+ {\r
+ 0,\r
+ kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,\r
+ kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,\r
+ kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,\r
+ kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,\r
+ kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>\r
+\r
+ //0,0,0, 0,0,0, 0,0,kernel_caller<9>\r
+ };\r
+ const int calles_num = sizeof(callers)/sizeof(callers[0]);\r
+\r
+ void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream)\r
+ {\r
+ int winsz2 = winsz >> 1;\r
\r
-texture<unsigned char, 2, cudaReadModeElementType> texForSobel;\r
+ if (winsz2 == 0 || winsz2 >= calles_num)\r
+ cv::gpu::error("Unsupported window size", __FILE__, __LINE__);\r
\r
-__global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)\r
-{\r
- int x = blockDim.x * blockIdx.x + threadIdx.x;\r
- int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+ //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );\r
+ //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );\r
\r
- if (x < output.cols && y < output.rows)\r
- {\r
- int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +\r
- (int)tex2D(texForSobel, x - 1, y ) * (-2) + (int)tex2D(texForSobel, x + 1, y ) * (2) +\r
- (int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);\r
+ cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );\r
+ cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );\r
\r
+ cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );\r
+ cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );\r
+ cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );\r
\r
- conv = ::min(::min(::max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);\r
- output.ptr(y)[x] = conv & 0xFF;\r
- }\r
-}\r
+ size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();\r
+ cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step, &minssd_step, sizeof(minssd_step) ) );\r
\r
-void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap, cudaStream_t & stream)\r
-{\r
- cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();\r
- cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );\r
+ callers[winsz2](left, right, disp, maxdisp, stream);\r
+ }\r
\r
- dim3 threads(16, 16, 1);\r
- dim3 grid(1, 1, 1);\r
+ //////////////////////////////////////////////////////////////////////////////////////////////////\r
+ /////////////////////////////////////// Sobel Prefiler ///////////////////////////////////////////\r
+ //////////////////////////////////////////////////////////////////////////////////////////////////\r
\r
- grid.x = divUp(input.cols, threads.x);\r
- grid.y = divUp(input.rows, threads.y);\r
+ texture<unsigned char, 2, cudaReadModeElementType> texForSobel;\r
\r
- prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);\r
- cudaSafeCall( cudaGetLastError() );\r
+ __global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)\r
+ {\r
+ int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+ int y = blockDim.y * blockIdx.y + threadIdx.y;\r
\r
- if (stream == 0) \r
- cudaSafeCall( cudaDeviceSynchronize() ); \r
+ if (x < output.cols && y < output.rows)\r
+ {\r
+ int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +\r
+ (int)tex2D(texForSobel, x - 1, y ) * (-2) + (int)tex2D(texForSobel, x + 1, y ) * (2) +\r
+ (int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);\r
\r
- cudaSafeCall( cudaUnbindTexture (texForSobel ) );\r
-}\r
\r
+ conv = ::min(::min(::max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);\r
+ output.ptr(y)[x] = conv & 0xFF;\r
+ }\r
+ }\r
\r
-//////////////////////////////////////////////////////////////////////////////////////////////////\r
-/////////////////////////////////// Textureness filtering ////////////////////////////////////////\r
-//////////////////////////////////////////////////////////////////////////////////////////////////\r
+ void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap, cudaStream_t & stream)\r
+ {\r
+ cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();\r
+ cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );\r
\r
-texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;\r
+ dim3 threads(16, 16, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
-__device__ __forceinline__ float sobel(int x, int y)\r
-{\r
- float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +\r
- tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) +\r
- tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);\r
- return fabs(conv);\r
-}\r
+ grid.x = divUp(input.cols, threads.x);\r
+ grid.y = divUp(input.rows, threads.y);\r
\r
-__device__ float CalcSums(float *cols, float *cols_cache, int winsz)\r
-{\r
- float cache = 0;\r
- float cache2 = 0;\r
- int winsz2 = winsz/2;\r
+ prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- for(int i = 1; i <= winsz2; i++)\r
- cache += cols[i];\r
+ if (stream == 0) \r
+ cudaSafeCall( cudaDeviceSynchronize() ); \r
\r
- cols_cache[0] = cache;\r
+ cudaSafeCall( cudaUnbindTexture (texForSobel ) );\r
+ }\r
\r
- __syncthreads();\r
\r
- if (threadIdx.x < blockDim.x - winsz2)\r
- cache2 = cols_cache[winsz2];\r
- else\r
- for(int i = winsz2 + 1; i < winsz; i++)\r
- cache2 += cols[i];\r
+ //////////////////////////////////////////////////////////////////////////////////////////////////\r
+ /////////////////////////////////// Textureness filtering ////////////////////////////////////////\r
+ //////////////////////////////////////////////////////////////////////////////////////////////////\r
\r
- return cols[0] + cache + cache2;\r
-}\r
+ texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;\r
\r
-#define RpT (2 * ROWSperTHREAD) // got experimentally\r
+ __device__ __forceinline__ float sobel(int x, int y)\r
+ {\r
+ float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +\r
+ tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) +\r
+ tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);\r
+ return fabs(conv);\r
+ }\r
\r
-__global__ void textureness_kernel(DevMem2Db disp, int winsz, float threshold)\r
-{\r
- int winsz2 = winsz/2;\r
- int n_dirty_pixels = (winsz2) * 2;\r
+ __device__ float CalcSums(float *cols, float *cols_cache, int winsz)\r
+ {\r
+ float cache = 0;\r
+ float cache2 = 0;\r
+ int winsz2 = winsz/2;\r
\r
- extern __shared__ float cols_cache[];\r
- float *cols = cols_cache + blockDim.x + threadIdx.x;\r
- float *cols_extra = threadIdx.x < n_dirty_pixels ? cols + blockDim.x : 0;\r
+ for(int i = 1; i <= winsz2; i++)\r
+ cache += cols[i];\r
\r
- int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- int beg_row = blockIdx.y * RpT;\r
- int end_row = ::min(beg_row + RpT, disp.rows);\r
+ cols_cache[0] = cache;\r
\r
- if (x < disp.cols)\r
- {\r
- int y = beg_row;\r
+ __syncthreads();\r
\r
- float sum = 0;\r
- float sum_extra = 0;\r
+ if (threadIdx.x < blockDim.x - winsz2)\r
+ cache2 = cols_cache[winsz2];\r
+ else\r
+ for(int i = winsz2 + 1; i < winsz; i++)\r
+ cache2 += cols[i];\r
\r
- for(int i = y - winsz2; i <= y + winsz2; ++i)\r
- {\r
- sum += sobel(x - winsz2, i);\r
- if (cols_extra)\r
- sum_extra += sobel(x + blockDim.x - winsz2, i);\r
+ return cols[0] + cache + cache2;\r
}\r
- *cols = sum;\r
- if (cols_extra)\r
- *cols_extra = sum_extra;\r
\r
- __syncthreads();\r
+ #define RpT (2 * ROWSperTHREAD) // got experimentally\r
\r
- float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;\r
- if (sum_win < threshold)\r
- disp.data[y * disp.step + x] = 0;\r
+ __global__ void textureness_kernel(DevMem2Db disp, int winsz, float threshold)\r
+ {\r
+ int winsz2 = winsz/2;\r
+ int n_dirty_pixels = (winsz2) * 2;\r
\r
- __syncthreads();\r
+ extern __shared__ float cols_cache[];\r
+ float *cols = cols_cache + blockDim.x + threadIdx.x;\r
+ float *cols_extra = threadIdx.x < n_dirty_pixels ? cols + blockDim.x : 0;\r
\r
- for(int y = beg_row + 1; y < end_row; ++y)\r
- {\r
- sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2);\r
- *cols = sum;\r
+ int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ int beg_row = blockIdx.y * RpT;\r
+ int end_row = ::min(beg_row + RpT, disp.rows);\r
\r
- if (cols_extra)\r
+ if (x < disp.cols)\r
{\r
- sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);\r
- *cols_extra = sum_extra;\r
- }\r
+ int y = beg_row;\r
\r
- __syncthreads();\r
- float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;\r
- if (sum_win < threshold)\r
- disp.data[y * disp.step + x] = 0;\r
+ float sum = 0;\r
+ float sum_extra = 0;\r
\r
- __syncthreads();\r
- }\r
- }\r
-}\r
+ for(int i = y - winsz2; i <= y + winsz2; ++i)\r
+ {\r
+ sum += sobel(x - winsz2, i);\r
+ if (cols_extra)\r
+ sum_extra += sobel(x + blockDim.x - winsz2, i);\r
+ }\r
+ *cols = sum;\r
+ if (cols_extra)\r
+ *cols_extra = sum_extra;\r
\r
-void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream)\r
-{\r
- avgTexturenessThreshold *= winsz * winsz;\r
+ __syncthreads();\r
\r
- texForTF.filterMode = cudaFilterModeLinear;\r
- texForTF.addressMode[0] = cudaAddressModeWrap;\r
- texForTF.addressMode[1] = cudaAddressModeWrap;\r
+ float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;\r
+ if (sum_win < threshold)\r
+ disp.data[y * disp.step + x] = 0;\r
\r
- cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();\r
- cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );\r
+ __syncthreads();\r
\r
- dim3 threads(128, 1, 1);\r
- dim3 grid(1, 1, 1);\r
+ for(int y = beg_row + 1; y < end_row; ++y)\r
+ {\r
+ sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2);\r
+ *cols = sum;\r
\r
- grid.x = divUp(input.cols, threads.x);\r
- grid.y = divUp(input.rows, RpT);\r
+ if (cols_extra)\r
+ {\r
+ sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);\r
+ *cols_extra = sum_extra;\r
+ }\r
\r
- size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);\r
- textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);\r
- cudaSafeCall( cudaGetLastError() );\r
+ __syncthreads();\r
+ float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;\r
+ if (sum_win < threshold)\r
+ disp.data[y * disp.step + x] = 0;\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ __syncthreads();\r
+ }\r
+ }\r
+ }\r
+\r
+ void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream)\r
+ {\r
+ avgTexturenessThreshold *= winsz * winsz;\r
\r
- cudaSafeCall( cudaUnbindTexture (texForTF) );\r
-}\r
+ texForTF.filterMode = cudaFilterModeLinear;\r
+ texForTF.addressMode[0] = cudaAddressModeWrap;\r
+ texForTF.addressMode[1] = cudaAddressModeWrap;\r
\r
-} // namespace stereobm\r
+ cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();\r
+ cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ dim3 threads(128, 1, 1);\r
+ dim3 grid(1, 1, 1);\r
+\r
+ grid.x = divUp(input.cols, threads.x);\r
+ grid.y = divUp(input.rows, RpT);\r
+\r
+ size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);\r
+ textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+\r
+ cudaSafeCall( cudaUnbindTexture (texForTF) );\r
+ }\r
+ } // namespace stereobm\r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
#include "opencv2/gpu/device/limits.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace stereobp {\r
-\r
-///////////////////////////////////////////////////////////////\r
-/////////////////////// load constants ////////////////////////\r
-///////////////////////////////////////////////////////////////\r
-\r
-__constant__ int cndisp;\r
-__constant__ float cmax_data_term;\r
-__constant__ float cdata_weight;\r
-__constant__ float cmax_disc_term;\r
-__constant__ float cdisc_single_jump;\r
-\r
-void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)\r
-{\r
- cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int )) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );\r
-}\r
-\r
-///////////////////////////////////////////////////////////////\r
-////////////////////////// comp data //////////////////////////\r
-///////////////////////////////////////////////////////////////\r
-\r
-template <int cn> struct PixDiff;\r
-template <> struct PixDiff<1>\r
-{\r
- __device__ __forceinline__ PixDiff(const uchar* ls)\r
- {\r
- l = *ls;\r
- }\r
- __device__ __forceinline__ float operator()(const uchar* rs) const\r
- {\r
- return ::abs((int)l - *rs);\r
- }\r
- uchar l;\r
-};\r
-template <> struct PixDiff<3>\r
-{\r
- __device__ __forceinline__ PixDiff(const uchar* ls)\r
- {\r
- l = *((uchar3*)ls);\r
- }\r
- __device__ __forceinline__ float operator()(const uchar* rs) const\r
- {\r
- const float tr = 0.299f;\r
- const float tg = 0.587f;\r
- const float tb = 0.114f;\r
-\r
- float val = tb * ::abs((int)l.x - rs[0]);\r
- val += tg * ::abs((int)l.y - rs[1]);\r
- val += tr * ::abs((int)l.z - rs[2]);\r
-\r
- return val;\r
- }\r
- uchar3 l;\r
-};\r
-template <> struct PixDiff<4>\r
+namespace cv { namespace gpu { namespace device \r
{\r
- __device__ __forceinline__ PixDiff(const uchar* ls)\r
+ namespace stereobp \r
{\r
- l = *((uchar4*)ls);\r
- }\r
- __device__ __forceinline__ float operator()(const uchar* rs) const\r
- {\r
- const float tr = 0.299f;\r
- const float tg = 0.587f;\r
- const float tb = 0.114f;\r
-\r
- uchar4 r = *((uchar4*)rs);\r
+ ///////////////////////////////////////////////////////////////\r
+ /////////////////////// load constants ////////////////////////\r
+ ///////////////////////////////////////////////////////////////\r
\r
- float val = tb * ::abs((int)l.x - r.x);\r
- val += tg * ::abs((int)l.y - r.y);\r
- val += tr * ::abs((int)l.z - r.z);\r
+ __constant__ int cndisp;\r
+ __constant__ float cmax_data_term;\r
+ __constant__ float cdata_weight;\r
+ __constant__ float cmax_disc_term;\r
+ __constant__ float cdisc_single_jump;\r
\r
- return val;\r
- }\r
- uchar4 l;\r
-};\r
+ void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int )) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );\r
+ }\r
\r
-template <int cn, typename D>\r
-__global__ void comp_data(const DevMem2Db left, const PtrStepb right, PtrElemStep_<D> data)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ ///////////////////////////////////////////////////////////////\r
+ ////////////////////////// comp data //////////////////////////\r
+ ///////////////////////////////////////////////////////////////\r
\r
- if (y > 0 && y < left.rows - 1 && x > 0 && x < left.cols - 1)\r
- {\r
- const uchar* ls = left.ptr(y) + x * cn;\r
- const PixDiff<cn> pixDiff(ls);\r
- const uchar* rs = right.ptr(y) + x * cn;\r
+ template <int cn> struct PixDiff;\r
+ template <> struct PixDiff<1>\r
+ {\r
+ __device__ __forceinline__ PixDiff(const uchar* ls)\r
+ {\r
+ l = *ls;\r
+ }\r
+ __device__ __forceinline__ float operator()(const uchar* rs) const\r
+ {\r
+ return ::abs((int)l - *rs);\r
+ }\r
+ uchar l;\r
+ };\r
+ template <> struct PixDiff<3>\r
+ {\r
+ __device__ __forceinline__ PixDiff(const uchar* ls)\r
+ {\r
+ l = *((uchar3*)ls);\r
+ }\r
+ __device__ __forceinline__ float operator()(const uchar* rs) const\r
+ {\r
+ const float tr = 0.299f;\r
+ const float tg = 0.587f;\r
+ const float tb = 0.114f;\r
\r
- D* ds = data.ptr(y) + x;\r
- const size_t disp_step = data.step * left.rows;\r
+ float val = tb * ::abs((int)l.x - rs[0]);\r
+ val += tg * ::abs((int)l.y - rs[1]);\r
+ val += tr * ::abs((int)l.z - rs[2]);\r
\r
- for (int disp = 0; disp < cndisp; disp++)\r
+ return val;\r
+ }\r
+ uchar3 l;\r
+ };\r
+ template <> struct PixDiff<4>\r
{\r
- if (x - disp >= 1)\r
+ __device__ __forceinline__ PixDiff(const uchar* ls)\r
{\r
- float val = pixDiff(rs - disp * cn);\r
+ l = *((uchar4*)ls);\r
+ }\r
+ __device__ __forceinline__ float operator()(const uchar* rs) const\r
+ {\r
+ const float tr = 0.299f;\r
+ const float tg = 0.587f;\r
+ const float tb = 0.114f;\r
+\r
+ uchar4 r = *((uchar4*)rs);\r
\r
- ds[disp * disp_step] = saturate_cast<D>(fmin(cdata_weight * val, cdata_weight * cmax_data_term));\r
+ float val = tb * ::abs((int)l.x - r.x);\r
+ val += tg * ::abs((int)l.y - r.y);\r
+ val += tr * ::abs((int)l.z - r.z);\r
+\r
+ return val;\r
}\r
- else\r
+ uchar4 l;\r
+ };\r
+\r
+ template <int cn, typename D>\r
+ __global__ void comp_data(const DevMem2Db left, const PtrStepb right, PtrElemStep_<D> data)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if (y > 0 && y < left.rows - 1 && x > 0 && x < left.cols - 1)\r
{\r
- ds[disp * disp_step] = saturate_cast<D>(cdata_weight * cmax_data_term);\r
+ const uchar* ls = left.ptr(y) + x * cn;\r
+ const PixDiff<cn> pixDiff(ls);\r
+ const uchar* rs = right.ptr(y) + x * cn;\r
+\r
+ D* ds = data.ptr(y) + x;\r
+ const size_t disp_step = data.step * left.rows;\r
+\r
+ for (int disp = 0; disp < cndisp; disp++)\r
+ {\r
+ if (x - disp >= 1)\r
+ {\r
+ float val = pixDiff(rs - disp * cn);\r
+\r
+ ds[disp * disp_step] = saturate_cast<D>(fmin(cdata_weight * val, cdata_weight * cmax_data_term));\r
+ }\r
+ else\r
+ {\r
+ ds[disp * disp_step] = saturate_cast<D>(cdata_weight * cmax_data_term);\r
+ }\r
+ }\r
}\r
}\r
- }\r
-}\r
-\r
-template<typename T, typename D>\r
-void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);\r
\r
-template <> void comp_data_gpu<uchar, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
+ template<typename T, typename D>\r
+ void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);\r
\r
- grid.x = divUp(left.cols, threads.x);\r
- grid.y = divUp(left.rows, threads.y);\r
+ template <> void comp_data_gpu<uchar, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
- comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ grid.x = divUp(left.cols, threads.x);\r
+ grid.y = divUp(left.rows, threads.y);\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-template <> void comp_data_gpu<uchar, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
+ comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- grid.x = divUp(left.cols, threads.x);\r
- grid.y = divUp(left.rows, threads.y);\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+ template <> void comp_data_gpu<uchar, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
- comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ grid.x = divUp(left.cols, threads.x);\r
+ grid.y = divUp(left.rows, threads.y);\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-template <> void comp_data_gpu<uchar3, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- grid.x = divUp(left.cols, threads.x);\r
- grid.y = divUp(left.rows, threads.y);\r
+ template <> void comp_data_gpu<uchar3, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
- comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ grid.x = divUp(left.cols, threads.x);\r
+ grid.y = divUp(left.rows, threads.y);\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-template <> void comp_data_gpu<uchar3, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
+ comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- grid.x = divUp(left.cols, threads.x);\r
- grid.y = divUp(left.rows, threads.y);\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+ template <> void comp_data_gpu<uchar3, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
- comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ grid.x = divUp(left.cols, threads.x);\r
+ grid.y = divUp(left.rows, threads.y);\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-template <> void comp_data_gpu<uchar4, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- grid.x = divUp(left.cols, threads.x);\r
- grid.y = divUp(left.rows, threads.y);\r
+ template <> void comp_data_gpu<uchar4, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
- comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ grid.x = divUp(left.cols, threads.x);\r
+ grid.y = divUp(left.rows, threads.y);\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-template <> void comp_data_gpu<uchar4, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
+ comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- grid.x = divUp(left.cols, threads.x);\r
- grid.y = divUp(left.rows, threads.y);\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+ template <> void comp_data_gpu<uchar4, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
- comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);\r
- cudaSafeCall( cudaGetLastError() );\r
+ grid.x = divUp(left.cols, threads.x);\r
+ grid.y = divUp(left.rows, threads.y);\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-///////////////////////////////////////////////////////////////\r
-//////////////////////// data step down ///////////////////////\r
-///////////////////////////////////////////////////////////////\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-template <typename T>\r
-__global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ ///////////////////////////////////////////////////////////////\r
+ //////////////////////// data step down ///////////////////////\r
+ ///////////////////////////////////////////////////////////////\r
\r
- if (x < dst_cols && y < dst_rows)\r
- {\r
- for (int d = 0; d < cndisp; ++d)\r
+ template <typename T>\r
+ __global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)\r
{\r
- float dst_reg = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];\r
- dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+0)];\r
- dst_reg += src.ptr(d * src_rows + (2*y+0))[(2*x+1)];\r
- dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+1)];\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);\r
+ if (x < dst_cols && y < dst_rows)\r
+ {\r
+ for (int d = 0; d < cndisp; ++d)\r
+ {\r
+ float dst_reg = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];\r
+ dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+0)];\r
+ dst_reg += src.ptr(d * src_rows + (2*y+0))[(2*x+1)];\r
+ dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+1)];\r
+\r
+ dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);\r
+ }\r
+ }\r
}\r
- }\r
-}\r
\r
-template<typename T>\r
-void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
+ template<typename T>\r
+ void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
- grid.x = divUp(dst_cols, threads.x);\r
- grid.y = divUp(dst_rows, threads.y);\r
+ grid.x = divUp(dst_cols, threads.x);\r
+ grid.y = divUp(dst_rows, threads.y);\r
\r
- data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);\r
- cudaSafeCall( cudaGetLastError() );\r
+ data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);\r
-template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);\r
+ template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);\r
\r
-///////////////////////////////////////////////////////////////\r
-/////////////////// level up messages ////////////////////////\r
-///////////////////////////////////////////////////////////////\r
+ ///////////////////////////////////////////////////////////////\r
+ /////////////////// level up messages ////////////////////////\r
+ ///////////////////////////////////////////////////////////////\r
\r
-template <typename T>\r
-__global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrElemStep_<T> src, PtrElemStep_<T> dst)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ template <typename T>\r
+ __global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrElemStep_<T> src, PtrElemStep_<T> dst)\r
+ {\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- if (x < dst_cols && y < dst_rows)\r
- {\r
- const size_t dst_disp_step = dst.step * dst_rows;\r
- const size_t src_disp_step = src.step * src_rows;\r
+ if (x < dst_cols && y < dst_rows)\r
+ {\r
+ const size_t dst_disp_step = dst.step * dst_rows;\r
+ const size_t src_disp_step = src.step * src_rows;\r
\r
- T* dstr = dst.ptr(y ) + x;\r
- const T* srcr = src.ptr(y/2) + x/2;\r
+ T* dstr = dst.ptr(y ) + x;\r
+ const T* srcr = src.ptr(y/2) + x/2;\r
\r
- for (int d = 0; d < cndisp; ++d)\r
- dstr[d * dst_disp_step] = srcr[d * src_disp_step];\r
- }\r
-}\r
+ for (int d = 0; d < cndisp; ++d)\r
+ dstr[d * dst_disp_step] = srcr[d * src_disp_step];\r
+ }\r
+ }\r
\r
-template <typename T>\r
-void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
+ template <typename T>\r
+ void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
- grid.x = divUp(dst_cols, threads.x);\r
- grid.y = divUp(dst_rows, threads.y);\r
+ grid.x = divUp(dst_cols, threads.x);\r
+ grid.y = divUp(dst_rows, threads.y);\r
\r
- int src_idx = (dst_idx + 1) & 1;\r
+ int src_idx = (dst_idx + 1) & 1;\r
\r
- level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);\r
- cudaSafeCall( cudaGetLastError() );\r
+ level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);\r
- cudaSafeCall( cudaGetLastError() );\r
+ level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);\r
- cudaSafeCall( cudaGetLastError() );\r
+ level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);\r
- cudaSafeCall( cudaGetLastError() );\r
+ level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);\r
-template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);\r
+ template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);\r
+ template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);\r
\r
-///////////////////////////////////////////////////////////////\r
-//////////////////// calc all iterations /////////////////////\r
-///////////////////////////////////////////////////////////////\r
+ ///////////////////////////////////////////////////////////////\r
+ //////////////////// calc all iterations /////////////////////\r
+ ///////////////////////////////////////////////////////////////\r
\r
-template <typename T>\r
-__device__ void calc_min_linear_penalty(T* dst, size_t step)\r
-{\r
- float prev = dst[0];\r
- float cur;\r
- for (int disp = 1; disp < cndisp; ++disp)\r
- {\r
- prev += cdisc_single_jump;\r
- cur = dst[step * disp];\r
- if (prev < cur)\r
+ template <typename T>\r
+ __device__ void calc_min_linear_penalty(T* dst, size_t step)\r
{\r
- cur = prev;\r
- dst[step * disp] = saturate_cast<T>(prev);\r
+ float prev = dst[0];\r
+ float cur;\r
+ for (int disp = 1; disp < cndisp; ++disp)\r
+ {\r
+ prev += cdisc_single_jump;\r
+ cur = dst[step * disp];\r
+ if (prev < cur)\r
+ {\r
+ cur = prev;\r
+ dst[step * disp] = saturate_cast<T>(prev);\r
+ }\r
+ prev = cur;\r
+ }\r
+\r
+ prev = dst[(cndisp - 1) * step];\r
+ for (int disp = cndisp - 2; disp >= 0; disp--)\r
+ {\r
+ prev += cdisc_single_jump;\r
+ cur = dst[step * disp];\r
+ if (prev < cur)\r
+ {\r
+ cur = prev;\r
+ dst[step * disp] = saturate_cast<T>(prev);\r
+ }\r
+ prev = cur;\r
+ }\r
}\r
- prev = cur;\r
- }\r
\r
- prev = dst[(cndisp - 1) * step];\r
- for (int disp = cndisp - 2; disp >= 0; disp--)\r
- {\r
- prev += cdisc_single_jump;\r
- cur = dst[step * disp];\r
- if (prev < cur)\r
+ template <typename T>\r
+ __device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)\r
{\r
- cur = prev;\r
- dst[step * disp] = saturate_cast<T>(prev);\r
- }\r
- prev = cur;\r
- }\r
-}\r
+ float minimum = device::numeric_limits<float>::max();\r
\r
-template <typename T>\r
-__device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)\r
-{\r
- float minimum = device::numeric_limits<float>::max();\r
+ for(int i = 0; i < cndisp; ++i)\r
+ {\r
+ float dst_reg = msg1[msg_disp_step * i];\r
+ dst_reg += msg2[msg_disp_step * i];\r
+ dst_reg += msg3[msg_disp_step * i];\r
+ dst_reg += data[data_disp_step * i];\r
\r
- for(int i = 0; i < cndisp; ++i)\r
- {\r
- float dst_reg = msg1[msg_disp_step * i];\r
- dst_reg += msg2[msg_disp_step * i];\r
- dst_reg += msg3[msg_disp_step * i];\r
- dst_reg += data[data_disp_step * i];\r
+ if (dst_reg < minimum)\r
+ minimum = dst_reg;\r
\r
- if (dst_reg < minimum)\r
- minimum = dst_reg;\r
+ dst[msg_disp_step * i] = saturate_cast<T>(dst_reg);\r
+ }\r
\r
- dst[msg_disp_step * i] = saturate_cast<T>(dst_reg);\r
- }\r
+ calc_min_linear_penalty(dst, msg_disp_step);\r
\r
- calc_min_linear_penalty(dst, msg_disp_step);\r
+ minimum += cmax_disc_term;\r
\r
- minimum += cmax_disc_term;\r
+ float sum = 0;\r
+ for(int i = 0; i < cndisp; ++i)\r
+ {\r
+ float dst_reg = dst[msg_disp_step * i];\r
+ if (dst_reg > minimum)\r
+ {\r
+ dst_reg = minimum;\r
+ dst[msg_disp_step * i] = saturate_cast<T>(minimum);\r
+ }\r
+ sum += dst_reg;\r
+ }\r
+ sum /= cndisp;\r
\r
- float sum = 0;\r
- for(int i = 0; i < cndisp; ++i)\r
- {\r
- float dst_reg = dst[msg_disp_step * i];\r
- if (dst_reg > minimum)\r
- {\r
- dst_reg = minimum;\r
- dst[msg_disp_step * i] = saturate_cast<T>(minimum);\r
+ for(int i = 0; i < cndisp; ++i)\r
+ dst[msg_disp_step * i] -= sum;\r
}\r
- sum += dst_reg;\r
- }\r
- sum /= cndisp;\r
-\r
- for(int i = 0; i < cndisp; ++i)\r
- dst[msg_disp_step * i] -= sum;\r
-}\r
-\r
-template <typename T>\r
-__global__ void one_iteration(int t, PtrElemStep_<T> u, T* d, T* l, T* r, const PtrElemStep_<T> data, int cols, int rows)\r
-{\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
- const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);\r
\r
- if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))\r
- {\r
- T* us = u.ptr(y) + x;\r
- T* ds = d + y * u.step + x;\r
- T* ls = l + y * u.step + x;\r
- T* rs = r + y * u.step + x;\r
- const T* dt = data.ptr(y) + x;\r
-\r
- size_t msg_disp_step = u.step * rows;\r
- size_t data_disp_step = data.step * rows;\r
-\r
- message(us + u.step, ls + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);\r
- message(ds - u.step, ls + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);\r
- message(us + u.step, ds - u.step, rs - 1, dt, rs, msg_disp_step, data_disp_step);\r
- message(us + u.step, ds - u.step, ls + 1, dt, ls, msg_disp_step, data_disp_step);\r
- }\r
-}\r
-\r
-template <typename T>\r
-void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d,\r
- const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
-\r
- grid.x = divUp(cols, threads.x << 1);\r
- grid.y = divUp(rows, threads.y);\r
+ template <typename T>\r
+ __global__ void one_iteration(int t, PtrElemStep_<T> u, T* d, T* l, T* r, const PtrElemStep_<T> data, int cols, int rows)\r
+ {\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);\r
\r
- for(int t = 0; t < iters; ++t)\r
- {\r
- one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);\r
- cudaSafeCall( cudaGetLastError() );\r
+ if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))\r
+ {\r
+ T* us = u.ptr(y) + x;\r
+ T* ds = d + y * u.step + x;\r
+ T* ls = l + y * u.step + x;\r
+ T* rs = r + y * u.step + x;\r
+ const T* dt = data.ptr(y) + x;\r
+\r
+ size_t msg_disp_step = u.step * rows;\r
+ size_t data_disp_step = data.step * rows;\r
+\r
+ message(us + u.step, ls + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);\r
+ message(ds - u.step, ls + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);\r
+ message(us + u.step, ds - u.step, rs - 1, dt, rs, msg_disp_step, data_disp_step);\r
+ message(us + u.step, ds - u.step, ls + 1, dt, ls, msg_disp_step, data_disp_step);\r
+ }\r
+ }\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
- }\r
-}\r
+ template <typename T>\r
+ void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d,\r
+ const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
-template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);\r
-template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);\r
+ grid.x = divUp(cols, threads.x << 1);\r
+ grid.y = divUp(rows, threads.y);\r
\r
-///////////////////////////////////////////////////////////////\r
-/////////////////////////// output ////////////////////////////\r
-///////////////////////////////////////////////////////////////\r
+ for(int t = 0; t < iters; ++t)\r
+ {\r
+ one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-template <typename T>\r
-__global__ void output(const PtrElemStep_<T> u, const T* d, const T* l, const T* r, const T* data,\r
- DevMem2D_<short> disp)\r
-{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+ }\r
\r
- if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)\r
- {\r
- const T* us = u.ptr(y + 1) + x;\r
- const T* ds = d + (y - 1) * u.step + x;\r
- const T* ls = l + y * u.step + (x + 1);\r
- const T* rs = r + y * u.step + (x - 1);\r
- const T* dt = data + y * u.step + x;\r
+ template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);\r
+ template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);\r
\r
- size_t disp_step = disp.rows * u.step;\r
+ ///////////////////////////////////////////////////////////////\r
+ /////////////////////////// output ////////////////////////////\r
+ ///////////////////////////////////////////////////////////////\r
\r
- int best = 0;\r
- float best_val = numeric_limits<float>::max();\r
- for (int d = 0; d < cndisp; ++d)\r
+ template <typename T>\r
+ __global__ void output(const PtrElemStep_<T> u, const T* d, const T* l, const T* r, const T* data,\r
+ DevMem2D_<short> disp)\r
{\r
- float val = us[d * disp_step];\r
- val += ds[d * disp_step];\r
- val += ls[d * disp_step];\r
- val += rs[d * disp_step];\r
- val += dt[d * disp_step];\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- if (val < best_val)\r
+ if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)\r
{\r
- best_val = val;\r
- best = d;\r
+ const T* us = u.ptr(y + 1) + x;\r
+ const T* ds = d + (y - 1) * u.step + x;\r
+ const T* ls = l + y * u.step + (x + 1);\r
+ const T* rs = r + y * u.step + (x - 1);\r
+ const T* dt = data + y * u.step + x;\r
+\r
+ size_t disp_step = disp.rows * u.step;\r
+\r
+ int best = 0;\r
+ float best_val = numeric_limits<float>::max();\r
+ for (int d = 0; d < cndisp; ++d)\r
+ {\r
+ float val = us[d * disp_step];\r
+ val += ds[d * disp_step];\r
+ val += ls[d * disp_step];\r
+ val += rs[d * disp_step];\r
+ val += dt[d * disp_step];\r
+\r
+ if (val < best_val)\r
+ {\r
+ best_val = val;\r
+ best = d;\r
+ }\r
+ }\r
+\r
+ disp.ptr(y)[x] = saturate_cast<short>(best);\r
}\r
}\r
\r
- disp.ptr(y)[x] = saturate_cast<short>(best);\r
- }\r
-}\r
-\r
-template <typename T>\r
-void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,\r
- const DevMem2D_<short>& disp, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
-\r
- grid.x = divUp(disp.cols, threads.x);\r
- grid.y = divUp(disp.rows, threads.y);\r
-\r
- output<T><<<grid, threads, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);\r
- cudaSafeCall( cudaGetLastError() );\r
+ template <typename T>\r
+ void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,\r
+ const DevMem2D_<short>& disp, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ grid.x = divUp(disp.cols, threads.x);\r
+ grid.y = divUp(disp.rows, threads.y);\r
\r
-template void output_gpu<short>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);\r
-template void output_gpu<float>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);\r
+ output<T><<<grid, threads, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-} // namespace stereobp\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template void output_gpu<short>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);\r
+ template void output_gpu<float>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);\r
+ } // namespace stereobp\r
+}}} // namespace cv { namespace gpu { namespace device\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
#include "opencv2/gpu/device/limits.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace stereocsbp {\r
-\r
-///////////////////////////////////////////////////////////////\r
-/////////////////////// load constants ////////////////////////\r
-///////////////////////////////////////////////////////////////\r
-\r
-__constant__ int cndisp;\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ namespace stereocsbp \r
+ {\r
+ ///////////////////////////////////////////////////////////////\r
+ /////////////////////// load constants ////////////////////////\r
+ ///////////////////////////////////////////////////////////////\r
\r
-__constant__ float cmax_data_term;\r
-__constant__ float cdata_weight;\r
-__constant__ float cmax_disc_term;\r
-__constant__ float cdisc_single_jump;\r
+ __constant__ int cndisp;\r
\r
-__constant__ int cth;\r
+ __constant__ float cmax_data_term;\r
+ __constant__ float cdata_weight;\r
+ __constant__ float cmax_disc_term;\r
+ __constant__ float cdisc_single_jump;\r
\r
-__constant__ size_t cimg_step;\r
-__constant__ size_t cmsg_step1;\r
-__constant__ size_t cmsg_step2;\r
-__constant__ size_t cdisp_step1;\r
-__constant__ size_t cdisp_step2;\r
+ __constant__ int cth;\r
\r
-__constant__ uchar* cleft;\r
-__constant__ uchar* cright;\r
-__constant__ uchar* ctemp;\r
+ __constant__ size_t cimg_step;\r
+ __constant__ size_t cmsg_step1;\r
+ __constant__ size_t cmsg_step2;\r
+ __constant__ size_t cdisp_step1;\r
+ __constant__ size_t cdisp_step2;\r
\r
+ __constant__ uchar* cleft;\r
+ __constant__ uchar* cright;\r
+ __constant__ uchar* ctemp;\r
\r
-void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,\r
- const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& temp)\r
-{\r
- cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );\r
\r
- cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );\r
+ void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,\r
+ const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& temp)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );\r
\r
- cudaSafeCall( cudaMemcpyToSymbol(cth, &min_disp_th, sizeof(int)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );\r
\r
- cudaSafeCall( cudaMemcpyToSymbol(cimg_step, &left.step, sizeof(size_t)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cth, &min_disp_th, sizeof(int)) );\r
\r
- cudaSafeCall( cudaMemcpyToSymbol(cleft, &left.data, sizeof(left.data)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cright, &right.data, sizeof(right.data)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(ctemp, &temp.data, sizeof(temp.data)) );\r
-}\r
+ cudaSafeCall( cudaMemcpyToSymbol(cimg_step, &left.step, sizeof(size_t)) );\r
\r
-///////////////////////////////////////////////////////////////\r
-/////////////////////// init data cost ////////////////////////\r
-///////////////////////////////////////////////////////////////\r
+ cudaSafeCall( cudaMemcpyToSymbol(cleft, &left.data, sizeof(left.data)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cright, &right.data, sizeof(right.data)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(ctemp, &temp.data, sizeof(temp.data)) );\r
+ }\r
\r
-template <int channels> struct DataCostPerPixel;\r
-template <> struct DataCostPerPixel<1>\r
-{\r
- static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)\r
- {\r
- return fmin(cdata_weight * ::abs((int)*left - *right), cdata_weight * cmax_data_term);\r
- }\r
-};\r
-template <> struct DataCostPerPixel<3>\r
-{\r
- static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)\r
- {\r
- float tb = 0.114f * ::abs((int)left[0] - right[0]);\r
- float tg = 0.587f * ::abs((int)left[1] - right[1]);\r
- float tr = 0.299f * ::abs((int)left[2] - right[2]);\r
-\r
- return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);\r
- }\r
-};\r
-template <> struct DataCostPerPixel<4>\r
-{\r
- static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)\r
- {\r
- uchar4 l = *((const uchar4*)left);\r
- uchar4 r = *((const uchar4*)right);\r
+ ///////////////////////////////////////////////////////////////\r
+ /////////////////////// init data cost ////////////////////////\r
+ ///////////////////////////////////////////////////////////////\r
\r
- float tb = 0.114f * ::abs((int)l.x - r.x);\r
- float tg = 0.587f * ::abs((int)l.y - r.y);\r
- float tr = 0.299f * ::abs((int)l.z - r.z);\r
+ template <int channels> struct DataCostPerPixel;\r
+ template <> struct DataCostPerPixel<1>\r
+ {\r
+ static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)\r
+ {\r
+ return fmin(cdata_weight * ::abs((int)*left - *right), cdata_weight * cmax_data_term);\r
+ }\r
+ };\r
+ template <> struct DataCostPerPixel<3>\r
+ {\r
+ static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)\r
+ {\r
+ float tb = 0.114f * ::abs((int)left[0] - right[0]);\r
+ float tg = 0.587f * ::abs((int)left[1] - right[1]);\r
+ float tr = 0.299f * ::abs((int)left[2] - right[2]);\r
\r
- return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);\r
- }\r
-};\r
+ return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);\r
+ }\r
+ };\r
+ template <> struct DataCostPerPixel<4>\r
+ {\r
+ static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)\r
+ {\r
+ uchar4 l = *((const uchar4*)left);\r
+ uchar4 r = *((const uchar4*)right);\r
\r
-template <typename T>\r
-__global__ void get_first_k_initial_global(T* data_cost_selected_, T *selected_disp_pyr, int h, int w, int nr_plane)\r
-{\r
- int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ float tb = 0.114f * ::abs((int)l.x - r.x);\r
+ float tg = 0.587f * ::abs((int)l.y - r.y);\r
+ float tr = 0.299f * ::abs((int)l.z - r.z);\r
\r
- if (y < h && x < w)\r
- {\r
- T* selected_disparity = selected_disp_pyr + y * cmsg_step1 + x;\r
- T* data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;\r
- T* data_cost = (T*)ctemp + y * cmsg_step1 + x;\r
+ return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);\r
+ }\r
+ };\r
\r
- for(int i = 0; i < nr_plane; i++)\r
+ template <typename T>\r
+ __global__ void get_first_k_initial_global(T* data_cost_selected_, T *selected_disp_pyr, int h, int w, int nr_plane)\r
{\r
- T minimum = device::numeric_limits<T>::max();\r
- int id = 0;\r
- for(int d = 0; d < cndisp; d++)\r
+ int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if (y < h && x < w)\r
{\r
- T cur = data_cost[d * cdisp_step1];\r
- if(cur < minimum)\r
+ T* selected_disparity = selected_disp_pyr + y * cmsg_step1 + x;\r
+ T* data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;\r
+ T* data_cost = (T*)ctemp + y * cmsg_step1 + x;\r
+\r
+ for(int i = 0; i < nr_plane; i++)\r
{\r
- minimum = cur;\r
- id = d;\r
+ T minimum = device::numeric_limits<T>::max();\r
+ int id = 0;\r
+ for(int d = 0; d < cndisp; d++)\r
+ {\r
+ T cur = data_cost[d * cdisp_step1];\r
+ if(cur < minimum)\r
+ {\r
+ minimum = cur;\r
+ id = d;\r
+ }\r
+ }\r
+\r
+ data_cost_selected[i * cdisp_step1] = minimum;\r
+ selected_disparity[i * cdisp_step1] = id;\r
+ data_cost [id * cdisp_step1] = numeric_limits<T>::max();\r
}\r
}\r
-\r
- data_cost_selected[i * cdisp_step1] = minimum;\r
- selected_disparity[i * cdisp_step1] = id;\r
- data_cost [id * cdisp_step1] = numeric_limits<T>::max();\r
}\r
- }\r
-}\r
\r
\r
-template <typename T>\r
-__global__ void get_first_k_initial_local(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane)\r
-{\r
- int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ template <typename T>\r
+ __global__ void get_first_k_initial_local(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane)\r
+ {\r
+ int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- if (y < h && x < w)\r
- {\r
- T* selected_disparity = selected_disp_pyr + y * cmsg_step1 + x;\r
- T* data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;\r
- T* data_cost = (T*)ctemp + y * cmsg_step1 + x;\r
+ if (y < h && x < w)\r
+ {\r
+ T* selected_disparity = selected_disp_pyr + y * cmsg_step1 + x;\r
+ T* data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;\r
+ T* data_cost = (T*)ctemp + y * cmsg_step1 + x;\r
\r
- int nr_local_minimum = 0;\r
+ int nr_local_minimum = 0;\r
\r
- T prev = data_cost[0 * cdisp_step1];\r
- T cur = data_cost[1 * cdisp_step1];\r
- T next = data_cost[2 * cdisp_step1];\r
+ T prev = data_cost[0 * cdisp_step1];\r
+ T cur = data_cost[1 * cdisp_step1];\r
+ T next = data_cost[2 * cdisp_step1];\r
\r
- for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++)\r
- {\r
- if (cur < prev && cur < next)\r
- {\r
- data_cost_selected[nr_local_minimum * cdisp_step1] = cur;\r
- selected_disparity[nr_local_minimum * cdisp_step1] = d;\r
+ for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++)\r
+ {\r
+ if (cur < prev && cur < next)\r
+ {\r
+ data_cost_selected[nr_local_minimum * cdisp_step1] = cur;\r
+ selected_disparity[nr_local_minimum * cdisp_step1] = d;\r
+\r
+ data_cost[d * cdisp_step1] = numeric_limits<T>::max();\r
+\r
+ nr_local_minimum++;\r
+ }\r
+ prev = cur;\r
+ cur = next;\r
+ next = data_cost[(d + 1) * cdisp_step1];\r
+ }\r
+\r
+ for (int i = nr_local_minimum; i < nr_plane; i++)\r
+ {\r
+ T minimum = numeric_limits<T>::max();\r
+ int id = 0;\r
\r
- data_cost[d * cdisp_step1] = numeric_limits<T>::max();\r
+ for (int d = 0; d < cndisp; d++)\r
+ {\r
+ cur = data_cost[d * cdisp_step1];\r
+ if (cur < minimum)\r
+ {\r
+ minimum = cur;\r
+ id = d;\r
+ }\r
+ }\r
+ data_cost_selected[i * cdisp_step1] = minimum;\r
+ selected_disparity[i * cdisp_step1] = id;\r
\r
- nr_local_minimum++;\r
+ data_cost[id * cdisp_step1] = numeric_limits<T>::max();\r
+ }\r
}\r
- prev = cur;\r
- cur = next;\r
- next = data_cost[(d + 1) * cdisp_step1];\r
}\r
\r
- for (int i = nr_local_minimum; i < nr_plane; i++)\r
+ template <typename T, int channels>\r
+ __global__ void init_data_cost(int h, int w, int level)\r
{\r
- T minimum = numeric_limits<T>::max();\r
- int id = 0;\r
+ int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- for (int d = 0; d < cndisp; d++)\r
+ if (y < h && x < w)\r
{\r
- cur = data_cost[d * cdisp_step1];\r
- if (cur < minimum)\r
+ int y0 = y << level;\r
+ int yt = (y + 1) << level;\r
+\r
+ int x0 = x << level;\r
+ int xt = (x + 1) << level;\r
+\r
+ T* data_cost = (T*)ctemp + y * cmsg_step1 + x;\r
+\r
+ for(int d = 0; d < cndisp; ++d)\r
{\r
- minimum = cur;\r
- id = d;\r
+ float val = 0.0f;\r
+ for(int yi = y0; yi < yt; yi++)\r
+ {\r
+ for(int xi = x0; xi < xt; xi++)\r
+ {\r
+ int xr = xi - d;\r
+ if(d < cth || xr < 0)\r
+ val += cdata_weight * cmax_data_term;\r
+ else\r
+ {\r
+ const uchar* lle = cleft + yi * cimg_step + xi * channels;\r
+ const uchar* lri = cright + yi * cimg_step + xr * channels;\r
+\r
+ val += DataCostPerPixel<channels>::compute(lle, lri);\r
+ }\r
+ }\r
+ }\r
+ data_cost[cdisp_step1 * d] = saturate_cast<T>(val);\r
}\r
}\r
- data_cost_selected[i * cdisp_step1] = minimum;\r
- selected_disparity[i * cdisp_step1] = id;\r
-\r
- data_cost[id * cdisp_step1] = numeric_limits<T>::max();\r
}\r
- }\r
-}\r
\r
-template <typename T, int channels>\r
-__global__ void init_data_cost(int h, int w, int level)\r
-{\r
- int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ template <typename T, int winsz, int channels>\r
+ __global__ void init_data_cost_reduce(int level, int rows, int cols, int h)\r
+ {\r
+ int x_out = blockIdx.x;\r
+ int y_out = blockIdx.y % h;\r
+ int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;\r
\r
- if (y < h && x < w)\r
- {\r
- int y0 = y << level;\r
- int yt = (y + 1) << level;\r
+ int tid = threadIdx.x;\r
\r
- int x0 = x << level;\r
- int xt = (x + 1) << level;\r
+ if (d < cndisp)\r
+ {\r
+ int x0 = x_out << level;\r
+ int y0 = y_out << level;\r
\r
- T* data_cost = (T*)ctemp + y * cmsg_step1 + x;\r
+ int len = ::min(y0 + winsz, rows) - y0;\r
\r
- for(int d = 0; d < cndisp; ++d)\r
- {\r
- float val = 0.0f;\r
- for(int yi = y0; yi < yt; yi++)\r
- {\r
- for(int xi = x0; xi < xt; xi++)\r
+ float val = 0.0f;\r
+ if (x0 + tid < cols)\r
{\r
- int xr = xi - d;\r
- if(d < cth || xr < 0)\r
- val += cdata_weight * cmax_data_term;\r
+ if (x0 + tid - d < 0 || d < cth)\r
+ val = cdata_weight * cmax_data_term * len;\r
else\r
{\r
- const uchar* lle = cleft + yi * cimg_step + xi * channels;\r
- const uchar* lri = cright + yi * cimg_step + xr * channels;\r
+ const uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid );\r
+ const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d);\r
+\r
+ for(int y = 0; y < len; ++y)\r
+ {\r
+ val += DataCostPerPixel<channels>::compute(lle, lri);\r
\r
- val += DataCostPerPixel<channels>::compute(lle, lri);\r
+ lle += cimg_step;\r
+ lri += cimg_step;\r
+ }\r
}\r
}\r
- }\r
- data_cost[cdisp_step1 * d] = saturate_cast<T>(val);\r
- }\r
- }\r
-}\r
\r
-template <typename T, int winsz, int channels>\r
-__global__ void init_data_cost_reduce(int level, int rows, int cols, int h)\r
-{\r
- int x_out = blockIdx.x;\r
- int y_out = blockIdx.y % h;\r
- int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;\r
+ extern __shared__ float smem[];\r
+ float* dline = smem + winsz * threadIdx.z;\r
\r
- int tid = threadIdx.x;\r
+ dline[tid] = val;\r
\r
- if (d < cndisp)\r
- {\r
- int x0 = x_out << level;\r
- int y0 = y_out << level;\r
+ __syncthreads();\r
\r
- int len = ::min(y0 + winsz, rows) - y0;\r
+ if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }\r
+ if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } __syncthreads(); }\r
\r
- float val = 0.0f;\r
- if (x0 + tid < cols)\r
- {\r
- if (x0 + tid - d < 0 || d < cth)\r
- val = cdata_weight * cmax_data_term * len;\r
- else\r
- {\r
- const uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid );\r
- const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d);\r
+ volatile float* vdline = smem + winsz * threadIdx.z;\r
\r
- for(int y = 0; y < len; ++y)\r
- {\r
- val += DataCostPerPixel<channels>::compute(lle, lri);\r
+ if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];\r
+ if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];\r
+ if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];\r
+ if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];\r
+ if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];\r
+ if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];\r
\r
- lle += cimg_step;\r
- lri += cimg_step;\r
- }\r
+ T* data_cost = (T*)ctemp + y_out * cmsg_step1 + x_out;\r
+\r
+ if (tid == 0)\r
+ data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);\r
}\r
}\r
\r
- extern __shared__ float smem[];\r
- float* dline = smem + winsz * threadIdx.z;\r
\r
- dline[tid] = val;\r
+ template <typename T>\r
+ void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
- __syncthreads();\r
+ grid.x = divUp(w, threads.x);\r
+ grid.y = divUp(h, threads.y);\r
\r
- if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }\r
- if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } __syncthreads(); }\r
+ switch (channels)\r
+ {\r
+ case 1: init_data_cost<T, 1><<<grid, threads, 0, stream>>>(h, w, level); break;\r
+ case 3: init_data_cost<T, 3><<<grid, threads, 0, stream>>>(h, w, level); break;\r
+ case 4: init_data_cost<T, 4><<<grid, threads, 0, stream>>>(h, w, level); break;\r
+ default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
+ }\r
+ }\r
\r
- volatile float* vdline = smem + winsz * threadIdx.z;\r
+ template <typename T, int winsz>\r
+ void init_data_cost_reduce_caller_(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream)\r
+ {\r
+ const int threadsNum = 256;\r
+ const size_t smem_size = threadsNum * sizeof(float);\r
\r
- if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];\r
- if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];\r
- if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];\r
- if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];\r
- if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];\r
- if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];\r
+ dim3 threads(winsz, 1, threadsNum / winsz);\r
+ dim3 grid(w, h, 1);\r
+ grid.y *= divUp(ndisp, threads.z);\r
\r
- T* data_cost = (T*)ctemp + y_out * cmsg_step1 + x_out;\r
+ switch (channels)\r
+ {\r
+ case 1: init_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;\r
+ case 3: init_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;\r
+ case 4: init_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;\r
+ default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
+ }\r
+ }\r
\r
- if (tid == 0)\r
- data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);\r
- }\r
-}\r
+ template<class T>\r
+ void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,\r
+ int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)\r
+ {\r
\r
+ typedef void (*InitDataCostCaller)(int cols, int rows, int w, int h, int level, int ndisp, int channels, cudaStream_t stream);\r
\r
-template <typename T>\r
-void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
+ static const InitDataCostCaller init_data_cost_callers[] =\r
+ {\r
+ init_data_cost_caller_<T>, init_data_cost_caller_<T>, init_data_cost_reduce_caller_<T, 4>,\r
+ init_data_cost_reduce_caller_<T, 8>, init_data_cost_reduce_caller_<T, 16>, init_data_cost_reduce_caller_<T, 32>,\r
+ init_data_cost_reduce_caller_<T, 64>, init_data_cost_reduce_caller_<T, 128>, init_data_cost_reduce_caller_<T, 256>\r
+ };\r
\r
- grid.x = divUp(w, threads.x);\r
- grid.y = divUp(h, threads.y);\r
+ size_t disp_step = msg_step * h;\r
+ cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );\r
\r
- switch (channels)\r
- {\r
- case 1: init_data_cost<T, 1><<<grid, threads, 0, stream>>>(h, w, level); break;\r
- case 3: init_data_cost<T, 3><<<grid, threads, 0, stream>>>(h, w, level); break;\r
- case 4: init_data_cost<T, 4><<<grid, threads, 0, stream>>>(h, w, level); break;\r
- default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
- }\r
-}\r
-\r
-template <typename T, int winsz>\r
-void init_data_cost_reduce_caller_(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream)\r
-{\r
- const int threadsNum = 256;\r
- const size_t smem_size = threadsNum * sizeof(float);\r
+ init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- dim3 threads(winsz, 1, threadsNum / winsz);\r
- dim3 grid(w, h, 1);\r
- grid.y *= divUp(ndisp, threads.z);\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
- switch (channels)\r
- {\r
- case 1: init_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;\r
- case 3: init_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;\r
- case 4: init_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;\r
- default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
- }\r
-}\r
-\r
-template<class T>\r
-void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,\r
- int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)\r
-{\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
- typedef void (*InitDataCostCaller)(int cols, int rows, int w, int h, int level, int ndisp, int channels, cudaStream_t stream);\r
+ grid.x = divUp(w, threads.x);\r
+ grid.y = divUp(h, threads.y);\r
\r
- static const InitDataCostCaller init_data_cost_callers[] =\r
- {\r
- init_data_cost_caller_<T>, init_data_cost_caller_<T>, init_data_cost_reduce_caller_<T, 4>,\r
- init_data_cost_reduce_caller_<T, 8>, init_data_cost_reduce_caller_<T, 16>, init_data_cost_reduce_caller_<T, 32>,\r
- init_data_cost_reduce_caller_<T, 64>, init_data_cost_reduce_caller_<T, 128>, init_data_cost_reduce_caller_<T, 256>\r
- };\r
+ if (use_local_init_data_cost == true)\r
+ get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);\r
+ else\r
+ get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);\r
+ \r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- size_t disp_step = msg_step * h;\r
- cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );\r
+ template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,\r
+ int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);\r
\r
- init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);\r
- cudaSafeCall( cudaGetLastError() );\r
+ template void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected, size_t msg_step,\r
+ int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ ///////////////////////////////////////////////////////////////\r
+ ////////////////////// compute data cost //////////////////////\r
+ ///////////////////////////////////////////////////////////////\r
\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
+ template <typename T, int channels>\r
+ __global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane)\r
+ {\r
+ int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- grid.x = divUp(w, threads.x);\r
- grid.y = divUp(h, threads.y);\r
+ if (y < h && x < w)\r
+ {\r
+ int y0 = y << level;\r
+ int yt = (y + 1) << level;\r
\r
- if (use_local_init_data_cost == true)\r
- get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);\r
- else\r
- get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);\r
- \r
- cudaSafeCall( cudaGetLastError() );\r
+ int x0 = x << level;\r
+ int xt = (x + 1) << level;\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ const T* selected_disparity = selected_disp_pyr + y/2 * cmsg_step2 + x/2;\r
+ T* data_cost = data_cost_ + y * cmsg_step1 + x;\r
\r
-template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,\r
- int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);\r
+ for(int d = 0; d < nr_plane; d++)\r
+ {\r
+ float val = 0.0f;\r
+ for(int yi = y0; yi < yt; yi++)\r
+ {\r
+ for(int xi = x0; xi < xt; xi++)\r
+ {\r
+ int sel_disp = selected_disparity[d * cdisp_step2];\r
+ int xr = xi - sel_disp;\r
+\r
+ if (xr < 0 || sel_disp < cth)\r
+ val += cdata_weight * cmax_data_term;\r
+ else\r
+ {\r
+ const uchar* left_x = cleft + yi * cimg_step + xi * channels;\r
+ const uchar* right_x = cright + yi * cimg_step + xr * channels;\r
+\r
+ val += DataCostPerPixel<channels>::compute(left_x, right_x);\r
+ }\r
+ }\r
+ }\r
+ data_cost[cdisp_step1 * d] = saturate_cast<T>(val);\r
+ }\r
+ }\r
+ }\r
\r
-template void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected, size_t msg_step,\r
- int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);\r
+ template <typename T, int winsz, int channels>\r
+ __global__ void compute_data_cost_reduce(const T* selected_disp_pyr, T* data_cost_, int level, int rows, int cols, int h, int nr_plane)\r
+ {\r
+ int x_out = blockIdx.x;\r
+ int y_out = blockIdx.y % h;\r
+ int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;\r
\r
-///////////////////////////////////////////////////////////////\r
-////////////////////// compute data cost //////////////////////\r
-///////////////////////////////////////////////////////////////\r
+ int tid = threadIdx.x;\r
\r
-template <typename T, int channels>\r
-__global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane)\r
-{\r
- int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ const T* selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step2 + x_out/2;\r
+ T* data_cost = data_cost_ + y_out * cmsg_step1 + x_out;\r
\r
- if (y < h && x < w)\r
- {\r
- int y0 = y << level;\r
- int yt = (y + 1) << level;\r
+ if (d < nr_plane)\r
+ {\r
+ int sel_disp = selected_disparity[d * cdisp_step2];\r
\r
- int x0 = x << level;\r
- int xt = (x + 1) << level;\r
+ int x0 = x_out << level;\r
+ int y0 = y_out << level;\r
\r
- const T* selected_disparity = selected_disp_pyr + y/2 * cmsg_step2 + x/2;\r
- T* data_cost = data_cost_ + y * cmsg_step1 + x;\r
+ int len = ::min(y0 + winsz, rows) - y0;\r
\r
- for(int d = 0; d < nr_plane; d++)\r
- {\r
- float val = 0.0f;\r
- for(int yi = y0; yi < yt; yi++)\r
- {\r
- for(int xi = x0; xi < xt; xi++)\r
+ float val = 0.0f;\r
+ if (x0 + tid < cols)\r
{\r
- int sel_disp = selected_disparity[d * cdisp_step2];\r
- int xr = xi - sel_disp;\r
-\r
- if (xr < 0 || sel_disp < cth)\r
- val += cdata_weight * cmax_data_term;\r
+ if (x0 + tid - sel_disp < 0 || sel_disp < cth)\r
+ val = cdata_weight * cmax_data_term * len;\r
else\r
{\r
- const uchar* left_x = cleft + yi * cimg_step + xi * channels;\r
- const uchar* right_x = cright + yi * cimg_step + xr * channels;\r
+ const uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid );\r
+ const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp);\r
+\r
+ for(int y = 0; y < len; ++y)\r
+ {\r
+ val += DataCostPerPixel<channels>::compute(lle, lri);\r
\r
- val += DataCostPerPixel<channels>::compute(left_x, right_x);\r
+ lle += cimg_step;\r
+ lri += cimg_step;\r
+ }\r
}\r
}\r
- }\r
- data_cost[cdisp_step1 * d] = saturate_cast<T>(val);\r
- }\r
- }\r
-}\r
\r
-template <typename T, int winsz, int channels>\r
-__global__ void compute_data_cost_reduce(const T* selected_disp_pyr, T* data_cost_, int level, int rows, int cols, int h, int nr_plane)\r
-{\r
- int x_out = blockIdx.x;\r
- int y_out = blockIdx.y % h;\r
- int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;\r
+ extern __shared__ float smem[];\r
+ float* dline = smem + winsz * threadIdx.z;\r
\r
- int tid = threadIdx.x;\r
+ dline[tid] = val;\r
\r
- const T* selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step2 + x_out/2;\r
- T* data_cost = data_cost_ + y_out * cmsg_step1 + x_out;\r
+ __syncthreads();\r
\r
- if (d < nr_plane)\r
- {\r
- int sel_disp = selected_disparity[d * cdisp_step2];\r
+ if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }\r
+ if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } __syncthreads(); }\r
\r
- int x0 = x_out << level;\r
- int y0 = y_out << level;\r
+ volatile float* vdline = smem + winsz * threadIdx.z;\r
\r
- int len = ::min(y0 + winsz, rows) - y0;\r
+ if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];\r
+ if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];\r
+ if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];\r
+ if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];\r
+ if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];\r
+ if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];\r
+\r
+ if (tid == 0)\r
+ data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);\r
+ }\r
+ }\r
\r
- float val = 0.0f;\r
- if (x0 + tid < cols)\r
+ template <typename T>\r
+ void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/,\r
+ int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)\r
{\r
- if (x0 + tid - sel_disp < 0 || sel_disp < cth)\r
- val = cdata_weight * cmax_data_term * len;\r
- else\r
- {\r
- const uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid );\r
- const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp);\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
- for(int y = 0; y < len; ++y)\r
- {\r
- val += DataCostPerPixel<channels>::compute(lle, lri);\r
+ grid.x = divUp(w, threads.x);\r
+ grid.y = divUp(h, threads.y);\r
\r
- lle += cimg_step;\r
- lri += cimg_step;\r
- }\r
+ switch(channels)\r
+ {\r
+ case 1: compute_data_cost<T, 1><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;\r
+ case 3: compute_data_cost<T, 3><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;\r
+ case 4: compute_data_cost<T, 4><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;\r
+ default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
}\r
}\r
\r
- extern __shared__ float smem[];\r
- float* dline = smem + winsz * threadIdx.z;\r
-\r
- dline[tid] = val;\r
+ template <typename T, int winsz>\r
+ void compute_data_cost_reduce_caller_(const T* disp_selected_pyr, T* data_cost, int rows, int cols,\r
+ int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)\r
+ {\r
+ const int threadsNum = 256;\r
+ const size_t smem_size = threadsNum * sizeof(float);\r
\r
- __syncthreads();\r
+ dim3 threads(winsz, 1, threadsNum / winsz);\r
+ dim3 grid(w, h, 1);\r
+ grid.y *= divUp(nr_plane, threads.z);\r
\r
- if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }\r
- if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } __syncthreads(); }\r
+ switch (channels)\r
+ {\r
+ case 1: compute_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;\r
+ case 3: compute_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;\r
+ case 4: compute_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;\r
+ default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
+ }\r
+ }\r
\r
- volatile float* vdline = smem + winsz * threadIdx.z;\r
+ template<class T>\r
+ void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2,\r
+ int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)\r
+ {\r
+ typedef void (*ComputeDataCostCaller)(const T* disp_selected_pyr, T* data_cost, int rows, int cols,\r
+ int h, int w, int level, int nr_plane, int channels, cudaStream_t stream);\r
\r
- if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];\r
- if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];\r
- if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];\r
- if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];\r
- if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];\r
- if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];\r
+ static const ComputeDataCostCaller callers[] =\r
+ {\r
+ compute_data_cost_caller_<T>, compute_data_cost_caller_<T>, compute_data_cost_reduce_caller_<T, 4>,\r
+ compute_data_cost_reduce_caller_<T, 8>, compute_data_cost_reduce_caller_<T, 16>, compute_data_cost_reduce_caller_<T, 32>,\r
+ compute_data_cost_reduce_caller_<T, 64>, compute_data_cost_reduce_caller_<T, 128>, compute_data_cost_reduce_caller_<T, 256>\r
+ };\r
+\r
+ size_t disp_step1 = msg_step1 * h;\r
+ size_t disp_step2 = msg_step2 * h2;\r
+ cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step1, sizeof(size_t)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2, &msg_step2, sizeof(size_t)) );\r
+\r
+ callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- if (tid == 0)\r
- data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);\r
- }\r
-}\r
+ template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2,\r
+ int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);\r
\r
-template <typename T>\r
-void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/,\r
- int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)\r
-{\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
+ template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step1, size_t msg_step2,\r
+ int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);\r
+ \r
\r
- grid.x = divUp(w, threads.x);\r
- grid.y = divUp(h, threads.y);\r
+ ///////////////////////////////////////////////////////////////\r
+ //////////////////////// init message /////////////////////////\r
+ ///////////////////////////////////////////////////////////////\r
\r
- switch(channels)\r
- {\r
- case 1: compute_data_cost<T, 1><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;\r
- case 3: compute_data_cost<T, 3><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;\r
- case 4: compute_data_cost<T, 4><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;\r
- default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
- }\r
-}\r
-\r
-template <typename T, int winsz>\r
-void compute_data_cost_reduce_caller_(const T* disp_selected_pyr, T* data_cost, int rows, int cols,\r
- int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)\r
-{\r
- const int threadsNum = 256;\r
- const size_t smem_size = threadsNum * sizeof(float);\r
+ \r
+ template <typename T>\r
+ __device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new,\r
+ const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,\r
+ T* data_cost_selected, T* disparity_selected_new, T* data_cost_new,\r
+ const T* data_cost_cur, const T* disparity_selected_cur,\r
+ int nr_plane, int nr_plane2)\r
+ {\r
+ for(int i = 0; i < nr_plane; i++)\r
+ {\r
+ T minimum = numeric_limits<T>::max();\r
+ int id = 0;\r
+ for(int j = 0; j < nr_plane2; j++)\r
+ {\r
+ T cur = data_cost_new[j * cdisp_step1];\r
+ if(cur < minimum)\r
+ {\r
+ minimum = cur;\r
+ id = j;\r
+ }\r
+ }\r
\r
- dim3 threads(winsz, 1, threadsNum / winsz);\r
- dim3 grid(w, h, 1);\r
- grid.y *= divUp(nr_plane, threads.z);\r
+ data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1];\r
+ disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];\r
\r
- switch (channels)\r
- {\r
- case 1: compute_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;\r
- case 3: compute_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;\r
- case 4: compute_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;\r
- default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
- }\r
-}\r
-\r
-template<class T>\r
-void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2,\r
- int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)\r
-{\r
- typedef void (*ComputeDataCostCaller)(const T* disp_selected_pyr, T* data_cost, int rows, int cols,\r
- int h, int w, int level, int nr_plane, int channels, cudaStream_t stream);\r
+ u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];\r
+ d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];\r
+ l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];\r
+ r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];\r
\r
- static const ComputeDataCostCaller callers[] =\r
- {\r
- compute_data_cost_caller_<T>, compute_data_cost_caller_<T>, compute_data_cost_reduce_caller_<T, 4>,\r
- compute_data_cost_reduce_caller_<T, 8>, compute_data_cost_reduce_caller_<T, 16>, compute_data_cost_reduce_caller_<T, 32>,\r
- compute_data_cost_reduce_caller_<T, 64>, compute_data_cost_reduce_caller_<T, 128>, compute_data_cost_reduce_caller_<T, 256>\r
- };\r
-\r
- size_t disp_step1 = msg_step1 * h;\r
- size_t disp_step2 = msg_step2 * h2;\r
- cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step1, sizeof(size_t)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2, &msg_step2, sizeof(size_t)) );\r
-\r
- callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
-template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2,\r
- int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);\r
-\r
-template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step1, size_t msg_step2,\r
- int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);\r
- \r
-\r
-///////////////////////////////////////////////////////////////\r
-//////////////////////// init message /////////////////////////\r
-///////////////////////////////////////////////////////////////\r
-\r
- \r
- template <typename T>\r
-__device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new,\r
- const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,\r
- T* data_cost_selected, T* disparity_selected_new, T* data_cost_new,\r
- const T* data_cost_cur, const T* disparity_selected_cur,\r
- int nr_plane, int nr_plane2)\r
-{\r
- for(int i = 0; i < nr_plane; i++)\r
- {\r
- T minimum = numeric_limits<T>::max();\r
- int id = 0;\r
- for(int j = 0; j < nr_plane2; j++)\r
- {\r
- T cur = data_cost_new[j * cdisp_step1];\r
- if(cur < minimum)\r
- {\r
- minimum = cur;\r
- id = j;\r
+ data_cost_new[id * cdisp_step1] = numeric_limits<T>::max();\r
}\r
}\r
\r
- data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1];\r
- disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];\r
+ template <typename T>\r
+ __global__ void init_message(T* u_new_, T* d_new_, T* l_new_, T* r_new_,\r
+ const T* u_cur_, const T* d_cur_, const T* l_cur_, const T* r_cur_,\r
+ T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,\r
+ T* data_cost_selected_, const T* data_cost_,\r
+ int h, int w, int nr_plane, int h2, int w2, int nr_plane2)\r
+ {\r
+ int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if (y < h && x < w)\r
+ {\r
+ const T* u_cur = u_cur_ + ::min(h2-1, y/2 + 1) * cmsg_step2 + x/2;\r
+ const T* d_cur = d_cur_ + ::max(0, y/2 - 1) * cmsg_step2 + x/2;\r
+ const T* l_cur = l_cur_ + y/2 * cmsg_step2 + ::min(w2-1, x/2 + 1);\r
+ const T* r_cur = r_cur_ + y/2 * cmsg_step2 + ::max(0, x/2 - 1);\r
\r
- u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];\r
- d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];\r
- l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];\r
- r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];\r
+ T* data_cost_new = (T*)ctemp + y * cmsg_step1 + x;\r
\r
- data_cost_new[id * cdisp_step1] = numeric_limits<T>::max();\r
- }\r
-}\r
+ const T* disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step2 + x/2;\r
+ const T* data_cost = data_cost_ + y * cmsg_step1 + x;\r
\r
-template <typename T>\r
-__global__ void init_message(T* u_new_, T* d_new_, T* l_new_, T* r_new_,\r
- const T* u_cur_, const T* d_cur_, const T* l_cur_, const T* r_cur_,\r
- T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,\r
- T* data_cost_selected_, const T* data_cost_,\r
- int h, int w, int nr_plane, int h2, int w2, int nr_plane2)\r
-{\r
- int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ for(int d = 0; d < nr_plane2; d++)\r
+ {\r
+ int idx2 = d * cdisp_step2;\r
\r
- if (y < h && x < w)\r
- {\r
- const T* u_cur = u_cur_ + ::min(h2-1, y/2 + 1) * cmsg_step2 + x/2;\r
- const T* d_cur = d_cur_ + ::max(0, y/2 - 1) * cmsg_step2 + x/2;\r
- const T* l_cur = l_cur_ + y/2 * cmsg_step2 + ::min(w2-1, x/2 + 1);\r
- const T* r_cur = r_cur_ + y/2 * cmsg_step2 + ::max(0, x/2 - 1);\r
+ T val = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];\r
+ data_cost_new[d * cdisp_step1] = val;\r
+ }\r
\r
- T* data_cost_new = (T*)ctemp + y * cmsg_step1 + x;\r
+ T* data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;\r
+ T* disparity_selected_new = selected_disp_pyr_new + y * cmsg_step1 + x;\r
\r
- const T* disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step2 + x/2;\r
- const T* data_cost = data_cost_ + y * cmsg_step1 + x;\r
+ T* u_new = u_new_ + y * cmsg_step1 + x;\r
+ T* d_new = d_new_ + y * cmsg_step1 + x;\r
+ T* l_new = l_new_ + y * cmsg_step1 + x;\r
+ T* r_new = r_new_ + y * cmsg_step1 + x;\r
\r
- for(int d = 0; d < nr_plane2; d++)\r
- {\r
- int idx2 = d * cdisp_step2;\r
+ u_cur = u_cur_ + y/2 * cmsg_step2 + x/2;\r
+ d_cur = d_cur_ + y/2 * cmsg_step2 + x/2;\r
+ l_cur = l_cur_ + y/2 * cmsg_step2 + x/2;\r
+ r_cur = r_cur_ + y/2 * cmsg_step2 + x/2;\r
\r
- T val = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];\r
- data_cost_new[d * cdisp_step1] = val;\r
+ get_first_k_element_increase(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur,\r
+ data_cost_selected, disparity_selected_new, data_cost_new,\r
+ data_cost, disparity_selected_cur, nr_plane, nr_plane2);\r
+ }\r
}\r
\r
- T* data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;\r
- T* disparity_selected_new = selected_disp_pyr_new + y * cmsg_step1 + x;\r
\r
- T* u_new = u_new_ + y * cmsg_step1 + x;\r
- T* d_new = d_new_ + y * cmsg_step1 + x;\r
- T* l_new = l_new_ + y * cmsg_step1 + x;\r
- T* r_new = r_new_ + y * cmsg_step1 + x;\r
+ template<class T>\r
+ void init_message(T* u_new, T* d_new, T* l_new, T* r_new,\r
+ const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,\r
+ T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,\r
+ T* data_cost_selected, const T* data_cost, size_t msg_step1, size_t msg_step2,\r
+ int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream)\r
+ {\r
\r
- u_cur = u_cur_ + y/2 * cmsg_step2 + x/2;\r
- d_cur = d_cur_ + y/2 * cmsg_step2 + x/2;\r
- l_cur = l_cur_ + y/2 * cmsg_step2 + x/2;\r
- r_cur = r_cur_ + y/2 * cmsg_step2 + x/2;\r
+ size_t disp_step1 = msg_step1 * h;\r
+ size_t disp_step2 = msg_step2 * h2;\r
+ cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step1, sizeof(size_t)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2, &msg_step2, sizeof(size_t)) );\r
\r
- get_first_k_element_increase(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur,\r
- data_cost_selected, disparity_selected_new, data_cost_new,\r
- data_cost, disparity_selected_cur, nr_plane, nr_plane2);\r
- }\r
-}\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
+ grid.x = divUp(w, threads.x);\r
+ grid.y = divUp(h, threads.y);\r
\r
-template<class T>\r
-void init_message(T* u_new, T* d_new, T* l_new, T* r_new,\r
- const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,\r
- T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,\r
- T* data_cost_selected, const T* data_cost, size_t msg_step1, size_t msg_step2,\r
- int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream)\r
-{\r
+ init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,\r
+ u_cur, d_cur, l_cur, r_cur,\r
+ selected_disp_pyr_new, selected_disp_pyr_cur,\r
+ data_cost_selected, data_cost,\r
+ h, w, nr_plane, h2, w2, nr_plane2);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- size_t disp_step1 = msg_step1 * h;\r
- size_t disp_step2 = msg_step2 * h2;\r
- cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step1, sizeof(size_t)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2, &msg_step2, sizeof(size_t)) );\r
-\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
-\r
- grid.x = divUp(w, threads.x);\r
- grid.y = divUp(h, threads.y);\r
-\r
- init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,\r
- u_cur, d_cur, l_cur, r_cur,\r
- selected_disp_pyr_new, selected_disp_pyr_cur,\r
- data_cost_selected, data_cost,\r
- h, w, nr_plane, h2, w2, nr_plane2);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
-\r
-template void init_message(short* u_new, short* d_new, short* l_new, short* r_new,\r
- const short* u_cur, const short* d_cur, const short* l_cur, const short* r_cur,\r
- short* selected_disp_pyr_new, const short* selected_disp_pyr_cur,\r
- short* data_cost_selected, const short* data_cost, size_t msg_step1, size_t msg_step2,\r
- int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);\r
-\r
-template void init_message(float* u_new, float* d_new, float* l_new, float* r_new,\r
- const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,\r
- float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,\r
- float* data_cost_selected, const float* data_cost, size_t msg_step1, size_t msg_step2,\r
- int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream); \r
-\r
-///////////////////////////////////////////////////////////////\r
-//////////////////// calc all iterations /////////////////////\r
-///////////////////////////////////////////////////////////////\r
-\r
-template <typename T>\r
-__device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,\r
- const T* dst_disp, const T* src_disp, int nr_plane, T* temp)\r
-{\r
- T minimum = numeric_limits<T>::max();\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- for(int d = 0; d < nr_plane; d++)\r
- {\r
- int idx = d * cdisp_step1;\r
- T val = data[idx] + msg1[idx] + msg2[idx] + msg3[idx];\r
\r
- if(val < minimum)\r
- minimum = val;\r
+ template void init_message(short* u_new, short* d_new, short* l_new, short* r_new,\r
+ const short* u_cur, const short* d_cur, const short* l_cur, const short* r_cur,\r
+ short* selected_disp_pyr_new, const short* selected_disp_pyr_cur,\r
+ short* data_cost_selected, const short* data_cost, size_t msg_step1, size_t msg_step2,\r
+ int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);\r
\r
- msg_dst[idx] = val;\r
- }\r
+ template void init_message(float* u_new, float* d_new, float* l_new, float* r_new,\r
+ const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,\r
+ float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,\r
+ float* data_cost_selected, const float* data_cost, size_t msg_step1, size_t msg_step2,\r
+ int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream); \r
\r
- float sum = 0;\r
- for(int d = 0; d < nr_plane; d++)\r
- {\r
- float cost_min = minimum + cmax_disc_term;\r
- T src_disp_reg = src_disp[d * cdisp_step1];\r
+ ///////////////////////////////////////////////////////////////\r
+ //////////////////// calc all iterations /////////////////////\r
+ ///////////////////////////////////////////////////////////////\r
\r
- for(int d2 = 0; d2 < nr_plane; d2++)\r
- cost_min = fmin(cost_min, msg_dst[d2 * cdisp_step1] + cdisc_single_jump * ::abs(dst_disp[d2 * cdisp_step1] - src_disp_reg));\r
+ template <typename T>\r
+ __device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,\r
+ const T* dst_disp, const T* src_disp, int nr_plane, T* temp)\r
+ {\r
+ T minimum = numeric_limits<T>::max();\r
\r
- temp[d * cdisp_step1] = saturate_cast<T>(cost_min);\r
- sum += cost_min;\r
- }\r
- sum /= nr_plane;\r
+ for(int d = 0; d < nr_plane; d++)\r
+ {\r
+ int idx = d * cdisp_step1;\r
+ T val = data[idx] + msg1[idx] + msg2[idx] + msg3[idx];\r
\r
- for(int d = 0; d < nr_plane; d++)\r
- msg_dst[d * cdisp_step1] = saturate_cast<T>(temp[d * cdisp_step1] - sum);\r
-}\r
+ if(val < minimum)\r
+ minimum = val;\r
\r
-template <typename T>\r
-__global__ void compute_message(T* u_, T* d_, T* l_, T* r_, const T* data_cost_selected, const T* selected_disp_pyr_cur, int h, int w, int nr_plane, int i)\r
-{\r
- int y = blockIdx.y * blockDim.y + threadIdx.y;\r
- int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + i) & 1);\r
+ msg_dst[idx] = val;\r
+ }\r
\r
- if (y > 0 && y < h - 1 && x > 0 && x < w - 1)\r
- {\r
- const T* data = data_cost_selected + y * cmsg_step1 + x;\r
+ float sum = 0;\r
+ for(int d = 0; d < nr_plane; d++)\r
+ {\r
+ float cost_min = minimum + cmax_disc_term;\r
+ T src_disp_reg = src_disp[d * cdisp_step1];\r
+\r
+ for(int d2 = 0; d2 < nr_plane; d2++)\r
+ cost_min = fmin(cost_min, msg_dst[d2 * cdisp_step1] + cdisc_single_jump * ::abs(dst_disp[d2 * cdisp_step1] - src_disp_reg));\r
+\r
+ temp[d * cdisp_step1] = saturate_cast<T>(cost_min);\r
+ sum += cost_min;\r
+ }\r
+ sum /= nr_plane;\r
\r
- T* u = u_ + y * cmsg_step1 + x;\r
- T* d = d_ + y * cmsg_step1 + x;\r
- T* l = l_ + y * cmsg_step1 + x;\r
- T* r = r_ + y * cmsg_step1 + x;\r
+ for(int d = 0; d < nr_plane; d++)\r
+ msg_dst[d * cdisp_step1] = saturate_cast<T>(temp[d * cdisp_step1] - sum);\r
+ }\r
\r
- const T* disp = selected_disp_pyr_cur + y * cmsg_step1 + x;\r
+ template <typename T>\r
+ __global__ void compute_message(T* u_, T* d_, T* l_, T* r_, const T* data_cost_selected, const T* selected_disp_pyr_cur, int h, int w, int nr_plane, int i)\r
+ {\r
+ int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + i) & 1);\r
\r
- T* temp = (T*)ctemp + y * cmsg_step1 + x;\r
+ if (y > 0 && y < h - 1 && x > 0 && x < w - 1)\r
+ {\r
+ const T* data = data_cost_selected + y * cmsg_step1 + x;\r
\r
- message_per_pixel(data, u, r - 1, u + cmsg_step1, l + 1, disp, disp - cmsg_step1, nr_plane, temp);\r
- message_per_pixel(data, d, d - cmsg_step1, r - 1, l + 1, disp, disp + cmsg_step1, nr_plane, temp);\r
- message_per_pixel(data, l, u + cmsg_step1, d - cmsg_step1, l + 1, disp, disp - 1, nr_plane, temp);\r
- message_per_pixel(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp);\r
- }\r
-}\r
+ T* u = u_ + y * cmsg_step1 + x;\r
+ T* d = d_ + y * cmsg_step1 + x;\r
+ T* l = l_ + y * cmsg_step1 + x;\r
+ T* r = r_ + y * cmsg_step1 + x;\r
\r
+ const T* disp = selected_disp_pyr_cur + y * cmsg_step1 + x;\r
\r
-template<class T>\r
-void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,\r
- const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)\r
-{\r
- size_t disp_step = msg_step * h;\r
- cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );\r
+ T* temp = (T*)ctemp + y * cmsg_step1 + x;\r
\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
+ message_per_pixel(data, u, r - 1, u + cmsg_step1, l + 1, disp, disp - cmsg_step1, nr_plane, temp);\r
+ message_per_pixel(data, d, d - cmsg_step1, r - 1, l + 1, disp, disp + cmsg_step1, nr_plane, temp);\r
+ message_per_pixel(data, l, u + cmsg_step1, d - cmsg_step1, l + 1, disp, disp - 1, nr_plane, temp);\r
+ message_per_pixel(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp);\r
+ }\r
+ }\r
\r
- grid.x = divUp(w, threads.x << 1);\r
- grid.y = divUp(h, threads.y);\r
\r
- for(int t = 0; t < iters; ++t)\r
- {\r
- compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);\r
- cudaSafeCall( cudaGetLastError() );\r
+ template<class T>\r
+ void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,\r
+ const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)\r
+ {\r
+ size_t disp_step = msg_step * h;\r
+ cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
- }\r
-};\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
-template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,\r
- int h, int w, int nr_plane, int iters, cudaStream_t stream);\r
+ grid.x = divUp(w, threads.x << 1);\r
+ grid.y = divUp(h, threads.y);\r
\r
-template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step, \r
- int h, int w, int nr_plane, int iters, cudaStream_t stream);\r
+ for(int t = 0; t < iters; ++t)\r
+ {\r
+ compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+ };\r
\r
-///////////////////////////////////////////////////////////////\r
-/////////////////////////// output ////////////////////////////\r
-///////////////////////////////////////////////////////////////\r
+ template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,\r
+ int h, int w, int nr_plane, int iters, cudaStream_t stream);\r
\r
+ template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step, \r
+ int h, int w, int nr_plane, int iters, cudaStream_t stream);\r
\r
-template <typename T>\r
-__global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_,\r
- const T* data_cost_selected, const T* disp_selected_pyr,\r
- short* disp, size_t res_step, int cols, int rows, int nr_plane)\r
-{\r
- int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- if (y > 0 && y < rows - 1 && x > 0 && x < cols - 1)\r
- {\r
- const T* data = data_cost_selected + y * cmsg_step1 + x;\r
- const T* disp_selected = disp_selected_pyr + y * cmsg_step1 + x;\r
+ ///////////////////////////////////////////////////////////////\r
+ /////////////////////////// output ////////////////////////////\r
+ ///////////////////////////////////////////////////////////////\r
\r
- const T* u = u_ + (y+1) * cmsg_step1 + (x+0);\r
- const T* d = d_ + (y-1) * cmsg_step1 + (x+0);\r
- const T* l = l_ + (y+0) * cmsg_step1 + (x+1);\r
- const T* r = r_ + (y+0) * cmsg_step1 + (x-1);\r
\r
- int best = 0;\r
- T best_val = numeric_limits<T>::max();\r
- for (int i = 0; i < nr_plane; ++i)\r
+ template <typename T>\r
+ __global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_,\r
+ const T* data_cost_selected, const T* disp_selected_pyr,\r
+ short* disp, size_t res_step, int cols, int rows, int nr_plane)\r
{\r
- int idx = i * cdisp_step1;\r
- T val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx];\r
+ int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ int y = blockIdx.y * blockDim.y + threadIdx.y;\r
\r
- if (val < best_val)\r
+ if (y > 0 && y < rows - 1 && x > 0 && x < cols - 1)\r
{\r
- best_val = val;\r
- best = saturate_cast<short>(disp_selected[idx]);\r
- }\r
- }\r
- disp[res_step * y + x] = best;\r
- }\r
-}\r
+ const T* data = data_cost_selected + y * cmsg_step1 + x;\r
+ const T* disp_selected = disp_selected_pyr + y * cmsg_step1 + x;\r
\r
-template<class T>\r
-void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,\r
- const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream)\r
-{\r
- size_t disp_step = disp.rows * msg_step;\r
- cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );\r
+ const T* u = u_ + (y+1) * cmsg_step1 + (x+0);\r
+ const T* d = d_ + (y-1) * cmsg_step1 + (x+0);\r
+ const T* l = l_ + (y+0) * cmsg_step1 + (x+1);\r
+ const T* r = r_ + (y+0) * cmsg_step1 + (x-1);\r
\r
- dim3 threads(32, 8, 1);\r
- dim3 grid(1, 1, 1);\r
+ int best = 0;\r
+ T best_val = numeric_limits<T>::max();\r
+ for (int i = 0; i < nr_plane; ++i)\r
+ {\r
+ int idx = i * cdisp_step1;\r
+ T val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx];\r
+\r
+ if (val < best_val)\r
+ {\r
+ best_val = val;\r
+ best = saturate_cast<short>(disp_selected[idx]);\r
+ }\r
+ }\r
+ disp[res_step * y + x] = best;\r
+ }\r
+ }\r
\r
- grid.x = divUp(disp.cols, threads.x);\r
- grid.y = divUp(disp.rows, threads.y);\r
+ template<class T>\r
+ void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,\r
+ const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream)\r
+ {\r
+ size_t disp_step = disp.rows * msg_step;\r
+ cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );\r
\r
- compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected,\r
- disp.data, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);\r
- cudaSafeCall( cudaGetLastError() );\r
+ dim3 threads(32, 8, 1);\r
+ dim3 grid(1, 1, 1);\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ grid.x = divUp(disp.cols, threads.x);\r
+ grid.y = divUp(disp.rows, threads.y);\r
\r
-template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step, \r
- const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);\r
+ compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected,\r
+ disp.data, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,\r
- const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-} // namespace stereocsbp\r
+ template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step, \r
+ const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,\r
+ const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);\r
+ } // namespace stereocsbp\r
+}}} // namespace cv { namespace gpu { namespace device {\r
#include "opencv2/gpu/device/functional.hpp"\r
#include "opencv2/gpu/device/filters.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace surf {\r
-\r
-////////////////////////////////////////////////////////////////////////\r
-// Global parameters\r
-\r
-// The maximum number of features (before subpixel interpolation) that memory is reserved for.\r
-__constant__ int c_max_candidates;\r
-// The maximum number of features that memory is reserved for.\r
-__constant__ int c_max_features;\r
-// The image size.\r
-__constant__ int c_img_rows;\r
-__constant__ int c_img_cols;\r
-// The number of layers.\r
-__constant__ int c_nOctaveLayers;\r
-// The hessian threshold.\r
-__constant__ float c_hessianThreshold;\r
-\r
-// The current octave.\r
-__constant__ int c_octave;\r
-// The current layer size.\r
-__constant__ int c_layer_rows;\r
-__constant__ int c_layer_cols;\r
-\r
-void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold)\r
+namespace cv { namespace gpu { namespace device \r
{\r
- cudaSafeCall( cudaMemcpyToSymbol(c_max_candidates, &maxCandidates, sizeof(maxCandidates)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(c_max_features, &maxFeatures, sizeof(maxFeatures)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(c_img_rows, &img_rows, sizeof(img_rows)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(c_img_cols, &img_cols, sizeof(img_cols)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(c_nOctaveLayers, &nOctaveLayers, sizeof(nOctaveLayers)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(c_hessianThreshold, &hessianThreshold, sizeof(hessianThreshold)) );\r
-}\r
-\r
-void loadOctaveConstants(int octave, int layer_rows, int layer_cols)\r
-{\r
- cudaSafeCall( cudaMemcpyToSymbol(c_octave, &octave, sizeof(octave)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(c_layer_rows, &layer_rows, sizeof(layer_rows)) );\r
- cudaSafeCall( cudaMemcpyToSymbol(c_layer_cols, &layer_cols, sizeof(layer_cols)) );\r
-}\r
-\r
-////////////////////////////////////////////////////////////////////////\r
-// Integral image texture\r
+ namespace surf \r
+ {\r
+ ////////////////////////////////////////////////////////////////////////\r
+ // Global parameters\r
+\r
+ // The maximum number of features (before subpixel interpolation) that memory is reserved for.\r
+ __constant__ int c_max_candidates;\r
+ // The maximum number of features that memory is reserved for.\r
+ __constant__ int c_max_features;\r
+ // The image size.\r
+ __constant__ int c_img_rows;\r
+ __constant__ int c_img_cols;\r
+ // The number of layers.\r
+ __constant__ int c_nOctaveLayers;\r
+ // The hessian threshold.\r
+ __constant__ float c_hessianThreshold;\r
+\r
+ // The current octave.\r
+ __constant__ int c_octave;\r
+ // The current layer size.\r
+ __constant__ int c_layer_rows;\r
+ __constant__ int c_layer_cols;\r
+\r
+ void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(c_max_candidates, &maxCandidates, sizeof(maxCandidates)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(c_max_features, &maxFeatures, sizeof(maxFeatures)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(c_img_rows, &img_rows, sizeof(img_rows)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(c_img_cols, &img_cols, sizeof(img_cols)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(c_nOctaveLayers, &nOctaveLayers, sizeof(nOctaveLayers)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(c_hessianThreshold, &hessianThreshold, sizeof(hessianThreshold)) );\r
+ }\r
\r
-texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
-texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
-texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
+ void loadOctaveConstants(int octave, int layer_rows, int layer_cols)\r
+ {\r
+ cudaSafeCall( cudaMemcpyToSymbol(c_octave, &octave, sizeof(octave)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(c_layer_rows, &layer_rows, sizeof(layer_rows)) );\r
+ cudaSafeCall( cudaMemcpyToSymbol(c_layer_cols, &layer_cols, sizeof(layer_cols)) );\r
+ }\r
\r
-void bindImgTex(DevMem2Db img)\r
-{\r
- bindTexture(&imgTex, img);\r
-}\r
-void bindSumTex(DevMem2D_<uint> sum)\r
-{\r
- bindTexture(&sumTex, sum);\r
-}\r
-void bindMaskSumTex(DevMem2D_<uint> maskSum)\r
-{\r
- bindTexture(&maskSumTex, maskSum);\r
-}\r
+ ////////////////////////////////////////////////////////////////////////\r
+ // Integral image texture\r
\r
-template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)\r
-{\r
-#if __CUDA_ARCH__ >= 200\r
- typedef double real_t; \r
-#else\r
- typedef float real_t;\r
-#endif\r
-\r
- float ratio = (float)newSize / oldSize;\r
- \r
- real_t d = 0;\r
-\r
- #pragma unroll\r
- for (int k = 0; k < N; ++k)\r
- {\r
- int dx1 = __float2int_rn(ratio * src[k][0]);\r
- int dy1 = __float2int_rn(ratio * src[k][1]);\r
- int dx2 = __float2int_rn(ratio * src[k][2]);\r
- int dy2 = __float2int_rn(ratio * src[k][3]);\r
+ texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
+ texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
+ texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
\r
- real_t t = 0;\r
- t += tex2D(sumTex, x + dx1, y + dy1);\r
- t -= tex2D(sumTex, x + dx1, y + dy2);\r
- t -= tex2D(sumTex, x + dx2, y + dy1);\r
- t += tex2D(sumTex, x + dx2, y + dy2);\r
+ void bindImgTex(DevMem2Db img)\r
+ {\r
+ bindTexture(&imgTex, img);\r
+ }\r
+ void bindSumTex(DevMem2D_<uint> sum)\r
+ {\r
+ bindTexture(&sumTex, sum);\r
+ }\r
+ void bindMaskSumTex(DevMem2D_<uint> maskSum)\r
+ {\r
+ bindTexture(&maskSumTex, maskSum);\r
+ }\r
\r
- d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));\r
- }\r
+ template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)\r
+ {\r
+ #if __CUDA_ARCH__ >= 200\r
+ typedef double real_t; \r
+ #else\r
+ typedef float real_t;\r
+ #endif\r
\r
- return (float)d;\r
-}\r
+ float ratio = (float)newSize / oldSize;\r
+ \r
+ real_t d = 0;\r
\r
-////////////////////////////////////////////////////////////////////////\r
-// Hessian\r
+ #pragma unroll\r
+ for (int k = 0; k < N; ++k)\r
+ {\r
+ int dx1 = __float2int_rn(ratio * src[k][0]);\r
+ int dy1 = __float2int_rn(ratio * src[k][1]);\r
+ int dx2 = __float2int_rn(ratio * src[k][2]);\r
+ int dy2 = __float2int_rn(ratio * src[k][3]);\r
+\r
+ real_t t = 0;\r
+ t += tex2D(sumTex, x + dx1, y + dy1);\r
+ t -= tex2D(sumTex, x + dx1, y + dy2);\r
+ t -= tex2D(sumTex, x + dx2, y + dy1);\r
+ t += tex2D(sumTex, x + dx2, y + dy2);\r
+\r
+ d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));\r
+ }\r
\r
-__constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };\r
-__constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };\r
-__constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };\r
+ return (float)d;\r
+ }\r
\r
-__host__ __device__ __forceinline__ int calcSize(int octave, int layer)\r
-{\r
- /* Wavelet size at first layer of first octave. */\r
- const int HAAR_SIZE0 = 9;\r
+ ////////////////////////////////////////////////////////////////////////\r
+ // Hessian\r
\r
- /* Wavelet size increment between layers. This should be an even number,\r
- such that the wavelet sizes in an octave are either all even or all odd.\r
- This ensures that when looking for the neighbours of a sample, the layers\r
- above and below are aligned correctly. */\r
- const int HAAR_SIZE_INC = 6;\r
+ __constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };\r
+ __constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };\r
+ __constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };\r
\r
- return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;\r
-}\r
+ __host__ __device__ __forceinline__ int calcSize(int octave, int layer)\r
+ {\r
+ /* Wavelet size at first layer of first octave. */\r
+ const int HAAR_SIZE0 = 9;\r
\r
-__global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)\r
-{\r
- // Determine the indices\r
- const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);\r
- const int blockIdx_y = blockIdx.y % gridDim_y;\r
- const int blockIdx_z = blockIdx.y / gridDim_y;\r
+ /* Wavelet size increment between layers. This should be an even number,\r
+ such that the wavelet sizes in an octave are either all even or all odd.\r
+ This ensures that when looking for the neighbours of a sample, the layers\r
+ above and below are aligned correctly. */\r
+ const int HAAR_SIZE_INC = 6;\r
\r
- const int j = threadIdx.x + blockIdx.x * blockDim.x;\r
- const int i = threadIdx.y + blockIdx_y * blockDim.y;\r
- const int layer = blockIdx_z;\r
+ return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;\r
+ }\r
\r
- const int size = calcSize(c_octave, layer);\r
+ __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)\r
+ {\r
+ // Determine the indices\r
+ const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);\r
+ const int blockIdx_y = blockIdx.y % gridDim_y;\r
+ const int blockIdx_z = blockIdx.y / gridDim_y;\r
\r
- const int samples_i = 1 + ((c_img_rows - size) >> c_octave);\r
- const int samples_j = 1 + ((c_img_cols - size) >> c_octave);\r
+ const int j = threadIdx.x + blockIdx.x * blockDim.x;\r
+ const int i = threadIdx.y + blockIdx_y * blockDim.y;\r
+ const int layer = blockIdx_z;\r
\r
- // Ignore pixels where some of the kernel is outside the image\r
- const int margin = (size >> 1) >> c_octave;\r
+ const int size = calcSize(c_octave, layer);\r
\r
- if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)\r
- {\r
- const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, i << c_octave, j << c_octave);\r
- const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, i << c_octave, j << c_octave);\r
- const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, i << c_octave, j << c_octave);\r
+ const int samples_i = 1 + ((c_img_rows - size) >> c_octave);\r
+ const int samples_j = 1 + ((c_img_cols - size) >> c_octave);\r
\r
- det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;\r
- trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;\r
- }\r
-}\r
+ // Ignore pixels where some of the kernel is outside the image\r
+ const int margin = (size >> 1) >> c_octave;\r
\r
-void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers)\r
-{\r
- const int min_size = calcSize(octave, 0);\r
- const int max_samples_i = 1 + ((img_rows - min_size) >> octave);\r
- const int max_samples_j = 1 + ((img_cols - min_size) >> octave);\r
+ if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)\r
+ {\r
+ const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, i << c_octave, j << c_octave);\r
+ const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, i << c_octave, j << c_octave);\r
+ const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, i << c_octave, j << c_octave);\r
\r
- dim3 threads(16, 16);\r
+ det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;\r
+ trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;\r
+ }\r
+ }\r
\r
- dim3 grid;\r
- grid.x = divUp(max_samples_j, threads.x);\r
- grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);\r
+ void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers)\r
+ {\r
+ const int min_size = calcSize(octave, 0);\r
+ const int max_samples_i = 1 + ((img_rows - min_size) >> octave);\r
+ const int max_samples_j = 1 + ((img_cols - min_size) >> octave);\r
\r
- icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);\r
- cudaSafeCall( cudaGetLastError() );\r
+ dim3 threads(16, 16);\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ dim3 grid;\r
+ grid.x = divUp(max_samples_j, threads.x);\r
+ grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);\r
\r
-////////////////////////////////////////////////////////////////////////\r
-// NONMAX\r
+ icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-__constant__ float c_DM[5] = {0, 0, 9, 9, 1};\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-struct WithMask\r
-{\r
- static __device__ bool check(int sum_i, int sum_j, int size)\r
- {\r
- float ratio = (float)size / 9.0f;\r
- \r
- float d = 0;\r
+ ////////////////////////////////////////////////////////////////////////\r
+ // NONMAX\r
\r
- int dx1 = __float2int_rn(ratio * c_DM[0]);\r
- int dy1 = __float2int_rn(ratio * c_DM[1]);\r
- int dx2 = __float2int_rn(ratio * c_DM[2]);\r
- int dy2 = __float2int_rn(ratio * c_DM[3]);\r
+ __constant__ float c_DM[5] = {0, 0, 9, 9, 1};\r
\r
- float t = 0;\r
- t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);\r
- t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);\r
- t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);\r
- t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);\r
+ struct WithMask\r
+ {\r
+ static __device__ bool check(int sum_i, int sum_j, int size)\r
+ {\r
+ float ratio = (float)size / 9.0f;\r
+ \r
+ float d = 0;\r
\r
- d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));\r
+ int dx1 = __float2int_rn(ratio * c_DM[0]);\r
+ int dy1 = __float2int_rn(ratio * c_DM[1]);\r
+ int dx2 = __float2int_rn(ratio * c_DM[2]);\r
+ int dy2 = __float2int_rn(ratio * c_DM[3]);\r
\r
- return (d >= 0.5f);\r
- }\r
-};\r
+ float t = 0;\r
+ t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);\r
+ t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);\r
+ t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);\r
+ t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);\r
\r
-template <typename Mask>\r
-__global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)\r
-{\r
- #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
+ d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));\r
\r
- extern __shared__ float N9[];\r
+ return (d >= 0.5f);\r
+ }\r
+ };\r
\r
- // The hidx variables are the indices to the hessian buffer.\r
- const int gridDim_y = gridDim.y / c_nOctaveLayers;\r
- const int blockIdx_y = blockIdx.y % gridDim_y;\r
- const int blockIdx_z = blockIdx.y / gridDim_y;\r
+ template <typename Mask>\r
+ __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)\r
+ {\r
+ #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
\r
- const int layer = blockIdx_z + 1;\r
+ extern __shared__ float N9[];\r
\r
- const int size = calcSize(c_octave, layer);\r
+ // The hidx variables are the indices to the hessian buffer.\r
+ const int gridDim_y = gridDim.y / c_nOctaveLayers;\r
+ const int blockIdx_y = blockIdx.y % gridDim_y;\r
+ const int blockIdx_z = blockIdx.y / gridDim_y;\r
\r
- // Ignore pixels without a 3x3x3 neighbourhood in the layer above\r
- const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;\r
+ const int layer = blockIdx_z + 1;\r
\r
- const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1;\r
- const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1;\r
+ const int size = calcSize(c_octave, layer);\r
\r
- // Is this thread within the hessian buffer?\r
- const int zoff = blockDim.x * blockDim.y;\r
- const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;\r
- N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];\r
- N9[localLin ] = det.ptr(c_layer_rows * (layer ) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];\r
- N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];\r
- __syncthreads();\r
+ // Ignore pixels without a 3x3x3 neighbourhood in the layer above\r
+ const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;\r
\r
- if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1)\r
- {\r
- float val0 = N9[localLin];\r
+ const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1;\r
+ const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1;\r
\r
- if (val0 > c_hessianThreshold)\r
- {\r
- // Coordinates for the start of the wavelet in the sum image. There\r
- // is some integer division involved, so don't try to simplify this\r
- // (cancel out sampleStep) without checking the result is the same\r
- const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;\r
- const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;\r
+ // Is this thread within the hessian buffer?\r
+ const int zoff = blockDim.x * blockDim.y;\r
+ const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;\r
+ N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];\r
+ N9[localLin ] = det.ptr(c_layer_rows * (layer ) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];\r
+ N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];\r
+ __syncthreads();\r
\r
- if (Mask::check(sum_i, sum_j, size))\r
+ if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1)\r
{\r
- // Check to see if we have a max (in its 26 neighbours)\r
- const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]\r
- && val0 > N9[localLin - blockDim.x - zoff]\r
- && val0 > N9[localLin + 1 - blockDim.x - zoff]\r
- && val0 > N9[localLin - 1 - zoff]\r
- && val0 > N9[localLin - zoff]\r
- && val0 > N9[localLin + 1 - zoff]\r
- && val0 > N9[localLin - 1 + blockDim.x - zoff]\r
- && val0 > N9[localLin + blockDim.x - zoff]\r
- && val0 > N9[localLin + 1 + blockDim.x - zoff]\r
-\r
- && val0 > N9[localLin - 1 - blockDim.x]\r
- && val0 > N9[localLin - blockDim.x]\r
- && val0 > N9[localLin + 1 - blockDim.x]\r
- && val0 > N9[localLin - 1 ]\r
- && val0 > N9[localLin + 1 ]\r
- && val0 > N9[localLin - 1 + blockDim.x]\r
- && val0 > N9[localLin + blockDim.x]\r
- && val0 > N9[localLin + 1 + blockDim.x]\r
-\r
- && val0 > N9[localLin - 1 - blockDim.x + zoff]\r
- && val0 > N9[localLin - blockDim.x + zoff]\r
- && val0 > N9[localLin + 1 - blockDim.x + zoff]\r
- && val0 > N9[localLin - 1 + zoff]\r
- && val0 > N9[localLin + zoff]\r
- && val0 > N9[localLin + 1 + zoff]\r
- && val0 > N9[localLin - 1 + blockDim.x + zoff]\r
- && val0 > N9[localLin + blockDim.x + zoff]\r
- && val0 > N9[localLin + 1 + blockDim.x + zoff]\r
- ;\r
-\r
- if(condmax)\r
+ float val0 = N9[localLin];\r
+\r
+ if (val0 > c_hessianThreshold)\r
{\r
- unsigned int ind = atomicInc(maxCounter,(unsigned int) -1);\r
+ // Coordinates for the start of the wavelet in the sum image. There\r
+ // is some integer division involved, so don't try to simplify this\r
+ // (cancel out sampleStep) without checking the result is the same\r
+ const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;\r
+ const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;\r
\r
- if (ind < c_max_candidates)\r
+ if (Mask::check(sum_i, sum_j, size))\r
{\r
- const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]);\r
-\r
- maxPosBuffer[ind] = make_int4(j, i, layer, laplacian);\r
+ // Check to see if we have a max (in its 26 neighbours)\r
+ const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]\r
+ && val0 > N9[localLin - blockDim.x - zoff]\r
+ && val0 > N9[localLin + 1 - blockDim.x - zoff]\r
+ && val0 > N9[localLin - 1 - zoff]\r
+ && val0 > N9[localLin - zoff]\r
+ && val0 > N9[localLin + 1 - zoff]\r
+ && val0 > N9[localLin - 1 + blockDim.x - zoff]\r
+ && val0 > N9[localLin + blockDim.x - zoff]\r
+ && val0 > N9[localLin + 1 + blockDim.x - zoff]\r
+\r
+ && val0 > N9[localLin - 1 - blockDim.x]\r
+ && val0 > N9[localLin - blockDim.x]\r
+ && val0 > N9[localLin + 1 - blockDim.x]\r
+ && val0 > N9[localLin - 1 ]\r
+ && val0 > N9[localLin + 1 ]\r
+ && val0 > N9[localLin - 1 + blockDim.x]\r
+ && val0 > N9[localLin + blockDim.x]\r
+ && val0 > N9[localLin + 1 + blockDim.x]\r
+\r
+ && val0 > N9[localLin - 1 - blockDim.x + zoff]\r
+ && val0 > N9[localLin - blockDim.x + zoff]\r
+ && val0 > N9[localLin + 1 - blockDim.x + zoff]\r
+ && val0 > N9[localLin - 1 + zoff]\r
+ && val0 > N9[localLin + zoff]\r
+ && val0 > N9[localLin + 1 + zoff]\r
+ && val0 > N9[localLin - 1 + blockDim.x + zoff]\r
+ && val0 > N9[localLin + blockDim.x + zoff]\r
+ && val0 > N9[localLin + 1 + blockDim.x + zoff]\r
+ ;\r
+\r
+ if(condmax)\r
+ {\r
+ unsigned int ind = atomicInc(maxCounter,(unsigned int) -1);\r
+\r
+ if (ind < c_max_candidates)\r
+ {\r
+ const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]);\r
+\r
+ maxPosBuffer[ind] = make_int4(j, i, layer, laplacian);\r
+ }\r
+ }\r
}\r
}\r
}\r
- }\r
- }\r
\r
- #endif\r
-}\r
+ #endif\r
+ }\r
\r
-void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,\r
- int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)\r
-{\r
- const int layer_rows = img_rows >> octave;\r
- const int layer_cols = img_cols >> octave;\r
+ void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,\r
+ int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)\r
+ {\r
+ const int layer_rows = img_rows >> octave;\r
+ const int layer_cols = img_cols >> octave;\r
\r
- const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;\r
+ const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;\r
\r
- dim3 threads(16, 16);\r
+ dim3 threads(16, 16);\r
\r
- dim3 grid;\r
- grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2);\r
- grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers;\r
+ dim3 grid;\r
+ grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2);\r
+ grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers;\r
\r
- const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);\r
+ const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);\r
\r
- if (use_mask)\r
- icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);\r
- else\r
- icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);\r
+ if (use_mask)\r
+ icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);\r
+ else\r
+ icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);\r
\r
- cudaSafeCall( cudaGetLastError() );\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
-////////////////////////////////////////////////////////////////////////\r
-// INTERPOLATION\r
+ ////////////////////////////////////////////////////////////////////////\r
+ // INTERPOLATION\r
\r
-__global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer,\r
- float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,\r
- unsigned int* featureCounter)\r
-{\r
- #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
+ __global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer,\r
+ float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,\r
+ unsigned int* featureCounter)\r
+ {\r
+ #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
\r
- const int4 maxPos = maxPosBuffer[blockIdx.x];\r
+ const int4 maxPos = maxPosBuffer[blockIdx.x];\r
\r
- const int j = maxPos.x - 1 + threadIdx.x;\r
- const int i = maxPos.y - 1 + threadIdx.y;\r
- const int layer = maxPos.z - 1 + threadIdx.z;\r
+ const int j = maxPos.x - 1 + threadIdx.x;\r
+ const int i = maxPos.y - 1 + threadIdx.y;\r
+ const int layer = maxPos.z - 1 + threadIdx.z;\r
\r
- __shared__ float N9[3][3][3];\r
+ __shared__ float N9[3][3][3];\r
\r
- N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j];\r
- __syncthreads();\r
+ N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j];\r
+ __syncthreads();\r
\r
- if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)\r
- {\r
- __shared__ float dD[3];\r
-\r
- //dx\r
- dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);\r
- //dy\r
- dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);\r
- //ds\r
- dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);\r
-\r
- __shared__ float H[3][3];\r
-\r
- //dxx\r
- H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];\r
- //dxy\r
- H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);\r
- //dxs\r
- H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);\r
- //dyx = dxy\r
- H[1][0] = H[0][1];\r
- //dyy\r
- H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];\r
- //dys\r
- H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);\r
- //dsx = dxs\r
- H[2][0] = H[0][2];\r
- //dsy = dys\r
- H[2][1] = H[1][2];\r
- //dss\r
- H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];\r
-\r
- __shared__ float x[3];\r
-\r
- if (solve3x3(H, dD, x))\r
- {\r
- if (::fabs(x[0]) <= 1.f && ::fabs(x[1]) <= 1.f && ::fabs(x[2]) <= 1.f)\r
+ if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)\r
{\r
- // if the step is within the interpolation region, perform it\r
- \r
- const int size = calcSize(c_octave, maxPos.z);\r
-\r
- const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;\r
- const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;\r
- \r
- const float center_i = sum_i + (float)(size - 1) / 2;\r
- const float center_j = sum_j + (float)(size - 1) / 2;\r
-\r
- const float px = center_j + x[0] * (1 << c_octave);\r
- const float py = center_i + x[1] * (1 << c_octave);\r
+ __shared__ float dD[3];\r
+\r
+ //dx\r
+ dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);\r
+ //dy\r
+ dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);\r
+ //ds\r
+ dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);\r
+\r
+ __shared__ float H[3][3];\r
+\r
+ //dxx\r
+ H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];\r
+ //dxy\r
+ H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);\r
+ //dxs\r
+ H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);\r
+ //dyx = dxy\r
+ H[1][0] = H[0][1];\r
+ //dyy\r
+ H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];\r
+ //dys\r
+ H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);\r
+ //dsx = dxs\r
+ H[2][0] = H[0][2];\r
+ //dsy = dys\r
+ H[2][1] = H[1][2];\r
+ //dss\r
+ H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];\r
+\r
+ __shared__ float x[3];\r
+\r
+ if (solve3x3(H, dD, x))\r
+ {\r
+ if (::fabs(x[0]) <= 1.f && ::fabs(x[1]) <= 1.f && ::fabs(x[2]) <= 1.f)\r
+ {\r
+ // if the step is within the interpolation region, perform it\r
+ \r
+ const int size = calcSize(c_octave, maxPos.z);\r
+\r
+ const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;\r
+ const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;\r
+ \r
+ const float center_i = sum_i + (float)(size - 1) / 2;\r
+ const float center_j = sum_j + (float)(size - 1) / 2;\r
+\r
+ const float px = center_j + x[0] * (1 << c_octave);\r
+ const float py = center_i + x[1] * (1 << c_octave);\r
+\r
+ const int ds = size - calcSize(c_octave, maxPos.z - 1);\r
+ const float psize = roundf(size + x[2] * ds);\r
+\r
+ /* The sampling intervals and wavelet sized for selecting an orientation\r
+ and building the keypoint descriptor are defined relative to 's' */\r
+ const float s = psize * 1.2f / 9.0f;\r
+\r
+ /* To find the dominant orientation, the gradients in x and y are\r
+ sampled in a circle of radius 6s using wavelets of size 4s.\r
+ We ensure the gradient wavelet size is even to ensure the\r
+ wavelet pattern is balanced and symmetric around its center */\r
+ const int grad_wav_size = 2 * __float2int_rn(2.0f * s);\r
+\r
+ // check when grad_wav_size is too big\r
+ if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)\r
+ {\r
+ // Get a new feature index.\r
+ unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);\r
+\r
+ if (ind < c_max_features)\r
+ {\r
+ featureX[ind] = px;\r
+ featureY[ind] = py;\r
+ featureLaplacian[ind] = maxPos.w;\r
+ featureSize[ind] = psize;\r
+ featureHessian[ind] = N9[1][1][1];\r
+ }\r
+ } // grad_wav_size check\r
+ } // If the subpixel interpolation worked\r
+ }\r
+ } // If this is thread 0.\r
\r
- const int ds = size - calcSize(c_octave, maxPos.z - 1);\r
- const float psize = roundf(size + x[2] * ds);\r
+ #endif\r
+ }\r
\r
- /* The sampling intervals and wavelet sized for selecting an orientation\r
- and building the keypoint descriptor are defined relative to 's' */\r
- const float s = psize * 1.2f / 9.0f;\r
+ void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, \r
+ float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian, \r
+ unsigned int* featureCounter)\r
+ {\r
+ dim3 threads;\r
+ threads.x = 3;\r
+ threads.y = 3;\r
+ threads.z = 3;\r
\r
- /* To find the dominant orientation, the gradients in x and y are\r
- sampled in a circle of radius 6s using wavelets of size 4s.\r
- We ensure the gradient wavelet size is even to ensure the\r
- wavelet pattern is balanced and symmetric around its center */\r
- const int grad_wav_size = 2 * __float2int_rn(2.0f * s);\r
+ dim3 grid;\r
+ grid.x = maxCounter;\r
\r
- // check when grad_wav_size is too big\r
- if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)\r
- {\r
- // Get a new feature index.\r
- unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);\r
+ icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureSize, featureHessian, featureCounter);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- if (ind < c_max_features)\r
- {\r
- featureX[ind] = px;\r
- featureY[ind] = py;\r
- featureLaplacian[ind] = maxPos.w;\r
- featureSize[ind] = psize;\r
- featureHessian[ind] = N9[1][1][1];\r
- }\r
- } // grad_wav_size check\r
- } // If the subpixel interpolation worked\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
}\r
- } // If this is thread 0.\r
\r
- #endif\r
-}\r
+ ////////////////////////////////////////////////////////////////////////\r
+ // Orientation\r
\r
-void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, \r
- float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian, \r
- unsigned int* featureCounter)\r
-{\r
- dim3 threads;\r
- threads.x = 3;\r
- threads.y = 3;\r
- threads.z = 3;\r
+ #define ORI_SEARCH_INC 5\r
+ #define ORI_WIN 60\r
+ #define ORI_SAMPLES 113\r
\r
- dim3 grid;\r
- grid.x = maxCounter;\r
+ __constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};\r
+ __constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};\r
+ __constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};\r
\r
- icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureSize, featureHessian, featureCounter);\r
- cudaSafeCall( cudaGetLastError() );\r
+ __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};\r
+ __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ __global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)\r
+ { \r
+ #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
\r
-////////////////////////////////////////////////////////////////////////\r
-// Orientation\r
+ __shared__ float s_X[128];\r
+ __shared__ float s_Y[128];\r
+ __shared__ float s_angle[128];\r
\r
-#define ORI_SEARCH_INC 5\r
-#define ORI_WIN 60\r
-#define ORI_SAMPLES 113\r
+ __shared__ float s_sum[32 * 4];\r
\r
-__constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};\r
-__constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};\r
-__constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};\r
+ /* The sampling intervals and wavelet sized for selecting an orientation\r
+ and building the keypoint descriptor are defined relative to 's' */\r
+ const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;\r
\r
-__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};\r
-__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};\r
+ /* To find the dominant orientation, the gradients in x and y are\r
+ sampled in a circle of radius 6s using wavelets of size 4s.\r
+ We ensure the gradient wavelet size is even to ensure the\r
+ wavelet pattern is balanced and symmetric around its center */\r
+ const int grad_wav_size = 2 * __float2int_rn(2.0f * s);\r
\r
-__global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)\r
-{ \r
- #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
+ // check when grad_wav_size is too big\r
+ if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)\r
+ {\r
+ // Calc X, Y, angle and store it to shared memory\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
- __shared__ float s_X[128];\r
- __shared__ float s_Y[128];\r
- __shared__ float s_angle[128];\r
+ float X = 0.0f, Y = 0.0f, angle = 0.0f;\r
\r
- __shared__ float s_sum[32 * 4];\r
+ if (tid < ORI_SAMPLES)\r
+ {\r
+ const float margin = (float)(grad_wav_size - 1) / 2.0f;\r
+ const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);\r
+ const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);\r
\r
- /* The sampling intervals and wavelet sized for selecting an orientation\r
- and building the keypoint descriptor are defined relative to 's' */\r
- const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;\r
+ if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))\r
+ {\r
+ X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);\r
+ Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);\r
+ \r
+ angle = atan2f(Y, X);\r
+ if (angle < 0)\r
+ angle += 2.0f * CV_PI;\r
+ angle *= 180.0f / CV_PI;\r
+ }\r
+ }\r
+ s_X[tid] = X;\r
+ s_Y[tid] = Y;\r
+ s_angle[tid] = angle;\r
+ __syncthreads();\r
\r
- /* To find the dominant orientation, the gradients in x and y are\r
- sampled in a circle of radius 6s using wavelets of size 4s.\r
- We ensure the gradient wavelet size is even to ensure the\r
- wavelet pattern is balanced and symmetric around its center */\r
- const int grad_wav_size = 2 * __float2int_rn(2.0f * s);\r
+ float bestx = 0, besty = 0, best_mod = 0;\r
\r
- // check when grad_wav_size is too big\r
- if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)\r
- {\r
- // Calc X, Y, angle and store it to shared memory\r
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ #pragma unroll\r
+ for (int i = 0; i < 18; ++i)\r
+ {\r
+ const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;\r
\r
- float X = 0.0f, Y = 0.0f, angle = 0.0f;\r
+ float sumx = 0.0f, sumy = 0.0f;\r
+ int d = ::abs(__float2int_rn(s_angle[threadIdx.x]) - dir);\r
+ if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
+ {\r
+ sumx = s_X[threadIdx.x];\r
+ sumy = s_Y[threadIdx.x];\r
+ }\r
+ d = ::abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);\r
+ if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
+ {\r
+ sumx += s_X[threadIdx.x + 32];\r
+ sumy += s_Y[threadIdx.x + 32];\r
+ }\r
+ d = ::abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);\r
+ if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
+ {\r
+ sumx += s_X[threadIdx.x + 64];\r
+ sumy += s_Y[threadIdx.x + 64];\r
+ }\r
+ d = ::abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);\r
+ if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
+ {\r
+ sumx += s_X[threadIdx.x + 96];\r
+ sumy += s_Y[threadIdx.x + 96];\r
+ }\r
\r
- if (tid < ORI_SAMPLES)\r
- {\r
- const float margin = (float)(grad_wav_size - 1) / 2.0f;\r
- const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);\r
- const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);\r
+ float* s_sum_row = s_sum + threadIdx.y * 32;\r
\r
- if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))\r
- {\r
- X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);\r
- Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);\r
- \r
- angle = atan2f(Y, X);\r
- if (angle < 0)\r
- angle += 2.0f * CV_PI;\r
- angle *= 180.0f / CV_PI;\r
- }\r
- }\r
- s_X[tid] = X;\r
- s_Y[tid] = Y;\r
- s_angle[tid] = angle;\r
- __syncthreads();\r
+ device::reduce<32>(s_sum_row, sumx, threadIdx.x, plus<volatile float>());\r
+ device::reduce<32>(s_sum_row, sumy, threadIdx.x, plus<volatile float>());\r
\r
- float bestx = 0, besty = 0, best_mod = 0;\r
+ const float temp_mod = sumx * sumx + sumy * sumy;\r
+ if (temp_mod > best_mod)\r
+ {\r
+ best_mod = temp_mod;\r
+ bestx = sumx;\r
+ besty = sumy;\r
+ }\r
+ }\r
\r
- #pragma unroll\r
- for (int i = 0; i < 18; ++i)\r
- {\r
- const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;\r
+ if (threadIdx.x == 0)\r
+ {\r
+ s_X[threadIdx.y] = bestx;\r
+ s_Y[threadIdx.y] = besty;\r
+ s_angle[threadIdx.y] = best_mod;\r
+ }\r
+ __syncthreads();\r
\r
- float sumx = 0.0f, sumy = 0.0f;\r
- int d = ::abs(__float2int_rn(s_angle[threadIdx.x]) - dir);\r
- if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
- {\r
- sumx = s_X[threadIdx.x];\r
- sumy = s_Y[threadIdx.x];\r
- }\r
- d = ::abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);\r
- if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
- {\r
- sumx += s_X[threadIdx.x + 32];\r
- sumy += s_Y[threadIdx.x + 32];\r
- }\r
- d = ::abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);\r
- if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
- {\r
- sumx += s_X[threadIdx.x + 64];\r
- sumy += s_Y[threadIdx.x + 64];\r
- }\r
- d = ::abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);\r
- if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
- {\r
- sumx += s_X[threadIdx.x + 96];\r
- sumy += s_Y[threadIdx.x + 96];\r
- }\r
+ if (threadIdx.x < 2 && threadIdx.y == 0)\r
+ {\r
+ volatile float* v_x = s_X;\r
+ volatile float* v_y = s_Y;\r
+ volatile float* v_mod = s_angle;\r
+\r
+ bestx = v_x[threadIdx.x];\r
+ besty = v_y[threadIdx.x];\r
+ best_mod = v_mod[threadIdx.x];\r
\r
- float* s_sum_row = s_sum + threadIdx.y * 32;\r
+ float temp_mod = v_mod[threadIdx.x + 2];\r
+ if (temp_mod > best_mod)\r
+ {\r
+ v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 2];\r
+ v_y[threadIdx.x] = besty = v_y[threadIdx.x + 2];\r
+ v_mod[threadIdx.x] = best_mod = temp_mod;\r
+ }\r
+ temp_mod = v_mod[threadIdx.x + 1];\r
+ if (temp_mod > best_mod)\r
+ {\r
+ v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 1];\r
+ v_y[threadIdx.x] = besty = v_y[threadIdx.x + 1];\r
+ }\r
+ }\r
\r
- device::reduce<32>(s_sum_row, sumx, threadIdx.x, plus<volatile float>());\r
- device::reduce<32>(s_sum_row, sumy, threadIdx.x, plus<volatile float>());\r
+ if (threadIdx.x == 0 && threadIdx.y == 0 && best_mod != 0)\r
+ {\r
+ float kp_dir = atan2f(besty, bestx);\r
+ if (kp_dir < 0)\r
+ kp_dir += 2.0f * CV_PI;\r
+ kp_dir *= 180.0f / CV_PI;\r
\r
- const float temp_mod = sumx * sumx + sumy * sumy;\r
- if (temp_mod > best_mod)\r
- {\r
- best_mod = temp_mod;\r
- bestx = sumx;\r
- besty = sumy;\r
+ featureDir[blockIdx.x] = kp_dir;\r
+ }\r
}\r
- }\r
\r
- if (threadIdx.x == 0)\r
- {\r
- s_X[threadIdx.y] = bestx;\r
- s_Y[threadIdx.y] = besty;\r
- s_angle[threadIdx.y] = best_mod;\r
+ #endif\r
}\r
- __syncthreads();\r
-\r
- if (threadIdx.x < 2 && threadIdx.y == 0)\r
- {\r
- volatile float* v_x = s_X;\r
- volatile float* v_y = s_Y;\r
- volatile float* v_mod = s_angle;\r
\r
- bestx = v_x[threadIdx.x];\r
- besty = v_y[threadIdx.x];\r
- best_mod = v_mod[threadIdx.x];\r
+ #undef ORI_SEARCH_INC\r
+ #undef ORI_WIN\r
+ #undef ORI_SAMPLES\r
\r
- float temp_mod = v_mod[threadIdx.x + 2];\r
- if (temp_mod > best_mod)\r
- {\r
- v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 2];\r
- v_y[threadIdx.x] = besty = v_y[threadIdx.x + 2];\r
- v_mod[threadIdx.x] = best_mod = temp_mod;\r
- }\r
- temp_mod = v_mod[threadIdx.x + 1];\r
- if (temp_mod > best_mod)\r
- {\r
- v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 1];\r
- v_y[threadIdx.x] = besty = v_y[threadIdx.x + 1];\r
- }\r
- }\r
-\r
- if (threadIdx.x == 0 && threadIdx.y == 0 && best_mod != 0)\r
+ void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures) \r
{\r
- float kp_dir = atan2f(besty, bestx);\r
- if (kp_dir < 0)\r
- kp_dir += 2.0f * CV_PI;\r
- kp_dir *= 180.0f / CV_PI;\r
+ dim3 threads;\r
+ threads.x = 32;\r
+ threads.y = 4;\r
\r
- featureDir[blockIdx.x] = kp_dir;\r
- }\r
- }\r
+ dim3 grid;\r
+ grid.x = nFeatures;\r
\r
- #endif\r
-}\r
-\r
-#undef ORI_SEARCH_INC\r
-#undef ORI_WIN\r
-#undef ORI_SAMPLES\r
-\r
-void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures) \r
-{\r
- dim3 threads;\r
- threads.x = 32;\r
- threads.y = 4;\r
+ icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- dim3 grid;\r
- grid.x = nFeatures;\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);\r
- cudaSafeCall( cudaGetLastError() );\r
+ ////////////////////////////////////////////////////////////////////////\r
+ // Descriptors\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
+ #define PATCH_SZ 20\r
\r
-////////////////////////////////////////////////////////////////////////\r
-// Descriptors\r
+ __constant__ float c_DW[PATCH_SZ * PATCH_SZ] = \r
+ {\r
+ 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f, \r
+ 8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, \r
+ 1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, \r
+ 3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, \r
+ 5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, \r
+ 9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, \r
+ 0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, \r
+ 0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, \r
+ 0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, \r
+ 0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, \r
+ 0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, \r
+ 0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, \r
+ 0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, \r
+ 0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, \r
+ 9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, \r
+ 5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, \r
+ 3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, \r
+ 1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, \r
+ 8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, \r
+ 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f\r
+ };\r
+\r
+ struct WinReader\r
+ {\r
+ typedef uchar elem_type;\r
\r
-#define PATCH_SZ 20\r
+ __device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) : \r
+ centerX(centerX_), centerY(centerY_), win_offset(win_offset_), cos_dir(cos_dir_), sin_dir(sin_dir_)\r
+ {\r
+ }\r
\r
-__constant__ float c_DW[PATCH_SZ * PATCH_SZ] = \r
-{\r
- 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f, \r
- 8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, \r
- 1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, \r
- 3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, \r
- 5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, \r
- 9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, \r
- 0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, \r
- 0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, \r
- 0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, \r
- 0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, \r
- 0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, \r
- 0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, \r
- 0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, \r
- 0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, \r
- 9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, \r
- 5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, \r
- 3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, \r
- 1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, \r
- 8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, \r
- 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f\r
-};\r
-\r
-struct WinReader\r
-{\r
- typedef uchar elem_type;\r
+ __device__ __forceinline__ uchar operator ()(int i, int j) const\r
+ {\r
+ float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;\r
+ float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;\r
\r
- __device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) : \r
- centerX(centerX_), centerY(centerY_), win_offset(win_offset_), cos_dir(cos_dir_), sin_dir(sin_dir_)\r
- {\r
- }\r
+ return tex2D(imgTex, pixel_x, pixel_y);\r
+ }\r
\r
- __device__ __forceinline__ uchar operator ()(int i, int j) const\r
- {\r
- float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;\r
- float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;\r
+ float centerX; \r
+ float centerY;\r
+ float win_offset; \r
+ float cos_dir; \r
+ float sin_dir;\r
+ };\r
\r
- return tex2D(imgTex, pixel_x, pixel_y);\r
- }\r
+ __device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], \r
+ const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)\r
+ {\r
+ __shared__ float s_PATCH[6][6];\r
\r
- float centerX; \r
- float centerY;\r
- float win_offset; \r
- float cos_dir; \r
- float sin_dir;\r
-};\r
+ const float centerX = featureX[blockIdx.x];\r
+ const float centerY = featureY[blockIdx.x];\r
+ const float size = featureSize[blockIdx.x];\r
+ const float descriptor_dir = featureDir[blockIdx.x] * (float)(CV_PI / 180);\r
\r
-__device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], \r
- const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)\r
-{\r
- __shared__ float s_PATCH[6][6];\r
+ /* The sampling intervals and wavelet sized for selecting an orientation\r
+ and building the keypoint descriptor are defined relative to 's' */\r
+ const float s = size * 1.2f / 9.0f;\r
\r
- const float centerX = featureX[blockIdx.x];\r
- const float centerY = featureY[blockIdx.x];\r
- const float size = featureSize[blockIdx.x];\r
- const float descriptor_dir = featureDir[blockIdx.x] * (float)(CV_PI / 180);\r
+ /* Extract a window of pixels around the keypoint of size 20s */\r
+ const int win_size = (int)((PATCH_SZ + 1) * s);\r
\r
- /* The sampling intervals and wavelet sized for selecting an orientation\r
- and building the keypoint descriptor are defined relative to 's' */\r
- const float s = size * 1.2f / 9.0f;\r
+ float sin_dir;\r
+ float cos_dir;\r
+ sincosf(descriptor_dir, &sin_dir, &cos_dir);\r
\r
- /* Extract a window of pixels around the keypoint of size 20s */\r
- const int win_size = (int)((PATCH_SZ + 1) * s);\r
+ /* Nearest neighbour version (faster) */\r
+ const float win_offset = -(float)(win_size - 1) / 2; \r
\r
- float sin_dir;\r
- float cos_dir;\r
- sincosf(descriptor_dir, &sin_dir, &cos_dir);\r
+ // Compute sampling points\r
+ // since grids are 2D, need to compute xBlock and yBlock indices\r
+ const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4\r
+ const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)\r
+ const int xIndex = xBlock * 5 + threadIdx.x;\r
+ const int yIndex = yBlock * 5 + threadIdx.y;\r
\r
- /* Nearest neighbour version (faster) */\r
- const float win_offset = -(float)(win_size - 1) / 2; \r
+ const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;\r
+ const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;\r
\r
- // Compute sampling points\r
- // since grids are 2D, need to compute xBlock and yBlock indices\r
- const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4\r
- const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)\r
- const int xIndex = xBlock * 5 + threadIdx.x;\r
- const int yIndex = yBlock * 5 + threadIdx.y;\r
+ LinearFilter<WinReader> filter(WinReader(centerX, centerY, win_offset, cos_dir, sin_dir));\r
\r
- const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;\r
- const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;\r
+ s_PATCH[threadIdx.y][threadIdx.x] = filter(icoo, jcoo);\r
\r
- LinearFilter<WinReader> filter(WinReader(centerX, centerY, win_offset, cos_dir, sin_dir));\r
+ __syncthreads();\r
\r
- s_PATCH[threadIdx.y][threadIdx.x] = filter(icoo, jcoo);\r
+ if (threadIdx.x < 5 && threadIdx.y < 5)\r
+ {\r
+ const int tid = threadIdx.y * 5 + threadIdx.x;\r
\r
- __syncthreads();\r
+ const float dw = c_DW[yIndex * PATCH_SZ + xIndex];\r
\r
- if (threadIdx.x < 5 && threadIdx.y < 5)\r
- {\r
- const int tid = threadIdx.y * 5 + threadIdx.x;\r
+ const float vx = (s_PATCH[threadIdx.y ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x ]) * dw;\r
+ const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y ][threadIdx.x + 1]) * dw;\r
\r
- const float dw = c_DW[yIndex * PATCH_SZ + xIndex];\r
+ s_dx_bin[tid] = vx;\r
+ s_dy_bin[tid] = vy;\r
+ }\r
+ }\r
\r
- const float vx = (s_PATCH[threadIdx.y ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x ]) * dw;\r
- const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y ][threadIdx.x + 1]) * dw;\r
+ __device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid)\r
+ {\r
+ // first step is to reduce from 25 to 16\r
+ if (tid < 9) // use 9 threads\r
+ {\r
+ sdata1[tid] += sdata1[tid + 16];\r
+ sdata2[tid] += sdata2[tid + 16];\r
+ sdata3[tid] += sdata3[tid + 16];\r
+ sdata4[tid] += sdata4[tid + 16];\r
+ }\r
\r
- s_dx_bin[tid] = vx;\r
- s_dy_bin[tid] = vy;\r
- }\r
-}\r
+ // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)\r
+ if (tid < 8)\r
+ {\r
+ sdata1[tid] += sdata1[tid + 8];\r
+ sdata1[tid] += sdata1[tid + 4];\r
+ sdata1[tid] += sdata1[tid + 2];\r
+ sdata1[tid] += sdata1[tid + 1];\r
+\r
+ sdata2[tid] += sdata2[tid + 8];\r
+ sdata2[tid] += sdata2[tid + 4];\r
+ sdata2[tid] += sdata2[tid + 2];\r
+ sdata2[tid] += sdata2[tid + 1];\r
+\r
+ sdata3[tid] += sdata3[tid + 8];\r
+ sdata3[tid] += sdata3[tid + 4];\r
+ sdata3[tid] += sdata3[tid + 2];\r
+ sdata3[tid] += sdata3[tid + 1];\r
+\r
+ sdata4[tid] += sdata4[tid + 8];\r
+ sdata4[tid] += sdata4[tid + 4];\r
+ sdata4[tid] += sdata4[tid + 2];\r
+ sdata4[tid] += sdata4[tid + 1];\r
+ }\r
+ }\r
\r
-__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid)\r
-{\r
- // first step is to reduce from 25 to 16\r
- if (tid < 9) // use 9 threads\r
- {\r
- sdata1[tid] += sdata1[tid + 16];\r
- sdata2[tid] += sdata2[tid + 16];\r
- sdata3[tid] += sdata3[tid + 16];\r
- sdata4[tid] += sdata4[tid + 16];\r
- }\r
-\r
- // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)\r
- if (tid < 8)\r
- {\r
- sdata1[tid] += sdata1[tid + 8];\r
- sdata1[tid] += sdata1[tid + 4];\r
- sdata1[tid] += sdata1[tid + 2];\r
- sdata1[tid] += sdata1[tid + 1];\r
-\r
- sdata2[tid] += sdata2[tid + 8];\r
- sdata2[tid] += sdata2[tid + 4];\r
- sdata2[tid] += sdata2[tid + 2];\r
- sdata2[tid] += sdata2[tid + 1];\r
-\r
- sdata3[tid] += sdata3[tid + 8];\r
- sdata3[tid] += sdata3[tid + 4];\r
- sdata3[tid] += sdata3[tid + 2];\r
- sdata3[tid] += sdata3[tid + 1];\r
-\r
- sdata4[tid] += sdata4[tid + 8];\r
- sdata4[tid] += sdata4[tid + 4];\r
- sdata4[tid] += sdata4[tid + 2];\r
- sdata4[tid] += sdata4[tid + 1];\r
- }\r
-}\r
-\r
-__global__ void compute_descriptors64(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)\r
-{\r
- // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
- __shared__ float sdx[25];\r
- __shared__ float sdy[25];\r
- __shared__ float sdxabs[25];\r
- __shared__ float sdyabs[25];\r
+ __global__ void compute_descriptors64(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)\r
+ {\r
+ // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
+ __shared__ float sdx[25];\r
+ __shared__ float sdy[25];\r
+ __shared__ float sdxabs[25];\r
+ __shared__ float sdyabs[25];\r
\r
- calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);\r
- __syncthreads();\r
+ calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);\r
+ __syncthreads();\r
\r
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
- if (tid < 25)\r
- {\r
- sdxabs[tid] = ::fabs(sdx[tid]); // |dx| array\r
- sdyabs[tid] = ::fabs(sdy[tid]); // |dy| array\r
- __syncthreads();\r
+ if (tid < 25)\r
+ {\r
+ sdxabs[tid] = ::fabs(sdx[tid]); // |dx| array\r
+ sdyabs[tid] = ::fabs(sdy[tid]); // |dy| array\r
+ __syncthreads();\r
\r
- reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);\r
- __syncthreads();\r
+ reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);\r
+ __syncthreads();\r
\r
- float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);\r
+ float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);\r
\r
- // write dx, dy, |dx|, |dy|\r
- if (tid == 0)\r
- {\r
- descriptors_block[0] = sdx[0];\r
- descriptors_block[1] = sdy[0];\r
- descriptors_block[2] = sdxabs[0];\r
- descriptors_block[3] = sdyabs[0];\r
+ // write dx, dy, |dx|, |dy|\r
+ if (tid == 0)\r
+ {\r
+ descriptors_block[0] = sdx[0];\r
+ descriptors_block[1] = sdy[0];\r
+ descriptors_block[2] = sdxabs[0];\r
+ descriptors_block[3] = sdyabs[0];\r
+ }\r
+ }\r
}\r
- }\r
-}\r
\r
-__global__ void compute_descriptors128(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)\r
-{\r
- // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
- __shared__ float sdx[25];\r
- __shared__ float sdy[25];\r
+ __global__ void compute_descriptors128(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)\r
+ {\r
+ // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
+ __shared__ float sdx[25];\r
+ __shared__ float sdy[25];\r
\r
- // sum (reduce) 5x5 area response\r
- __shared__ float sd1[25];\r
- __shared__ float sd2[25];\r
- __shared__ float sdabs1[25];\r
- __shared__ float sdabs2[25];\r
+ // sum (reduce) 5x5 area response\r
+ __shared__ float sd1[25];\r
+ __shared__ float sd2[25];\r
+ __shared__ float sdabs1[25];\r
+ __shared__ float sdabs2[25];\r
\r
- calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);\r
- __syncthreads();\r
+ calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);\r
+ __syncthreads();\r
\r
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
- if (tid < 25)\r
- {\r
- if (sdy[tid] >= 0)\r
- {\r
- sd1[tid] = sdx[tid];\r
- sdabs1[tid] = ::fabs(sdx[tid]);\r
- sd2[tid] = 0;\r
- sdabs2[tid] = 0;\r
- }\r
- else\r
- {\r
- sd1[tid] = 0;\r
- sdabs1[tid] = 0;\r
- sd2[tid] = sdx[tid];\r
- sdabs2[tid] = ::fabs(sdx[tid]);\r
- }\r
- __syncthreads();\r
+ if (tid < 25)\r
+ {\r
+ if (sdy[tid] >= 0)\r
+ {\r
+ sd1[tid] = sdx[tid];\r
+ sdabs1[tid] = ::fabs(sdx[tid]);\r
+ sd2[tid] = 0;\r
+ sdabs2[tid] = 0;\r
+ }\r
+ else\r
+ {\r
+ sd1[tid] = 0;\r
+ sdabs1[tid] = 0;\r
+ sd2[tid] = sdx[tid];\r
+ sdabs2[tid] = ::fabs(sdx[tid]);\r
+ }\r
+ __syncthreads();\r
\r
- reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
- __syncthreads();\r
+ reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
+ __syncthreads();\r
\r
- float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);\r
+ float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);\r
\r
- // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)\r
- if (tid == 0)\r
- {\r
- descriptors_block[0] = sd1[0];\r
- descriptors_block[1] = sdabs1[0];\r
- descriptors_block[2] = sd2[0];\r
- descriptors_block[3] = sdabs2[0];\r
- }\r
- __syncthreads();\r
+ // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)\r
+ if (tid == 0)\r
+ {\r
+ descriptors_block[0] = sd1[0];\r
+ descriptors_block[1] = sdabs1[0];\r
+ descriptors_block[2] = sd2[0];\r
+ descriptors_block[3] = sdabs2[0];\r
+ }\r
+ __syncthreads();\r
\r
- if (sdx[tid] >= 0)\r
- {\r
- sd1[tid] = sdy[tid];\r
- sdabs1[tid] = ::fabs(sdy[tid]);\r
- sd2[tid] = 0;\r
- sdabs2[tid] = 0;\r
- }\r
- else\r
- {\r
- sd1[tid] = 0;\r
- sdabs1[tid] = 0;\r
- sd2[tid] = sdy[tid];\r
- sdabs2[tid] = ::fabs(sdy[tid]);\r
- }\r
- __syncthreads();\r
+ if (sdx[tid] >= 0)\r
+ {\r
+ sd1[tid] = sdy[tid];\r
+ sdabs1[tid] = ::fabs(sdy[tid]);\r
+ sd2[tid] = 0;\r
+ sdabs2[tid] = 0;\r
+ }\r
+ else\r
+ {\r
+ sd1[tid] = 0;\r
+ sdabs1[tid] = 0;\r
+ sd2[tid] = sdy[tid];\r
+ sdabs2[tid] = ::fabs(sdy[tid]);\r
+ }\r
+ __syncthreads();\r
\r
- reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
- __syncthreads();\r
+ reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
+ __syncthreads();\r
\r
- // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)\r
- if (tid == 0)\r
- {\r
- descriptors_block[4] = sd1[0];\r
- descriptors_block[5] = sdabs1[0];\r
- descriptors_block[6] = sd2[0];\r
- descriptors_block[7] = sdabs2[0];\r
+ // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)\r
+ if (tid == 0)\r
+ {\r
+ descriptors_block[4] = sd1[0];\r
+ descriptors_block[5] = sdabs1[0];\r
+ descriptors_block[6] = sd2[0];\r
+ descriptors_block[7] = sdabs2[0];\r
+ }\r
+ }\r
}\r
- }\r
-}\r
\r
-template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)\r
-{\r
- // no need for thread ID\r
- float* descriptor_base = descriptors.ptr(blockIdx.x);\r
-\r
- // read in the unnormalized descriptor values (squared)\r
- __shared__ float sqDesc[BLOCK_DIM_X];\r
- const float lookup = descriptor_base[threadIdx.x];\r
- sqDesc[threadIdx.x] = lookup * lookup;\r
- __syncthreads();\r
+ template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)\r
+ {\r
+ // no need for thread ID\r
+ float* descriptor_base = descriptors.ptr(blockIdx.x);\r
\r
- if (BLOCK_DIM_X >= 128)\r
- {\r
- if (threadIdx.x < 64)\r
- sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];\r
- __syncthreads();\r
- }\r
+ // read in the unnormalized descriptor values (squared)\r
+ __shared__ float sqDesc[BLOCK_DIM_X];\r
+ const float lookup = descriptor_base[threadIdx.x];\r
+ sqDesc[threadIdx.x] = lookup * lookup;\r
+ __syncthreads();\r
\r
- // reduction to get total\r
- if (threadIdx.x < 32)\r
- {\r
- volatile float* smem = sqDesc;\r
-\r
- smem[threadIdx.x] += smem[threadIdx.x + 32];\r
- smem[threadIdx.x] += smem[threadIdx.x + 16];\r
- smem[threadIdx.x] += smem[threadIdx.x + 8];\r
- smem[threadIdx.x] += smem[threadIdx.x + 4];\r
- smem[threadIdx.x] += smem[threadIdx.x + 2];\r
- smem[threadIdx.x] += smem[threadIdx.x + 1];\r
- }\r
-\r
- // compute length (square root)\r
- __shared__ float len;\r
- if (threadIdx.x == 0)\r
- {\r
- len = sqrtf(sqDesc[0]);\r
- }\r
- __syncthreads();\r
+ if (BLOCK_DIM_X >= 128)\r
+ {\r
+ if (threadIdx.x < 64)\r
+ sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];\r
+ __syncthreads();\r
+ }\r
\r
- // normalize and store in output\r
- descriptor_base[threadIdx.x] = lookup / len;\r
-}\r
+ // reduction to get total\r
+ if (threadIdx.x < 32)\r
+ {\r
+ volatile float* smem = sqDesc;\r
+\r
+ smem[threadIdx.x] += smem[threadIdx.x + 32];\r
+ smem[threadIdx.x] += smem[threadIdx.x + 16];\r
+ smem[threadIdx.x] += smem[threadIdx.x + 8];\r
+ smem[threadIdx.x] += smem[threadIdx.x + 4];\r
+ smem[threadIdx.x] += smem[threadIdx.x + 2];\r
+ smem[threadIdx.x] += smem[threadIdx.x + 1];\r
+ }\r
\r
-void compute_descriptors_gpu(const DevMem2Df& descriptors, \r
- const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)\r
-{\r
- // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D\r
- \r
- if (descriptors.cols == 64)\r
- {\r
- compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);\r
- cudaSafeCall( cudaGetLastError() );\r
+ // compute length (square root)\r
+ __shared__ float len;\r
+ if (threadIdx.x == 0)\r
+ {\r
+ len = sqrtf(sqDesc[0]);\r
+ }\r
+ __syncthreads();\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ // normalize and store in output\r
+ descriptor_base[threadIdx.x] = lookup / len;\r
+ }\r
\r
- normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);\r
- cudaSafeCall( cudaGetLastError() );\r
+ void compute_descriptors_gpu(const DevMem2Df& descriptors, \r
+ const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)\r
+ {\r
+ // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D\r
+ \r
+ if (descriptors.cols == 64)\r
+ {\r
+ compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
- }\r
- else\r
- {\r
- compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir); \r
- cudaSafeCall( cudaGetLastError() );\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors); \r
- cudaSafeCall( cudaGetLastError() );\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+ else\r
+ {\r
+ compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir); \r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
- }\r
-}\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
\r
-} // namespace surf\r
+ normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors); \r
+ cudaSafeCall( cudaGetLastError() );\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+ }\r
+ } // namespace surf\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#include "opencv2/gpu/stream_accessor.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream = 0);\r
-\r
-template <typename T>\r
-void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);\r
-template <typename T>\r
-void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream = 0);\r
\r
-void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);\r
+ template <typename T>\r
+ void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);\r
+ template <typename T>\r
+ void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);\r
+}}}\r
\r
-using namespace OPENCV_DEVICE_NAMESPACE;\r
+using namespace ::cv::gpu::device;\r
\r
struct Stream::Impl\r
{\r
////////////////////////////////////////////////////////////////////////\r
// add\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template <typename T, typename D> \r
-void add_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-template <typename T, typename D> \r
-void add_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ template <typename T, typename D> \r
+ void add_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template <typename T, typename D> \r
+ void add_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+}}}\r
\r
void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
\r
\r
void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
\r
////////////////////////////////////////////////////////////////////////\r
// subtract\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template <typename T, typename D> \r
-void subtract_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-template <typename T, typename D> \r
-void subtract_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ template <typename T, typename D> \r
+ void subtract_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template <typename T, typename D> \r
+ void subtract_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+}}}\r
\r
void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
\r
\r
void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
\r
////////////////////////////////////////////////////////////////////////\r
// multiply\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);\r
-void multiply_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);\r
-\r
-template <typename T, typename D> \r
-void multiply_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);\r
+ void multiply_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);\r
\r
-template <typename T, typename D> \r
-void multiply_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template <typename T, typename D> \r
+ void multiply_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template <typename T, typename D> \r
+ void multiply_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+}}}\r
\r
void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
\r
\r
void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
\r
////////////////////////////////////////////////////////////////////////\r
// divide\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-void divide_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);\r
-void divide_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);\r
-\r
-template <typename T, typename D> \r
-void divide_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ void divide_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);\r
+ void divide_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);\r
\r
-template <typename T, typename D> \r
-void divide_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+ template <typename T, typename D> \r
+ void divide_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
\r
-template <typename T, typename D> \r
-void divide_gpu(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template <typename T, typename D> \r
+ void divide_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template <typename T, typename D> \r
+ void divide_gpu(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+}}}\r
\r
void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
\r
\r
void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
\r
\r
void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, Stream& s)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
typedef void (*func_t)(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
\r
//////////////////////////////////////////////////////////////////////////////\r
// absdiff\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template <typename T>\r
-void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-template <typename T> \r
-void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ template <typename T>\r
+ void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template <typename T> \r
+ void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);\r
+}}}\r
\r
void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
\r
\r
void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& s)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);\r
\r
//////////////////////////////////////////////////////////////////////////////\r
// Comparison of two matrixes\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template <typename T> void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template <typename T> void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template <typename T> void compare_lt(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-template <typename T> void compare_le(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ template <typename T> void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template <typename T> void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template <typename T> void compare_lt(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+ template <typename T> void compare_le(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+}}}\r
\r
void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
\r
//////////////////////////////////////////////////////////////////////////////\r
// Unary bitwise logical operations\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src, PtrStepb dst, cudaStream_t stream);\r
-\r
-template <typename T>\r
-void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src, PtrStepb dst, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template <typename T>\r
+ void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);\r
+}}}\r
\r
namespace\r
{\r
{\r
dst.create(src.size(), src.type());\r
\r
- OPENCV_DEVICE_NAMESPACE_ bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), dst.channels(), src, dst, stream);\r
+ ::cv::gpu::device::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), dst.channels(), src, dst, stream);\r
}\r
\r
\r
void bitwiseNotCaller(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
\r
//////////////////////////////////////////////////////////////////////////////\r
// Binary bitwise logical operations\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);\r
-\r
-template <typename T>\r
-void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);\r
\r
-void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);\r
+ template <typename T>\r
+ void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);\r
\r
-template <typename T>\r
-void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);\r
+ void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);\r
\r
-void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);\r
+ template <typename T>\r
+ void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);\r
\r
-template <typename T>\r
-void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);\r
+ void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template <typename T>\r
+ void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);\r
+}}}\r
\r
namespace\r
{\r
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());\r
dst.create(src1.size(), src1.type());\r
\r
- OPENCV_DEVICE_NAMESPACE_ bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);\r
+ ::cv::gpu::device::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);\r
}\r
\r
void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
\r
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());\r
dst.create(src1.size(), src1.type());\r
\r
- OPENCV_DEVICE_NAMESPACE_ bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);\r
+ ::cv::gpu::device::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);\r
}\r
\r
\r
void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
\r
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());\r
dst.create(src1.size(), src1.type());\r
\r
- OPENCV_DEVICE_NAMESPACE_ bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);\r
+ ::cv::gpu::device::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);\r
}\r
\r
\r
void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
\r
//////////////////////////////////////////////////////////////////////////////\r
// Minimum and maximum operations\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template <typename T>\r
-void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
-\r
-template <typename T>\r
-void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ template <typename T>\r
+ void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
\r
-template <typename T>\r
-void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
+ template <typename T>\r
+ void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
\r
-template <typename T>\r
-void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
+ template <typename T>\r
+ void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template <typename T>\r
+ void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
+}}}\r
\r
namespace\r
{\r
{\r
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());\r
dst.create(src1.size(), src1.type());\r
- OPENCV_DEVICE_NAMESPACE_ min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);\r
+ ::cv::gpu::device::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);\r
}\r
\r
template <typename T>\r
void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)\r
{\r
dst.create(src1.size(), src1.type());\r
- OPENCV_DEVICE_NAMESPACE_ min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);\r
+ ::cv::gpu::device::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);\r
}\r
\r
template <typename T>\r
{\r
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());\r
dst.create(src1.size(), src1.type());\r
- OPENCV_DEVICE_NAMESPACE_ max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);\r
+ ::cv::gpu::device::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);\r
}\r
\r
template <typename T>\r
void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)\r
{\r
dst.create(src1.size(), src1.type());\r
- OPENCV_DEVICE_NAMESPACE_ max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);\r
+ ::cv::gpu::device::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);\r
}\r
}\r
\r
////////////////////////////////////////////////////////////////////////\r
// threshold\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template <typename T>\r
-void threshold_gpu(const DevMem2Db& src, const DevMem2Db& dst, T thresh, T maxVal, int type, cudaStream_t stream);\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ template <typename T>\r
+ void threshold_gpu(const DevMem2Db& src, const DevMem2Db& dst, T thresh, T maxVal, int type, cudaStream_t stream);\r
+}}}\r
\r
namespace\r
{\r
template <typename T> void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream)\r
{\r
- OPENCV_DEVICE_NAMESPACE_ threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);\r
+ ::cv::gpu::device::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);\r
}\r
}\r
\r
////////////////////////////////////////////////////////////////////////\r
// pow\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template<typename T>\r
-void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ template<typename T>\r
+ void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
+}}}\r
\r
void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
CV_Assert(src.depth() != CV_64F);\r
dst.create(src.size(), src.type());\r
////////////////////////////////////////////////////////////////////////\r
// addWeighted\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template <typename T1, typename T2, typename D>\r
-void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ template <typename T1, typename T2, typename D>\r
+ void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+}}}\r
\r
void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int dtype, Stream& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE;\r
+ using namespace ::cv::gpu::device;\r
\r
CV_Assert(src1.size() == src2.size());\r
CV_Assert(src1.type() == src2.type() || (dtype >= 0 && src1.channels() == src2.channels()));\r
////////////////////////////////////////////////////////////////////////////////////////////////////\r
// Separable Linear Filter\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace row_filter\r
+namespace cv { namespace gpu { namespace device \r
{\r
- template <typename T, typename D>\r
- void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-}\r
-\r
-namespace column_filter\r
-{\r
- template <typename T, typename D>\r
- void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-}\r
+ namespace row_filter\r
+ {\r
+ template <typename T, typename D>\r
+ void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+ }\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace column_filter\r
+ {\r
+ template <typename T, typename D>\r
+ void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
namespace\r
{\r
\r
Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ row_filter;\r
+ using namespace ::cv::gpu::device::row_filter;\r
\r
static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R};\r
\r
\r
Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ column_filter;\r
+ using namespace ::cv::gpu::device::column_filter;\r
\r
static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R};\r
\r
\r
#else\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace hog \r
+namespace cv { namespace gpu { namespace device \r
{\r
- void set_up_constants(int nbins, int block_stride_x, int block_stride_y, \r
- int nblocks_win_x, int nblocks_win_y);\r
-\r
- void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,\r
- int height, int width, const cv::gpu::DevMem2Df& grad, \r
- const cv::gpu::DevMem2Db& qangle, float sigma, float* block_hists);\r
-\r
- void normalize_hists(int nbins, int block_stride_x, int block_stride_y, \r
- int height, int width, float* block_hists, float threshold);\r
-\r
- void classify_hists(int win_height, int win_width, int block_stride_y, \r
- int block_stride_x, int win_stride_y, int win_stride_x, int height, \r
- int width, float* block_hists, float* coefs, float free_coef, \r
- float threshold, unsigned char* labels);\r
-\r
- void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, \r
- int win_stride_y, int win_stride_x, int height, int width, float* block_hists, \r
- cv::gpu::DevMem2Df descriptors);\r
- void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, \r
- int win_stride_y, int win_stride_x, int height, int width, float* block_hists, \r
- cv::gpu::DevMem2Df descriptors);\r
-\r
- void compute_gradients_8UC1(int nbins, int height, int width, const cv::gpu::DevMem2Db& img, \r
- float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);\r
- void compute_gradients_8UC4(int nbins, int height, int width, const cv::gpu::DevMem2Db& img, \r
- float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);\r
-\r
- void resize_8UC1(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);\r
- void resize_8UC4(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace hog \r
+ {\r
+ void set_up_constants(int nbins, int block_stride_x, int block_stride_y, \r
+ int nblocks_win_x, int nblocks_win_y);\r
+\r
+ void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,\r
+ int height, int width, const cv::gpu::DevMem2Df& grad, \r
+ const cv::gpu::DevMem2Db& qangle, float sigma, float* block_hists);\r
+\r
+ void normalize_hists(int nbins, int block_stride_x, int block_stride_y, \r
+ int height, int width, float* block_hists, float threshold);\r
+\r
+ void classify_hists(int win_height, int win_width, int block_stride_y, \r
+ int block_stride_x, int win_stride_y, int win_stride_x, int height, \r
+ int width, float* block_hists, float* coefs, float free_coef, \r
+ float threshold, unsigned char* labels);\r
+\r
+ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, \r
+ int win_stride_y, int win_stride_x, int height, int width, float* block_hists, \r
+ cv::gpu::DevMem2Df descriptors);\r
+ void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, \r
+ int win_stride_y, int win_stride_x, int height, int width, float* block_hists, \r
+ cv::gpu::DevMem2Df descriptors);\r
+\r
+ void compute_gradients_8UC1(int nbins, int height, int width, const cv::gpu::DevMem2Db& img, \r
+ float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);\r
+ void compute_gradients_8UC4(int nbins, int height, int width, const cv::gpu::DevMem2Db& img, \r
+ float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);\r
+\r
+ void resize_8UC1(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);\r
+ void resize_8UC4(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);\r
+ }\r
+}}}\r
\r
-using namespace OPENCV_DEVICE_NAMESPACE;\r
+using namespace ::cv::gpu::device;\r
\r
cv::gpu::HOGDescriptor::HOGDescriptor(Size win_size, Size block_size, Size block_stride, Size cell_size, \r
int nbins, double win_sigma, double threshold_L2hys, bool gamma_correction, int nlevels)\r
////////////////////////////////////////////////////////////////////////\r
// remap\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc \r
+namespace cv { namespace gpu { namespace device \r
{\r
- template <typename T> \r
- void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, \r
- int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace imgproc \r
+ {\r
+ template <typename T> \r
+ void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, \r
+ int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+ }\r
+}}}\r
\r
void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, int interpolation, int borderMode, const Scalar& borderValue, Stream& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, \r
int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
////////////////////////////////////////////////////////////////////////\r
// meanShiftFiltering_GPU\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc \r
+namespace cv { namespace gpu { namespace device \r
{\r
- void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace imgproc \r
+ {\r
+ void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria, Stream& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
if( src.empty() )\r
CV_Error( CV_StsBadArg, "The input image is empty" );\r
////////////////////////////////////////////////////////////////////////\r
// meanShiftProc_GPU\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc \r
+namespace cv { namespace gpu { namespace device \r
{\r
- void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace imgproc \r
+ {\r
+ void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria, Stream& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
if( src.empty() )\r
CV_Error( CV_StsBadArg, "The input image is empty" );\r
////////////////////////////////////////////////////////////////////////\r
// drawColorDisp\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc \r
+namespace cv { namespace gpu { namespace device \r
{\r
- void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);\r
- void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace imgproc \r
+ {\r
+ void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);\r
+ void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);\r
+ }\r
+}}}\r
\r
namespace\r
{\r
template <typename T>\r
void drawColorDisp_caller(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
dst.create(src.size(), CV_8UC4);\r
\r
////////////////////////////////////////////////////////////////////////\r
// reprojectImageTo3D\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc \r
+namespace cv { namespace gpu { namespace device \r
{\r
- void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);\r
- void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace imgproc \r
+ {\r
+ void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);\r
+ void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);\r
+ }\r
+}}}\r
\r
namespace\r
{\r
template <typename T>\r
void reprojectImageTo3D_caller(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
xyzw.create(disp.rows, disp.cols, CV_32FC4);\r
\r
////////////////////////////////////////////////////////////////////////\r
// resize\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc \r
+namespace cv { namespace gpu { namespace device \r
{\r
- template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace imgproc \r
+ {\r
+ template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)\r
{\r
}\r
else\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
typedef void (*caller_t)(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
static const caller_t callers[6][4] = \r
////////////////////////////////////////////////////////////////////////\r
// copyMakeBorder\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc \r
+namespace cv { namespace gpu { namespace device \r
{\r
- template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace imgproc \r
+ {\r
+ template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
namespace\r
{\r
template <typename T, int cn> void copyMakeBorder_caller(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
Scalar_<T> val(saturate_cast<T>(value[0]), saturate_cast<T>(value[1]), saturate_cast<T>(value[2]), saturate_cast<T>(value[3]));\r
\r
//////////////////////////////////////////////////////////////////////////////\r
// buildWarpPlaneMaps\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc \r
+namespace cv { namespace gpu { namespace device \r
{\r
- void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
- const float k_rinv[9], const float r_kinv[9], const float t[3], float scale,\r
- cudaStream_t stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace imgproc \r
+ {\r
+ void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
+ const float k_rinv[9], const float r_kinv[9], const float t[3], float scale,\r
+ cudaStream_t stream);\r
+ }\r
+}}}\r
\r
void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T, \r
float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);\r
CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);\r
//////////////////////////////////////////////////////////////////////////////\r
// buildWarpCylyndricalMaps\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc \r
+namespace cv { namespace gpu { namespace device \r
{\r
- void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
- const float k_rinv[9], const float r_kinv[9], float scale,\r
- cudaStream_t stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace imgproc \r
+ {\r
+ void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
+ const float k_rinv[9], const float r_kinv[9], float scale,\r
+ cudaStream_t stream);\r
+ }\r
+}}}\r
\r
void cv::gpu::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,\r
GpuMat& map_x, GpuMat& map_y, Stream& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);\r
CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);\r
//////////////////////////////////////////////////////////////////////////////\r
// buildWarpSphericalMaps\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc \r
+namespace cv { namespace gpu { namespace device \r
{\r
- void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
- const float k_rinv[9], const float r_kinv[9], float scale,\r
- cudaStream_t stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace imgproc \r
+ {\r
+ void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
+ const float k_rinv[9], const float r_kinv[9], float scale,\r
+ cudaStream_t stream);\r
+ }\r
+}}}\r
\r
void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,\r
GpuMat& map_x, GpuMat& map_y, Stream& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);\r
CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);\r
//////////////////////////////////////////////////////////////////////////////\r
// columnSum\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc\r
+namespace cv { namespace gpu { namespace device \r
{\r
- void columnSum_32F(const DevMem2Db src, const DevMem2Db dst);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace imgproc\r
+ {\r
+ void columnSum_32F(const DevMem2Db src, const DevMem2Db dst);\r
+ }\r
+}}}\r
\r
void cv::gpu::columnSum(const GpuMat& src, GpuMat& dst)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
CV_Assert(src.type() == CV_32F);\r
\r
hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));\r
}\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace hist\r
+namespace cv { namespace gpu { namespace device \r
{\r
- void histogram256_gpu(DevMem2Db src, int* hist, unsigned int* buf, cudaStream_t stream);\r
-\r
- const int PARTIAL_HISTOGRAM256_COUNT = 240;\r
- const int HISTOGRAM256_BIN_COUNT = 256;\r
+ namespace hist\r
+ {\r
+ void histogram256_gpu(DevMem2Db src, int* hist, unsigned int* buf, cudaStream_t stream);\r
\r
- void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream);\r
-}\r
+ const int PARTIAL_HISTOGRAM256_COUNT = 240;\r
+ const int HISTOGRAM256_BIN_COUNT = 256;\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)\r
{\r
\r
void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ hist;\r
+ using namespace ::cv::gpu::device::hist;\r
\r
CV_Assert(src.type() == CV_8UC1);\r
\r
\r
void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& s)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ hist;\r
+ using namespace ::cv::gpu::device::hist;\r
\r
CV_Assert(src.type() == CV_8UC1);\r
\r
////////////////////////////////////////////////////////////////////////\r
// cornerHarris & minEgenVal\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc \r
+namespace cv { namespace gpu { namespace device \r
{\r
- void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream);\r
- void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);\r
- void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace imgproc \r
+ {\r
+ void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream);\r
+ void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);\r
+ void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
namespace \r
{\r
\r
void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, int borderType, Stream& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
CV_Assert(borderType == cv::BORDER_REFLECT101 ||\r
borderType == cv::BORDER_REPLICATE);\r
\r
void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)\r
{ \r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
CV_Assert(borderType == cv::BORDER_REFLECT101 ||\r
borderType == cv::BORDER_REPLICATE);\r
//////////////////////////////////////////////////////////////////////////////\r
// mulSpectrums\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc \r
+namespace cv { namespace gpu { namespace device \r
{\r
- void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);\r
-\r
- void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);\r
-}\r
+ namespace imgproc \r
+ {\r
+ void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream) \r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, DevMem2D_<cufftComplex>, cudaStream_t stream);\r
\r
//////////////////////////////////////////////////////////////////////////////\r
// mulAndScaleSpectrums\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc \r
+namespace cv { namespace gpu { namespace device \r
{\r
- void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);\r
-\r
- void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);\r
-}\r
+ namespace imgproc \r
+ {\r
+ void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream) \r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, DevMem2D_<cufftComplex>, cudaStream_t stream);\r
static Caller callers[] = { mulAndScaleSpectrums, mulAndScaleSpectrums_CONJ };\r
convolve(image, templ, result, ccorr, buf);\r
}\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc\r
+namespace cv { namespace gpu { namespace device \r
{\r
- void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace imgproc\r
+ {\r
+ void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
#ifndef HAVE_CUFFT\r
\r
//////////////////////////////////////////////////////////////////////////////\r
// pyrDown\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc \r
+namespace cv { namespace gpu { namespace device \r
{\r
- template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace imgproc \r
+ {\r
+ template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
typedef void (*func_t)(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
\r
//////////////////////////////////////////////////////////////////////////////\r
// pyrUp\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace imgproc \r
+namespace cv { namespace gpu { namespace device \r
{\r
- template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace imgproc \r
+ {\r
+ template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+ using namespace ::cv::gpu::device::imgproc;\r
\r
typedef void (*func_t)(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
\r
trackBuf2.release();\r
}\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace canny \r
+namespace cv { namespace gpu { namespace device \r
{\r
- void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols);\r
-\r
- void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);\r
- void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);\r
+ namespace canny \r
+ {\r
+ void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols);\r
\r
- void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh);\r
- \r
- void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols);\r
+ void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);\r
+ void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);\r
\r
- void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols);\r
+ void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh);\r
+ \r
+ void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols);\r
\r
- void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols);\r
-}\r
+ void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols);\r
+ }\r
+}}}\r
\r
namespace\r
{\r
void CannyCaller(CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ canny;\r
+ using namespace ::cv::gpu::device::canny;\r
\r
calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh);\r
\r
\r
void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ canny;\r
+ using namespace ::cv::gpu::device::canny;\r
\r
CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));\r
CV_Assert(src.type() == CV_8UC1);\r
\r
void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ canny;\r
+ using namespace ::cv::gpu::device::canny;\r
\r
CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));\r
CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());\r
////////////////////////////////////////////////////////////////////\r
// GpuFuncTable\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t& stream = 0);\r
-\r
-template <typename T>\r
-void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);\r
-template <typename T>\r
-void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t& stream = 0);\r
\r
-void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);\r
+ template <typename T>\r
+ void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);\r
+ template <typename T>\r
+ void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);\r
+}}}\r
\r
namespace\r
{\r
\r
void convertToKernelCaller(const GpuMat& src, GpuMat& dst)\r
{\r
- OPENCV_DEVICE_NAMESPACE_ convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);\r
+ ::cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);\r
}\r
\r
//////////////////////////////////////////////////////////////////////////\r
void kernelSet(GpuMat& src, Scalar s)\r
{\r
Scalar_<T> sf = s;\r
- OPENCV_DEVICE_NAMESPACE_ set_to_gpu(src, sf.val, src.channels(), 0);\r
+ ::cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), 0);\r
}\r
\r
template<int SDEPTH, int SCN> struct NppSetMaskFunc\r
void kernelSetMask(GpuMat& src, Scalar s, const GpuMat& mask)\r
{\r
Scalar_<T> sf = s;\r
- OPENCV_DEVICE_NAMESPACE_ set_to_gpu(src, sf.val, mask, src.channels(), 0);\r
+ ::cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), 0);\r
}\r
\r
class CudaFuncTable : public GpuFuncTable\r
\r
void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const \r
{ \r
- OPENCV_DEVICE_NAMESPACE_ copy_to_with_mask(src, dst, src.depth(), mask, src.channels());\r
+ ::cv::gpu::device::copy_to_with_mask(src, dst, src.depth(), mask, src.channels());\r
}\r
\r
void convert(const GpuMat& src, GpuMat& dst) const \r
\r
void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const \r
{ \r
- device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta);\r
+ ::cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta);\r
}\r
\r
void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const\r
\r
#else\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace match_template \r
+namespace cv { namespace gpu { namespace device \r
{\r
- void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);\r
- void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);\r
-\r
- void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);\r
- void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);\r
-\r
- void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, \r
- int cn, cudaStream_t stream);\r
-\r
- void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, \r
- int cn, cudaStream_t stream);\r
-\r
- void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream);\r
- void matchTemplatePrepared_CCOFF_8UC2(\r
- int w, int h,\r
- const DevMem2D_<unsigned int> image_sum_r, \r
- const DevMem2D_<unsigned int> image_sum_g, \r
- unsigned int templ_sum_r,\r
- unsigned int templ_sum_g, \r
- DevMem2Df result, cudaStream_t stream);\r
- void matchTemplatePrepared_CCOFF_8UC3(\r
- int w, int h, \r
- const DevMem2D_<unsigned int> image_sum_r, \r
- const DevMem2D_<unsigned int> image_sum_g,\r
- const DevMem2D_<unsigned int> image_sum_b,\r
- unsigned int templ_sum_r, \r
- unsigned int templ_sum_g, \r
- unsigned int templ_sum_b, \r
- DevMem2Df result, cudaStream_t stream);\r
- void matchTemplatePrepared_CCOFF_8UC4(\r
- int w, int h, \r
- const DevMem2D_<unsigned int> image_sum_r, \r
- const DevMem2D_<unsigned int> image_sum_g,\r
- const DevMem2D_<unsigned int> image_sum_b,\r
- const DevMem2D_<unsigned int> image_sum_a,\r
- unsigned int templ_sum_r, \r
- unsigned int templ_sum_g, \r
- unsigned int templ_sum_b, \r
- unsigned int templ_sum_a, \r
- DevMem2Df result, cudaStream_t stream);\r
-\r
+ namespace match_template \r
+ {\r
+ void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);\r
+ void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);\r
\r
- void matchTemplatePrepared_CCOFF_NORMED_8U(\r
- int w, int h, const DevMem2D_<unsigned int> image_sum, \r
- const DevMem2D_<unsigned long long> image_sqsum,\r
- unsigned int templ_sum, unsigned int templ_sqsum,\r
- DevMem2Df result, cudaStream_t stream);\r
- void matchTemplatePrepared_CCOFF_NORMED_8UC2(\r
- int w, int h, \r
- const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
- const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
- unsigned int templ_sum_r, unsigned int templ_sqsum_r,\r
- unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
- DevMem2Df result, cudaStream_t stream);\r
- void matchTemplatePrepared_CCOFF_NORMED_8UC3(\r
- int w, int h, \r
- const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
- const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
- const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,\r
- unsigned int templ_sum_r, unsigned int templ_sqsum_r,\r
- unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
- unsigned int templ_sum_b, unsigned int templ_sqsum_b,\r
- DevMem2Df result, cudaStream_t stream);\r
- void matchTemplatePrepared_CCOFF_NORMED_8UC4(\r
- int w, int h, \r
- const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
- const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
- const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,\r
- const DevMem2D_<unsigned int> image_sum_a, const DevMem2D_<unsigned long long> image_sqsum_a,\r
- unsigned int templ_sum_r, unsigned int templ_sqsum_r,\r
- unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
- unsigned int templ_sum_b, unsigned int templ_sqsum_b,\r
- unsigned int templ_sum_a, unsigned int templ_sqsum_a,\r
- DevMem2Df result, cudaStream_t stream);\r
+ void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);\r
+ void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);\r
\r
- void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, \r
- unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream);\r
+ void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, \r
+ int cn, cudaStream_t stream);\r
\r
- void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream);\r
-}\r
+ void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, \r
+ int cn, cudaStream_t stream);\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream);\r
+ void matchTemplatePrepared_CCOFF_8UC2(\r
+ int w, int h,\r
+ const DevMem2D_<unsigned int> image_sum_r, \r
+ const DevMem2D_<unsigned int> image_sum_g, \r
+ unsigned int templ_sum_r,\r
+ unsigned int templ_sum_g, \r
+ DevMem2Df result, cudaStream_t stream);\r
+ void matchTemplatePrepared_CCOFF_8UC3(\r
+ int w, int h, \r
+ const DevMem2D_<unsigned int> image_sum_r, \r
+ const DevMem2D_<unsigned int> image_sum_g,\r
+ const DevMem2D_<unsigned int> image_sum_b,\r
+ unsigned int templ_sum_r, \r
+ unsigned int templ_sum_g, \r
+ unsigned int templ_sum_b, \r
+ DevMem2Df result, cudaStream_t stream);\r
+ void matchTemplatePrepared_CCOFF_8UC4(\r
+ int w, int h, \r
+ const DevMem2D_<unsigned int> image_sum_r, \r
+ const DevMem2D_<unsigned int> image_sum_g,\r
+ const DevMem2D_<unsigned int> image_sum_b,\r
+ const DevMem2D_<unsigned int> image_sum_a,\r
+ unsigned int templ_sum_r, \r
+ unsigned int templ_sum_g, \r
+ unsigned int templ_sum_b, \r
+ unsigned int templ_sum_a, \r
+ DevMem2Df result, cudaStream_t stream);\r
+\r
+\r
+ void matchTemplatePrepared_CCOFF_NORMED_8U(\r
+ int w, int h, const DevMem2D_<unsigned int> image_sum, \r
+ const DevMem2D_<unsigned long long> image_sqsum,\r
+ unsigned int templ_sum, unsigned int templ_sqsum,\r
+ DevMem2Df result, cudaStream_t stream);\r
+ void matchTemplatePrepared_CCOFF_NORMED_8UC2(\r
+ int w, int h, \r
+ const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
+ const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
+ unsigned int templ_sum_r, unsigned int templ_sqsum_r,\r
+ unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
+ DevMem2Df result, cudaStream_t stream);\r
+ void matchTemplatePrepared_CCOFF_NORMED_8UC3(\r
+ int w, int h, \r
+ const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
+ const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
+ const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,\r
+ unsigned int templ_sum_r, unsigned int templ_sqsum_r,\r
+ unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
+ unsigned int templ_sum_b, unsigned int templ_sqsum_b,\r
+ DevMem2Df result, cudaStream_t stream);\r
+ void matchTemplatePrepared_CCOFF_NORMED_8UC4(\r
+ int w, int h, \r
+ const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
+ const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
+ const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,\r
+ const DevMem2D_<unsigned int> image_sum_a, const DevMem2D_<unsigned long long> image_sqsum_a,\r
+ unsigned int templ_sum_r, unsigned int templ_sqsum_r,\r
+ unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
+ unsigned int templ_sum_b, unsigned int templ_sqsum_b,\r
+ unsigned int templ_sum_a, unsigned int templ_sqsum_a,\r
+ DevMem2Df result, cudaStream_t stream);\r
+\r
+ void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, \r
+ unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream);\r
+\r
+ void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
-using namespace OPENCV_DEVICE_NAMESPACE_ match_template;\r
+using namespace ::cv::gpu::device::match_template;\r
\r
namespace \r
{\r
////////////////////////////////////////////////////////////////////////\r
// Sum\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace matrix_reductions \r
+namespace cv { namespace gpu { namespace device \r
{\r
- namespace sum\r
+ namespace matrix_reductions \r
{\r
- template <typename T>\r
- void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
+ namespace sum\r
+ {\r
+ template <typename T>\r
+ void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
\r
- template <typename T>\r
- void sumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
+ template <typename T>\r
+ void sumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
\r
- template <typename T>\r
- void absSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
+ template <typename T>\r
+ void absSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
\r
- template <typename T>\r
- void absSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
+ template <typename T>\r
+ void absSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
\r
- template <typename T>\r
- void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
+ template <typename T>\r
+ void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
\r
- template <typename T>\r
- void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
+ template <typename T>\r
+ void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
\r
- void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);\r
+ void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);\r
+ }\r
}\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+}}}\r
\r
Scalar cv::gpu::sum(const GpuMat& src) \r
{\r
\r
Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf) \r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::sum;\r
+ using namespace ::cv::gpu::device::matrix_reductions::sum;\r
\r
typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);\r
\r
\r
Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf) \r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::sum;\r
+ using namespace ::cv::gpu::device::matrix_reductions::sum;\r
\r
typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);\r
\r
\r
Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) \r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::sum;\r
+ using namespace ::cv::gpu::device::matrix_reductions::sum;\r
\r
typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);\r
\r
////////////////////////////////////////////////////////////////////////\r
// Find min or max\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace matrix_reductions \r
+namespace cv { namespace gpu { namespace device \r
{\r
- namespace minmax \r
+ namespace matrix_reductions \r
{\r
- void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);\r
- \r
- template <typename T> \r
- void minMaxCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);\r
+ namespace minmax \r
+ {\r
+ void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);\r
+ \r
+ template <typename T> \r
+ void minMaxCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);\r
\r
- template <typename T> \r
- void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);\r
+ template <typename T> \r
+ void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);\r
\r
- template <typename T> \r
- void minMaxMultipassCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);\r
+ template <typename T> \r
+ void minMaxMultipassCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);\r
\r
- template <typename T> \r
- void minMaxMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);\r
+ template <typename T> \r
+ void minMaxMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);\r
+ }\r
}\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+}}}\r
\r
\r
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)\r
\r
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::minmax;\r
+ using namespace ::cv::gpu::device::matrix_reductions::minmax;\r
\r
typedef void (*Caller)(const DevMem2Db, double*, double*, PtrStepb);\r
typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
////////////////////////////////////////////////////////////////////////\r
// Locate min and max\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace matrix_reductions \r
+namespace cv { namespace gpu { namespace device \r
{\r
- namespace minmaxloc \r
+ namespace matrix_reductions \r
{\r
- void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols, \r
- int& b1rows, int& b2cols, int& b2rows);\r
-\r
- template <typename T> \r
- void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval, \r
- int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);\r
+ namespace minmaxloc \r
+ {\r
+ void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols, \r
+ int& b1rows, int& b2cols, int& b2rows);\r
\r
- template <typename T> \r
- void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, \r
+ template <typename T> \r
+ void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval, \r
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);\r
\r
- template <typename T> \r
- void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval, \r
- int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);\r
+ template <typename T> \r
+ void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, \r
+ int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);\r
\r
- template <typename T> \r
- void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, \r
+ template <typename T> \r
+ void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval, \r
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);\r
- }\r
-}\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template <typename T> \r
+ void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, \r
+ int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);\r
+ }\r
+ }\r
+}}}\r
\r
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)\r
{ \r
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,\r
const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::minmaxloc;\r
+ using namespace ::cv::gpu::device::matrix_reductions::minmaxloc;\r
\r
typedef void (*Caller)(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
//////////////////////////////////////////////////////////////////////////////\r
// Count non-zero elements\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace matrix_reductions \r
+namespace cv { namespace gpu { namespace device \r
{\r
- namespace countnonzero \r
+ namespace matrix_reductions \r
{\r
- void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);\r
+ namespace countnonzero \r
+ {\r
+ void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);\r
\r
- template <typename T> \r
- int countNonZeroCaller(const DevMem2Db src, PtrStepb buf);\r
+ template <typename T> \r
+ int countNonZeroCaller(const DevMem2Db src, PtrStepb buf);\r
\r
- template <typename T> \r
- int countNonZeroMultipassCaller(const DevMem2Db src, PtrStepb buf);\r
+ template <typename T> \r
+ int countNonZeroMultipassCaller(const DevMem2Db src, PtrStepb buf);\r
+ }\r
}\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+}}}\r
\r
int cv::gpu::countNonZero(const GpuMat& src)\r
{\r
\r
int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::countnonzero;\r
+ using namespace ::cv::gpu::device::matrix_reductions::countnonzero;\r
\r
typedef int (*Caller)(const DevMem2Db src, PtrStepb buf);\r
\r
\r
//////////////////////////////////////////////////////////////////////////////\r
// reduce\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
\r
-namespace matrix_reductions \r
+namespace cv { namespace gpu { namespace device \r
{\r
- template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
- template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace matrix_reductions \r
+ {\r
+ template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions;\r
+ using namespace ::cv::gpu::device::matrix_reductions;\r
\r
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4 && dtype <= CV_32F);\r
CV_Assert(dim == 0 || dim == 1);\r
#include "vec_traits.hpp"\r
#include "vec_math.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-//////////////////////////////////////////////////////////////\r
-// BrdConstant\r
-\r
-template <typename D> struct BrdRowConstant\r
-{\r
- typedef D result_type;\r
-\r
- explicit __host__ __device__ __forceinline__ BrdRowConstant(int width_, const D& val_ = VecTraits<D>::all(0)) : width(width_), val(val_) {}\r
-\r
- template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
- {\r
- return x >= 0 ? saturate_cast<D>(data[x]) : val;\r
- }\r
-\r
- template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
- {\r
- return x < width ? saturate_cast<D>(data[x]) : val;\r
- }\r
-\r
- template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
- {\r
- return (x >= 0 && x < width) ? saturate_cast<D>(data[x]) : val;\r
- }\r
-\r
- const int width;\r
- const D val;\r
-};\r
-\r
-template <typename D> struct BrdColConstant\r
-{\r
- typedef D result_type;\r
-\r
- explicit __host__ __device__ __forceinline__ BrdColConstant(int height_, const D& val_ = VecTraits<D>::all(0)) : height(height_), val(val_) {}\r
-\r
- template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
- {\r
- return y >= 0 ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;\r
- }\r
-\r
- template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
- {\r
- return y < height ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;\r
- }\r
-\r
- template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
- {\r
- return (y >= 0 && y < height) ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;\r
- }\r
-\r
- const int height;\r
- const D val;\r
-};\r
-\r
-template <typename D> struct BrdConstant\r
-{\r
- typedef D result_type;\r
-\r
- __host__ __device__ __forceinline__ BrdConstant(int height_, int width_, const D& val_ = VecTraits<D>::all(0)) : height(height_), width(width_), val(val_) \r
- {\r
- }\r
-\r
- template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const\r
- {\r
- return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(((const T*)((const uchar*)data + y * step))[x]) : val;\r
- }\r
-\r
- template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const\r
- {\r
- return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;\r
- }\r
-\r
- const int height;\r
- const int width;\r
- const D val;\r
-};\r
-\r
-//////////////////////////////////////////////////////////////\r
-// BrdReplicate\r
-\r
-template <typename D> struct BrdRowReplicate\r
-{\r
- typedef D result_type;\r
-\r
- explicit __host__ __device__ __forceinline__ BrdRowReplicate(int width) : last_col(width - 1) {}\r
- template <typename U> __host__ __device__ __forceinline__ BrdRowReplicate(int width, U) : last_col(width - 1) {}\r
-\r
- __device__ __forceinline__ int idx_col_low(int x) const\r
- {\r
- return ::max(x, 0);\r
- }\r
-\r
- __device__ __forceinline__ int idx_col_high(int x) const \r
- {\r
- return ::min(x, last_col);\r
- }\r
-\r
- __device__ __forceinline__ int idx_col(int x) const\r
- {\r
- return idx_col_low(idx_col_high(x));\r
- }\r
-\r
- template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_col_low(x)]);\r
- }\r
-\r
- template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_col_high(x)]);\r
- }\r
-\r
- template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_col(x)]);\r
- }\r
-\r
- const int last_col;\r
-};\r
-\r
-template <typename D> struct BrdColReplicate\r
-{\r
- typedef D result_type;\r
-\r
- explicit __host__ __device__ __forceinline__ BrdColReplicate(int height) : last_row(height - 1) {}\r
- template <typename U> __host__ __device__ __forceinline__ BrdColReplicate(int height, U) : last_row(height - 1) {}\r
-\r
- __device__ __forceinline__ int idx_row_low(int y) const\r
- {\r
- return ::max(y, 0);\r
- }\r
-\r
- __device__ __forceinline__ int idx_row_high(int y) const \r
- {\r
- return ::min(y, last_row);\r
- }\r
-\r
- __device__ __forceinline__ int idx_row(int y) const\r
- {\r
- return idx_row_low(idx_row_high(y));\r
- }\r
-\r
- template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
- {\r
- return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step));\r
- }\r
-\r
- template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
- {\r
- return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step));\r
- }\r
-\r
- template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
- {\r
- return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step));\r
- }\r
-\r
- const int last_row;\r
-};\r
-\r
-template <typename D> struct BrdReplicate\r
-{\r
- typedef D result_type;\r
-\r
- __host__ __device__ __forceinline__ BrdReplicate(int height, int width) : last_row(height - 1), last_col(width - 1) {}\r
- template <typename U> __host__ __device__ __forceinline__ BrdReplicate(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}\r
-\r
- __device__ __forceinline__ int idx_row_low(int y) const\r
- {\r
- return ::max(y, 0);\r
- }\r
-\r
- __device__ __forceinline__ int idx_row_high(int y) const \r
- {\r
- return ::min(y, last_row);\r
- }\r
-\r
- __device__ __forceinline__ int idx_row(int y) const\r
- {\r
- return idx_row_low(idx_row_high(y));\r
- }\r
-\r
- __device__ __forceinline__ int idx_col_low(int x) const\r
- {\r
- return ::max(x, 0);\r
- }\r
-\r
- __device__ __forceinline__ int idx_col_high(int x) const \r
- {\r
- return ::min(x, last_col);\r
- }\r
-\r
- __device__ __forceinline__ int idx_col(int x) const\r
- {\r
- return idx_col_low(idx_col_high(x));\r
- }\r
-\r
- template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const \r
- {\r
- return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);\r
- }\r
-\r
- template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const \r
- {\r
- return saturate_cast<D>(src(idx_row(y), idx_col(x)));\r
- }\r
-\r
- const int last_row;\r
- const int last_col;\r
-};\r
-\r
-//////////////////////////////////////////////////////////////\r
-// BrdReflect101\r
-\r
-template <typename D> struct BrdRowReflect101\r
+namespace cv { namespace gpu { namespace device \r
{\r
- typedef D result_type;\r
-\r
- explicit __host__ __device__ __forceinline__ BrdRowReflect101(int width) : last_col(width - 1) {}\r
- template <typename U> __host__ __device__ __forceinline__ BrdRowReflect101(int width, U) : last_col(width - 1) {}\r
+ //////////////////////////////////////////////////////////////\r
+ // BrdConstant\r
\r
- __device__ __forceinline__ int idx_col_low(int x) const\r
+ template <typename D> struct BrdRowConstant\r
{\r
- return ::abs(x) % (last_col + 1);\r
- }\r
+ typedef D result_type;\r
\r
- __device__ __forceinline__ int idx_col_high(int x) const \r
- {\r
- return ::abs(last_col - ::abs(last_col - x)) % (last_col + 1);\r
- }\r
+ explicit __host__ __device__ __forceinline__ BrdRowConstant(int width_, const D& val_ = VecTraits<D>::all(0)) : width(width_), val(val_) {}\r
\r
- __device__ __forceinline__ int idx_col(int x) const\r
- {\r
- return idx_col_low(idx_col_high(x));\r
- }\r
+ template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
+ {\r
+ return x >= 0 ? saturate_cast<D>(data[x]) : val;\r
+ }\r
\r
- template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_col_low(x)]);\r
- }\r
+ template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
+ {\r
+ return x < width ? saturate_cast<D>(data[x]) : val;\r
+ }\r
\r
- template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_col_high(x)]);\r
- }\r
-\r
- template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_col(x)]);\r
- }\r
+ template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
+ {\r
+ return (x >= 0 && x < width) ? saturate_cast<D>(data[x]) : val;\r
+ }\r
\r
- const int last_col;\r
-};\r
-\r
-template <typename D> struct BrdColReflect101\r
-{\r
- typedef D result_type;\r
+ const int width;\r
+ const D val;\r
+ };\r
\r
- explicit __host__ __device__ __forceinline__ BrdColReflect101(int height) : last_row(height - 1) {}\r
- template <typename U> __host__ __device__ __forceinline__ BrdColReflect101(int height, U) : last_row(height - 1) {}\r
-\r
- __device__ __forceinline__ int idx_row_low(int y) const\r
- {\r
- return ::abs(y) % (last_row + 1);\r
- }\r
-\r
- __device__ __forceinline__ int idx_row_high(int y) const \r
- {\r
- return ::abs(last_row - ::abs(last_row - y)) % (last_row + 1);\r
- }\r
-\r
- __device__ __forceinline__ int idx_row(int y) const\r
+ template <typename D> struct BrdColConstant\r
{\r
- return idx_row_low(idx_row_high(y));\r
- }\r
+ typedef D result_type;\r
\r
- template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
- {\r
- return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));\r
- }\r
+ explicit __host__ __device__ __forceinline__ BrdColConstant(int height_, const D& val_ = VecTraits<D>::all(0)) : height(height_), val(val_) {}\r
\r
- template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
- {\r
- return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));\r
- }\r
+ template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
+ {\r
+ return y >= 0 ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;\r
+ }\r
\r
- template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
- {\r
- return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));\r
- }\r
+ template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
+ {\r
+ return y < height ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;\r
+ }\r
\r
- const int last_row;\r
-};\r
-\r
-template <typename D> struct BrdReflect101\r
-{\r
- typedef D result_type;\r
+ template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
+ {\r
+ return (y >= 0 && y < height) ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;\r
+ }\r
\r
- __host__ __device__ __forceinline__ BrdReflect101(int height, int width) : last_row(height - 1), last_col(width - 1) {}\r
- template <typename U> __host__ __device__ __forceinline__ BrdReflect101(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}\r
+ const int height;\r
+ const D val;\r
+ };\r
\r
- __device__ __forceinline__ int idx_row_low(int y) const\r
+ template <typename D> struct BrdConstant\r
{\r
- return ::abs(y) % (last_row + 1);\r
- }\r
+ typedef D result_type;\r
\r
- __device__ __forceinline__ int idx_row_high(int y) const \r
- {\r
- return ::abs(last_row - ::abs(last_row - y)) % (last_row + 1);\r
- }\r
+ __host__ __device__ __forceinline__ BrdConstant(int height_, int width_, const D& val_ = VecTraits<D>::all(0)) : height(height_), width(width_), val(val_) \r
+ {\r
+ }\r
\r
- __device__ __forceinline__ int idx_row(int y) const\r
- {\r
- return idx_row_low(idx_row_high(y));\r
- }\r
+ template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const\r
+ {\r
+ return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(((const T*)((const uchar*)data + y * step))[x]) : val;\r
+ }\r
\r
- __device__ __forceinline__ int idx_col_low(int x) const\r
- {\r
- return ::abs(x) % (last_col + 1);\r
- }\r
+ template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const\r
+ {\r
+ return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;\r
+ }\r
\r
- __device__ __forceinline__ int idx_col_high(int x) const \r
- {\r
- return ::abs(last_col - ::abs(last_col - x)) % (last_col + 1);\r
- }\r
+ const int height;\r
+ const int width;\r
+ const D val;\r
+ };\r
\r
- __device__ __forceinline__ int idx_col(int x) const\r
- {\r
- return idx_col_low(idx_col_high(x));\r
- }\r
+ //////////////////////////////////////////////////////////////\r
+ // BrdReplicate\r
\r
- template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const \r
+ template <typename D> struct BrdRowReplicate\r
{\r
- return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);\r
- }\r
+ typedef D result_type;\r
\r
- template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const \r
- {\r
- return saturate_cast<D>(src(idx_row(y), idx_col(x)));\r
- }\r
+ explicit __host__ __device__ __forceinline__ BrdRowReplicate(int width) : last_col(width - 1) {}\r
+ template <typename U> __host__ __device__ __forceinline__ BrdRowReplicate(int width, U) : last_col(width - 1) {}\r
\r
- const int last_row;\r
- const int last_col;\r
-};\r
+ __device__ __forceinline__ int idx_col_low(int x) const\r
+ {\r
+ return ::max(x, 0);\r
+ }\r
\r
-//////////////////////////////////////////////////////////////\r
-// BrdReflect\r
+ __device__ __forceinline__ int idx_col_high(int x) const \r
+ {\r
+ return ::min(x, last_col);\r
+ }\r
\r
-template <typename D> struct BrdRowReflect\r
-{\r
- typedef D result_type;\r
+ __device__ __forceinline__ int idx_col(int x) const\r
+ {\r
+ return idx_col_low(idx_col_high(x));\r
+ }\r
\r
- explicit __host__ __device__ __forceinline__ BrdRowReflect(int width) : last_col(width - 1) {}\r
- template <typename U> __host__ __device__ __forceinline__ BrdRowReflect(int width, U) : last_col(width - 1) {}\r
+ template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
+ {\r
+ return saturate_cast<D>(data[idx_col_low(x)]);\r
+ }\r
\r
- __device__ __forceinline__ int idx_col_low(int x) const\r
- {\r
- return (::abs(x) - (x < 0)) % (last_col + 1);\r
- }\r
+ template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
+ {\r
+ return saturate_cast<D>(data[idx_col_high(x)]);\r
+ }\r
\r
- __device__ __forceinline__ int idx_col_high(int x) const \r
- {\r
- return ::abs(last_col - ::abs(last_col - x) + (x > last_col)) % (last_col + 1);\r
- }\r
+ template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
+ {\r
+ return saturate_cast<D>(data[idx_col(x)]);\r
+ }\r
\r
- __device__ __forceinline__ int idx_col(int x) const\r
- {\r
- return idx_col_low(idx_col_high(x));\r
- }\r
+ const int last_col;\r
+ };\r
\r
- template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
+ template <typename D> struct BrdColReplicate\r
{\r
- return saturate_cast<D>(data[idx_col_low(x)]);\r
- }\r
+ typedef D result_type;\r
\r
- template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_col_high(x)]);\r
- }\r
+ explicit __host__ __device__ __forceinline__ BrdColReplicate(int height) : last_row(height - 1) {}\r
+ template <typename U> __host__ __device__ __forceinline__ BrdColReplicate(int height, U) : last_row(height - 1) {}\r
\r
- template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_col(x)]);\r
- }\r
+ __device__ __forceinline__ int idx_row_low(int y) const\r
+ {\r
+ return ::max(y, 0);\r
+ }\r
\r
- const int last_col;\r
-};\r
+ __device__ __forceinline__ int idx_row_high(int y) const \r
+ {\r
+ return ::min(y, last_row);\r
+ }\r
\r
-template <typename D> struct BrdColReflect\r
-{\r
- typedef D result_type;\r
+ __device__ __forceinline__ int idx_row(int y) const\r
+ {\r
+ return idx_row_low(idx_row_high(y));\r
+ }\r
\r
- explicit __host__ __device__ __forceinline__ BrdColReflect(int height) : last_row(height - 1) {}\r
- template <typename U> __host__ __device__ __forceinline__ BrdColReflect(int height, U) : last_row(height - 1) {}\r
+ template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
+ {\r
+ return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step));\r
+ }\r
\r
- __device__ __forceinline__ int idx_row_low(int y) const\r
- {\r
- return (::abs(y) - (y < 0)) % (last_row + 1);\r
- }\r
+ template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
+ {\r
+ return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step));\r
+ }\r
\r
- __device__ __forceinline__ int idx_row_high(int y) const \r
- {\r
- return ::abs(last_row - ::abs(last_row - y) + (y > last_row)) % (last_row + 1);\r
- }\r
+ template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
+ {\r
+ return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step));\r
+ }\r
\r
- __device__ __forceinline__ int idx_row(int y) const\r
- {\r
- return idx_row_low(idx_row_high(y));\r
- }\r
+ const int last_row;\r
+ };\r
\r
- template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
+ template <typename D> struct BrdReplicate\r
{\r
- return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));\r
- }\r
+ typedef D result_type;\r
\r
- template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
- {\r
- return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));\r
- }\r
+ __host__ __device__ __forceinline__ BrdReplicate(int height, int width) : last_row(height - 1), last_col(width - 1) {}\r
+ template <typename U> __host__ __device__ __forceinline__ BrdReplicate(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}\r
\r
- template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
- {\r
- return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));\r
- }\r
+ __device__ __forceinline__ int idx_row_low(int y) const\r
+ {\r
+ return ::max(y, 0);\r
+ }\r
\r
- const int last_row;\r
-};\r
+ __device__ __forceinline__ int idx_row_high(int y) const \r
+ {\r
+ return ::min(y, last_row);\r
+ }\r
\r
-template <typename D> struct BrdReflect\r
-{\r
- typedef D result_type;\r
+ __device__ __forceinline__ int idx_row(int y) const\r
+ {\r
+ return idx_row_low(idx_row_high(y));\r
+ }\r
\r
- __host__ __device__ __forceinline__ BrdReflect(int height, int width) : last_row(height - 1), last_col(width - 1) {}\r
- template <typename U> __host__ __device__ __forceinline__ BrdReflect(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}\r
+ __device__ __forceinline__ int idx_col_low(int x) const\r
+ {\r
+ return ::max(x, 0);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_col_high(int x) const \r
+ {\r
+ return ::min(x, last_col);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_col(int x) const\r
+ {\r
+ return idx_col_low(idx_col_high(x));\r
+ }\r
\r
- __device__ __forceinline__ int idx_row_low(int y) const\r
- {\r
- return (::abs(y) - (y < 0)) % (last_row + 1);\r
- }\r
+ template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const \r
+ {\r
+ return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);\r
+ }\r
\r
- __device__ __forceinline__ int idx_row_high(int y) const \r
- {\r
- return /*::abs*/(last_row - ::abs(last_row - y) + (y > last_row)) /*% (last_row + 1)*/;\r
- }\r
+ template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const \r
+ {\r
+ return saturate_cast<D>(src(idx_row(y), idx_col(x)));\r
+ }\r
\r
- __device__ __forceinline__ int idx_row(int y) const\r
- {\r
- return idx_row_low(idx_row_high(y));\r
- }\r
-\r
- __device__ __forceinline__ int idx_col_low(int x) const\r
- {\r
- return (::abs(x) - (x < 0)) % (last_col + 1);\r
- }\r
+ const int last_row;\r
+ const int last_col;\r
+ };\r
\r
- __device__ __forceinline__ int idx_col_high(int x) const \r
- {\r
- return /*::abs*/(last_col - ::abs(last_col - x) + (x > last_col)) /*% (last_col + 1)*/;\r
- }\r
+ //////////////////////////////////////////////////////////////\r
+ // BrdReflect101\r
\r
- __device__ __forceinline__ int idx_col(int x) const\r
+ template <typename D> struct BrdRowReflect101\r
{\r
- return idx_col_low(idx_col_high(x));\r
- }\r
+ typedef D result_type;\r
\r
- template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const \r
- {\r
- return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);\r
- }\r
+ explicit __host__ __device__ __forceinline__ BrdRowReflect101(int width) : last_col(width - 1) {}\r
+ template <typename U> __host__ __device__ __forceinline__ BrdRowReflect101(int width, U) : last_col(width - 1) {}\r
\r
- template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const \r
- {\r
- return saturate_cast<D>(src(idx_row(y), idx_col(x)));\r
- }\r
+ __device__ __forceinline__ int idx_col_low(int x) const\r
+ {\r
+ return ::abs(x) % (last_col + 1);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_col_high(int x) const \r
+ {\r
+ return ::abs(last_col - ::abs(last_col - x)) % (last_col + 1);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_col(int x) const\r
+ {\r
+ return idx_col_low(idx_col_high(x));\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
+ {\r
+ return saturate_cast<D>(data[idx_col_low(x)]);\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
+ {\r
+ return saturate_cast<D>(data[idx_col_high(x)]);\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
+ {\r
+ return saturate_cast<D>(data[idx_col(x)]);\r
+ }\r
\r
- const int last_row;\r
- const int last_col;\r
-};\r
+ const int last_col;\r
+ };\r
\r
-//////////////////////////////////////////////////////////////\r
-// BrdWrap\r
-\r
-template <typename D> struct BrdRowWrap\r
-{\r
- typedef D result_type;\r
-\r
- explicit __host__ __device__ __forceinline__ BrdRowWrap(int width_) : width(width_) {}\r
- template <typename U> __host__ __device__ __forceinline__ BrdRowWrap(int width_, U) : width(width_) {}\r
-\r
- __device__ __forceinline__ int idx_col_low(int x) const\r
+ template <typename D> struct BrdColReflect101\r
{\r
- return (x >= 0) * x + (x < 0) * (x - ((x - width + 1) / width) * width);\r
- }\r
+ typedef D result_type;\r
\r
- __device__ __forceinline__ int idx_col_high(int x) const \r
- {\r
- return (x < width) * x + (x >= width) * (x % width);\r
- }\r
+ explicit __host__ __device__ __forceinline__ BrdColReflect101(int height) : last_row(height - 1) {}\r
+ template <typename U> __host__ __device__ __forceinline__ BrdColReflect101(int height, U) : last_row(height - 1) {}\r
\r
- __device__ __forceinline__ int idx_col(int x) const\r
- {\r
- return idx_col_high(idx_col_low(x));\r
- }\r
+ __device__ __forceinline__ int idx_row_low(int y) const\r
+ {\r
+ return ::abs(y) % (last_row + 1);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_row_high(int y) const \r
+ {\r
+ return ::abs(last_row - ::abs(last_row - y)) % (last_row + 1);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_row(int y) const\r
+ {\r
+ return idx_row_low(idx_row_high(y));\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
+ {\r
+ return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
+ {\r
+ return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
+ {\r
+ return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));\r
+ }\r
\r
- template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_col_low(x)]);\r
- }\r
+ const int last_row;\r
+ };\r
\r
- template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
+ template <typename D> struct BrdReflect101\r
{\r
- return saturate_cast<D>(data[idx_col_high(x)]);\r
- }\r
+ typedef D result_type;\r
\r
- template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
- {\r
- return saturate_cast<D>(data[idx_col(x)]);\r
- }\r
+ __host__ __device__ __forceinline__ BrdReflect101(int height, int width) : last_row(height - 1), last_col(width - 1) {}\r
+ template <typename U> __host__ __device__ __forceinline__ BrdReflect101(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}\r
\r
- const int width;\r
-};\r
+ __device__ __forceinline__ int idx_row_low(int y) const\r
+ {\r
+ return ::abs(y) % (last_row + 1);\r
+ }\r
\r
-template <typename D> struct BrdColWrap\r
-{\r
- typedef D result_type;\r
+ __device__ __forceinline__ int idx_row_high(int y) const \r
+ {\r
+ return ::abs(last_row - ::abs(last_row - y)) % (last_row + 1);\r
+ }\r
\r
- explicit __host__ __device__ __forceinline__ BrdColWrap(int height_) : height(height_) {}\r
- template <typename U> __host__ __device__ __forceinline__ BrdColWrap(int height_, U) : height(height_) {}\r
+ __device__ __forceinline__ int idx_row(int y) const\r
+ {\r
+ return idx_row_low(idx_row_high(y));\r
+ }\r
\r
- __device__ __forceinline__ int idx_row_low(int y) const\r
- {\r
- return (y >= 0) * y + (y < 0) * (y - ((y - height + 1) / height) * height);\r
- }\r
+ __device__ __forceinline__ int idx_col_low(int x) const\r
+ {\r
+ return ::abs(x) % (last_col + 1);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_col_high(int x) const \r
+ {\r
+ return ::abs(last_col - ::abs(last_col - x)) % (last_col + 1);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_col(int x) const\r
+ {\r
+ return idx_col_low(idx_col_high(x));\r
+ }\r
\r
- __device__ __forceinline__ int idx_row_high(int y) const \r
- {\r
- return (y < height) * y + (y >= height) * (y % height);\r
- }\r
+ template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const \r
+ {\r
+ return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);\r
+ }\r
\r
- __device__ __forceinline__ int idx_row(int y) const\r
- {\r
- return idx_row_high(idx_row_low(y));\r
- }\r
+ template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const \r
+ {\r
+ return saturate_cast<D>(src(idx_row(y), idx_col(x)));\r
+ }\r
\r
- template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
- {\r
- return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));\r
- }\r
+ const int last_row;\r
+ const int last_col;\r
+ };\r
\r
- template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
- {\r
- return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));\r
- }\r
+ //////////////////////////////////////////////////////////////\r
+ // BrdReflect\r
\r
- template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
+ template <typename D> struct BrdRowReflect\r
{\r
- return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));\r
- }\r
+ typedef D result_type;\r
\r
- const int height;\r
-};\r
+ explicit __host__ __device__ __forceinline__ BrdRowReflect(int width) : last_col(width - 1) {}\r
+ template <typename U> __host__ __device__ __forceinline__ BrdRowReflect(int width, U) : last_col(width - 1) {}\r
\r
-template <typename D> struct BrdWrap\r
-{\r
- typedef D result_type;\r
+ __device__ __forceinline__ int idx_col_low(int x) const\r
+ {\r
+ return (::abs(x) - (x < 0)) % (last_col + 1);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_col_high(int x) const \r
+ {\r
+ return ::abs(last_col - ::abs(last_col - x) + (x > last_col)) % (last_col + 1);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_col(int x) const\r
+ {\r
+ return idx_col_low(idx_col_high(x));\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
+ {\r
+ return saturate_cast<D>(data[idx_col_low(x)]);\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
+ {\r
+ return saturate_cast<D>(data[idx_col_high(x)]);\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
+ {\r
+ return saturate_cast<D>(data[idx_col(x)]);\r
+ }\r
\r
- __host__ __device__ __forceinline__ BrdWrap(int height_, int width_) : \r
- height(height_), width(width_) \r
- {\r
- }\r
- template <typename U> \r
- __host__ __device__ __forceinline__ BrdWrap(int height_, int width_, U) : \r
- height(height_), width(width_) \r
- {\r
- }\r
-\r
- __device__ __forceinline__ int idx_row_low(int y) const\r
- {\r
- return (y >= 0) * y + (y < 0) * (y - ((y - height + 1) / height) * height);\r
- }\r
-\r
- __device__ __forceinline__ int idx_row_high(int y) const \r
- {\r
- return (y < height) * y + (y >= height) * (y % height);\r
- }\r
+ const int last_col;\r
+ };\r
\r
- __device__ __forceinline__ int idx_row(int y) const\r
+ template <typename D> struct BrdColReflect\r
{\r
- return idx_row_high(idx_row_low(y));\r
- }\r
+ typedef D result_type;\r
\r
- __device__ __forceinline__ int idx_col_low(int x) const\r
- {\r
- return (x >= 0) * x + (x < 0) * (x - ((x - width + 1) / width) * width);\r
- }\r
+ explicit __host__ __device__ __forceinline__ BrdColReflect(int height) : last_row(height - 1) {}\r
+ template <typename U> __host__ __device__ __forceinline__ BrdColReflect(int height, U) : last_row(height - 1) {}\r
\r
- __device__ __forceinline__ int idx_col_high(int x) const \r
- {\r
- return (x < width) * x + (x >= width) * (x % width);\r
- }\r
-\r
- __device__ __forceinline__ int idx_col(int x) const\r
- {\r
- return idx_col_high(idx_col_low(x));\r
- }\r
+ __device__ __forceinline__ int idx_row_low(int y) const\r
+ {\r
+ return (::abs(y) - (y < 0)) % (last_row + 1);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_row_high(int y) const \r
+ {\r
+ return ::abs(last_row - ::abs(last_row - y) + (y > last_row)) % (last_row + 1);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_row(int y) const\r
+ {\r
+ return idx_row_low(idx_row_high(y));\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
+ {\r
+ return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
+ {\r
+ return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
+ {\r
+ return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));\r
+ }\r
\r
- template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const \r
- {\r
- return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);\r
- }\r
+ const int last_row;\r
+ };\r
\r
- template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const \r
+ template <typename D> struct BrdReflect\r
{\r
- return saturate_cast<D>(src(idx_row(y), idx_col(x)));\r
- }\r
-\r
- const int height;\r
- const int width;\r
-};\r
-\r
-//////////////////////////////////////////////////////////////\r
-// BorderReader\r
-\r
-template <typename Ptr2D, typename B> struct BorderReader\r
-{\r
- typedef typename B::result_type elem_type;\r
- typedef typename Ptr2D::index_type index_type;\r
+ typedef D result_type;\r
\r
- __host__ __device__ __forceinline__ BorderReader(const Ptr2D& ptr_, const B& b_) : ptr(ptr_), b(b_) {}\r
+ __host__ __device__ __forceinline__ BrdReflect(int height, int width) : last_row(height - 1), last_col(width - 1) {}\r
+ template <typename U> __host__ __device__ __forceinline__ BrdReflect(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}\r
\r
- __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const\r
- {\r
- return b.at(y, x, ptr);\r
- }\r
+ __device__ __forceinline__ int idx_row_low(int y) const\r
+ {\r
+ return (::abs(y) - (y < 0)) % (last_row + 1);\r
+ }\r
\r
- const Ptr2D ptr;\r
- const B b;\r
-};\r
+ __device__ __forceinline__ int idx_row_high(int y) const \r
+ {\r
+ return /*::abs*/(last_row - ::abs(last_row - y) + (y > last_row)) /*% (last_row + 1)*/;\r
+ }\r
\r
-// under win32 there is some bug with templated types that passed as kernel parameters\r
-// with this specialization all works fine\r
-template <typename Ptr2D, typename D> struct BorderReader< Ptr2D, BrdConstant<D> >\r
-{\r
- typedef typename BrdConstant<D>::result_type elem_type;\r
- typedef typename Ptr2D::index_type index_type;\r
+ __device__ __forceinline__ int idx_row(int y) const\r
+ {\r
+ return idx_row_low(idx_row_high(y));\r
+ }\r
\r
- __host__ __device__ __forceinline__ BorderReader(const Ptr2D& src_, const BrdConstant<D>& b) : \r
- src(src_), height(b.height), width(b.width), val(b.val) \r
- {\r
- }\r
-\r
- __device__ __forceinline__ D operator ()(index_type y, index_type x) const\r
- {\r
- return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;\r
- }\r
+ __device__ __forceinline__ int idx_col_low(int x) const\r
+ {\r
+ return (::abs(x) - (x < 0)) % (last_col + 1);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_col_high(int x) const \r
+ {\r
+ return (last_col - ::abs(last_col - x) + (x > last_col));\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_col(int x) const\r
+ {\r
+ return idx_col_low(idx_col_high(x));\r
+ }\r
\r
- const Ptr2D src;\r
- const int height;\r
- const int width;\r
- const D val;\r
-};\r
+ template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const \r
+ {\r
+ return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);\r
+ }\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const \r
+ {\r
+ return saturate_cast<D>(src(idx_row(y), idx_col(x)));\r
+ }\r
+\r
+ const int last_row;\r
+ const int last_col;\r
+ };\r
+\r
+ //////////////////////////////////////////////////////////////\r
+ // BrdWrap\r
+\r
+ template <typename D> struct BrdRowWrap\r
+ {\r
+ typedef D result_type;\r
+\r
+ explicit __host__ __device__ __forceinline__ BrdRowWrap(int width_) : width(width_) {}\r
+ template <typename U> __host__ __device__ __forceinline__ BrdRowWrap(int width_, U) : width(width_) {}\r
+\r
+ __device__ __forceinline__ int idx_col_low(int x) const\r
+ {\r
+ return (x >= 0) * x + (x < 0) * (x - ((x - width + 1) / width) * width);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_col_high(int x) const \r
+ {\r
+ return (x < width) * x + (x >= width) * (x % width);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_col(int x) const\r
+ {\r
+ return idx_col_high(idx_col_low(x));\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
+ {\r
+ return saturate_cast<D>(data[idx_col_low(x)]);\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
+ {\r
+ return saturate_cast<D>(data[idx_col_high(x)]);\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
+ {\r
+ return saturate_cast<D>(data[idx_col(x)]);\r
+ }\r
+\r
+ const int width;\r
+ };\r
+\r
+ template <typename D> struct BrdColWrap\r
+ {\r
+ typedef D result_type;\r
+\r
+ explicit __host__ __device__ __forceinline__ BrdColWrap(int height_) : height(height_) {}\r
+ template <typename U> __host__ __device__ __forceinline__ BrdColWrap(int height_, U) : height(height_) {}\r
+\r
+ __device__ __forceinline__ int idx_row_low(int y) const\r
+ {\r
+ return (y >= 0) * y + (y < 0) * (y - ((y - height + 1) / height) * height);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_row_high(int y) const \r
+ {\r
+ return (y < height) * y + (y >= height) * (y % height);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_row(int y) const\r
+ {\r
+ return idx_row_high(idx_row_low(y));\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
+ {\r
+ return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
+ {\r
+ return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
+ {\r
+ return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));\r
+ }\r
+\r
+ const int height;\r
+ };\r
+\r
+ template <typename D> struct BrdWrap\r
+ {\r
+ typedef D result_type;\r
+\r
+ __host__ __device__ __forceinline__ BrdWrap(int height_, int width_) : \r
+ height(height_), width(width_) \r
+ {\r
+ }\r
+ template <typename U> \r
+ __host__ __device__ __forceinline__ BrdWrap(int height_, int width_, U) : \r
+ height(height_), width(width_) \r
+ {\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_row_low(int y) const\r
+ {\r
+ return (y >= 0) * y + (y < 0) * (y - ((y - height + 1) / height) * height);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_row_high(int y) const \r
+ {\r
+ return (y < height) * y + (y >= height) * (y % height);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_row(int y) const\r
+ {\r
+ return idx_row_high(idx_row_low(y));\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_col_low(int x) const\r
+ {\r
+ return (x >= 0) * x + (x < 0) * (x - ((x - width + 1) / width) * width);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_col_high(int x) const \r
+ {\r
+ return (x < width) * x + (x >= width) * (x % width);\r
+ }\r
+\r
+ __device__ __forceinline__ int idx_col(int x) const\r
+ {\r
+ return idx_col_high(idx_col_low(x));\r
+ }\r
+\r
+ template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const \r
+ {\r
+ return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);\r
+ }\r
+\r
+ template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const \r
+ {\r
+ return saturate_cast<D>(src(idx_row(y), idx_col(x)));\r
+ }\r
+\r
+ const int height;\r
+ const int width;\r
+ };\r
+\r
+ //////////////////////////////////////////////////////////////\r
+ // BorderReader\r
+\r
+ template <typename Ptr2D, typename B> struct BorderReader\r
+ {\r
+ typedef typename B::result_type elem_type;\r
+ typedef typename Ptr2D::index_type index_type;\r
+\r
+ __host__ __device__ __forceinline__ BorderReader(const Ptr2D& ptr_, const B& b_) : ptr(ptr_), b(b_) {}\r
+\r
+ __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const\r
+ {\r
+ return b.at(y, x, ptr);\r
+ }\r
+\r
+ const Ptr2D ptr;\r
+ const B b;\r
+ };\r
+\r
+ // under win32 there is some bug with templated types that passed as kernel parameters\r
+ // with this specialization all works fine\r
+ template <typename Ptr2D, typename D> struct BorderReader< Ptr2D, BrdConstant<D> >\r
+ {\r
+ typedef typename BrdConstant<D>::result_type elem_type;\r
+ typedef typename Ptr2D::index_type index_type;\r
+\r
+ __host__ __device__ __forceinline__ BorderReader(const Ptr2D& src_, const BrdConstant<D>& b) : \r
+ src(src_), height(b.height), width(b.width), val(b.val) \r
+ {\r
+ }\r
+\r
+ __device__ __forceinline__ D operator ()(index_type y, index_type x) const\r
+ {\r
+ return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;\r
+ }\r
+\r
+ const Ptr2D src;\r
+ const int height;\r
+ const int width;\r
+ const D val;\r
+ };\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__\r
#include "internal_shared.hpp"\r
#include "detail/color_detail.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-// All OPENCV_GPU_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implements\r
-// template <typename T> class ColorSpace1_to_ColorSpace2_traits\r
-// {\r
-// typedef ... functor_type;\r
-// static __host__ __device__ functor_type create_functor();\r
-// };\r
-\r
-OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS\r
-\r
-OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5)\r
-OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)\r
-OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)\r
-OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)\r
-OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)\r
-OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)\r
-OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)\r
-OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS\r
-\r
-OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5)\r
-OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)\r
-OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)\r
-OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)\r
-OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)\r
-OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)\r
-OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)\r
-OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS\r
-\r
-OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3)\r
-OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS\r
-\r
-OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5)\r
-OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS\r
-\r
-OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5)\r
-OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS\r
-\r
-OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS\r
-\r
-OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 2)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS\r
-\r
-OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 0)\r
-OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 0)\r
-OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 2)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS\r
-\r
-OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS\r
-\r
-OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)\r
-OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS\r
-\r
-OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS\r
-\r
-OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)\r
-OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS\r
-\r
-OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS\r
-\r
-OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)\r
-OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS\r
-\r
-OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)\r
-OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS\r
-\r
-OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)\r
-OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)\r
-OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)\r
-OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)\r
-OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ // All OPENCV_GPU_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implements\r
+ // template <typename T> class ColorSpace1_to_ColorSpace2_traits\r
+ // {\r
+ // typedef ... functor_type;\r
+ // static __host__ __device__ functor_type create_functor();\r
+ // };\r
+\r
+ OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS\r
+\r
+ OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5)\r
+ OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)\r
+ OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)\r
+ OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)\r
+ OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)\r
+ OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)\r
+ OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)\r
+ OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS\r
+\r
+ OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5)\r
+ OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)\r
+ OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)\r
+ OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)\r
+ OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)\r
+ OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)\r
+ OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)\r
+ OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS\r
+\r
+ OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3)\r
+ OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS\r
+\r
+ OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5)\r
+ OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS\r
+\r
+ OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5)\r
+ OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS\r
+\r
+ OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS\r
+\r
+ OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 2)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS\r
+\r
+ OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 0)\r
+ OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 0)\r
+ OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 2)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS\r
+\r
+ OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS\r
+\r
+ OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)\r
+ OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS\r
+\r
+ OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS\r
+\r
+ OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)\r
+ OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS\r
+\r
+ OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS\r
+\r
+ OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)\r
+ OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS\r
+\r
+ OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)\r
+ OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS\r
+\r
+ OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)\r
+ OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)\r
+ OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)\r
+ OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)\r
+ OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__\r
\r
#include "internal_shared.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-#if defined(_WIN64) || defined(__LP64__) \r
- // 64-bit register modifier for inlined asm\r
- #define OPENCV_GPU_ASM_PTR "l"\r
-#else \r
- // 32-bit register modifier for inlined asm\r
- #define OPENCV_GPU_ASM_PTR "r"\r
-#endif\r
-\r
+namespace cv { namespace gpu { namespace device \r
+{\r
#if __CUDA_ARCH__ >= 200\r
\r
// for Fermi memory space is detected automatically\r
\r
#else // __CUDA_ARCH__ >= 200 \r
\r
+ #if defined(_WIN64) || defined(__LP64__) \r
+ // 64-bit register modifier for inlined asm\r
+ #define OPENCV_GPU_ASM_PTR "l"\r
+ #else \r
+ // 32-bit register modifier for inlined asm\r
+ #define OPENCV_GPU_ASM_PTR "r"\r
+ #endif\r
+\r
template<class T> struct ForceGlob;\r
\r
#define OPENCV_GPU_DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod) \\r
} \\r
};\r
\r
- OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar, u8)\r
- OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar, s8)\r
- OPENCV_GPU_DEFINE_FORCE_GLOB_B(char, b8)\r
- OPENCV_GPU_DEFINE_FORCE_GLOB (ushort, u16, h)\r
- OPENCV_GPU_DEFINE_FORCE_GLOB (short, s16, h)\r
- OPENCV_GPU_DEFINE_FORCE_GLOB (uint, u32, r)\r
- OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r) \r
- OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f) \r
- OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d) \r
+ OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar, u8)\r
+ OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar, s8)\r
+ OPENCV_GPU_DEFINE_FORCE_GLOB_B(char, b8)\r
+ OPENCV_GPU_DEFINE_FORCE_GLOB (ushort, u16, h)\r
+ OPENCV_GPU_DEFINE_FORCE_GLOB (short, s16, h)\r
+ OPENCV_GPU_DEFINE_FORCE_GLOB (uint, u32, r)\r
+ OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r) \r
+ OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f) \r
+ OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d) \r
\r
- #undef OPENCV_GPU_DEFINE_FORCE_GLOB\r
- #undef OPENCV_GPU_DEFINE_FORCE_GLOB_B\r
+ #undef OPENCV_GPU_DEFINE_FORCE_GLOB\r
+ #undef OPENCV_GPU_DEFINE_FORCE_GLOB_B\r
+ #undef OPENCV_GPU_ASM_PTR\r
\r
#endif // __CUDA_ARCH__ >= 200\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif // __OPENCV_GPU_DATAMOV_UTILS_HPP__\r
#include "../limits.hpp"\r
#include "../functional.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-#ifndef CV_DESCALE\r
- #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))\r
-#endif\r
-\r
-namespace detail\r
+namespace cv { namespace gpu { namespace device\r
{\r
- template<typename T> struct ColorChannel\r
- {\r
- typedef float worktype_f;\r
- static __device__ __forceinline__ T max() { return numeric_limits<T>::max(); }\r
- static __device__ __forceinline__ T half() { return (T)(max()/2 + 1); }\r
- };\r
- template<> struct ColorChannel<float>\r
- {\r
- typedef float worktype_f;\r
- static __device__ __forceinline__ float max() { return 1.f; }\r
- static __device__ __forceinline__ float half() { return 0.5f; }\r
- };\r
+ #ifndef CV_DESCALE\r
+ #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))\r
+ #endif\r
\r
- template <typename T> static __device__ __forceinline__ void setAlpha(typename TypeVec<T, 3>::vec_type& vec, T val)\r
- {\r
- }\r
- template <typename T> static __device__ __forceinline__ void setAlpha(typename TypeVec<T, 4>::vec_type& vec, T val)\r
- {\r
- vec.w = val;\r
- }\r
- template <typename T> static __device__ __forceinline__ T getAlpha(const typename TypeVec<T, 3>::vec_type& vec)\r
- {\r
- return ColorChannel<T>::max();\r
- }\r
- template <typename T> static __device__ __forceinline__ T getAlpha(const typename TypeVec<T, 4>::vec_type& vec)\r
+ namespace color_detail\r
{\r
- return vec.w;\r
- }\r
+ template<typename T> struct ColorChannel\r
+ {\r
+ typedef float worktype_f;\r
+ static __device__ __forceinline__ T max() { return numeric_limits<T>::max(); }\r
+ static __device__ __forceinline__ T half() { return (T)(max()/2 + 1); }\r
+ };\r
+ template<> struct ColorChannel<float>\r
+ {\r
+ typedef float worktype_f;\r
+ static __device__ __forceinline__ float max() { return 1.f; }\r
+ static __device__ __forceinline__ float half() { return 0.5f; }\r
+ };\r
\r
- enum\r
- {\r
- yuv_shift = 14,\r
- xyz_shift = 12,\r
- R2Y = 4899,\r
- G2Y = 9617,\r
- B2Y = 1868,\r
- BLOCK_SIZE = 256\r
- };\r
-}\r
+ template <typename T> static __device__ __forceinline__ void setAlpha(typename TypeVec<T, 3>::vec_type& vec, T val)\r
+ {\r
+ }\r
+ template <typename T> static __device__ __forceinline__ void setAlpha(typename TypeVec<T, 4>::vec_type& vec, T val)\r
+ {\r
+ vec.w = val;\r
+ }\r
+ template <typename T> static __device__ __forceinline__ T getAlpha(const typename TypeVec<T, 3>::vec_type& vec)\r
+ {\r
+ return ColorChannel<T>::max();\r
+ }\r
+ template <typename T> static __device__ __forceinline__ T getAlpha(const typename TypeVec<T, 4>::vec_type& vec)\r
+ {\r
+ return vec.w;\r
+ }\r
+\r
+ enum\r
+ {\r
+ yuv_shift = 14,\r
+ xyz_shift = 12,\r
+ R2Y = 4899,\r
+ G2Y = 9617,\r
+ B2Y = 1868,\r
+ BLOCK_SIZE = 256\r
+ };\r
+ }\r
\r
////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////\r
\r
-namespace detail\r
-{\r
- template <typename T, int scn, int dcn, int bidx> struct RGB2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+ namespace color_detail\r
{\r
- __device__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ template <typename T, int scn, int dcn, int bidx> struct RGB2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
{\r
- typename TypeVec<T, dcn>::vec_type dst;\r
+ __device__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ {\r
+ typename TypeVec<T, dcn>::vec_type dst;\r
\r
- dst.x = (&src.x)[bidx];\r
- dst.y = src.y;\r
- dst.z = (&src.x)[bidx^2];\r
- setAlpha(dst, getAlpha<T>(src));\r
+ dst.x = (&src.x)[bidx];\r
+ dst.y = src.y;\r
+ dst.z = (&src.x)[bidx^2];\r
+ setAlpha(dst, getAlpha<T>(src));\r
\r
- return dst;\r
- }\r
- };\r
+ return dst;\r
+ }\r
+ };\r
\r
- template <> struct RGB2RGB<uchar, 4, 4, 2> : unary_function<uint, uint>\r
- {\r
- __device__ uint operator()(uint src) const\r
+ template <> struct RGB2RGB<uchar, 4, 4, 2> : unary_function<uint, uint>\r
{\r
- uint dst = 0;\r
-\r
- dst |= (0xffu & (src >> 16));\r
- dst |= (0xffu & (src >> 8)) << 8;\r
- dst |= (0xffu & (src)) << 16;\r
- dst |= (0xffu & (src >> 24)) << 24;\r
-\r
- return dst;\r
- }\r
- };\r
-}\r
+ __device__ uint operator()(uint src) const\r
+ {\r
+ uint dst = 0;\r
+\r
+ dst |= (0xffu & (src >> 16));\r
+ dst |= (0xffu & (src >> 8)) << 8;\r
+ dst |= (0xffu & (src)) << 16;\r
+ dst |= (0xffu & (src >> 24)) << 24;\r
+\r
+ return dst;\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(name, scn, dcn, bidx) \\r
template <typename T> struct name ## _traits \\r
{ \\r
- typedef detail::RGB2RGB<T, scn, dcn, bidx> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::RGB2RGB<T, scn, dcn, bidx> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
\r
/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////\r
\r
-namespace detail\r
-{\r
- template <int green_bits, int bidx> struct RGB2RGB5x5Converter;\r
- template<int bidx> struct RGB2RGB5x5Converter<6, bidx> \r
- {\r
- static __device__ __forceinline__ ushort cvt(const uchar3& src)\r
- {\r
- return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~3) << 3) | (((&src.x)[bidx^2] & ~7) << 8));\r
- }\r
- static __device__ __forceinline__ ushort cvt(uint src)\r
- {\r
- uint b = 0xffu & (src >> (bidx * 8));\r
- uint g = 0xffu & (src >> 8);\r
- uint r = 0xffu & (src >> ((bidx ^ 2) * 8));\r
- return (ushort)((b >> 3) | ((g & ~3) << 3) | ((r & ~7) << 8));\r
- }\r
- };\r
- template<int bidx> struct RGB2RGB5x5Converter<5, bidx> \r
+ namespace color_detail\r
{\r
- static __device__ __forceinline__ ushort cvt(const uchar3& src)\r
+ template <int green_bits, int bidx> struct RGB2RGB5x5Converter;\r
+ template<int bidx> struct RGB2RGB5x5Converter<6, bidx> \r
{\r
- return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7));\r
- }\r
- static __device__ __forceinline__ ushort cvt(uint src)\r
+ static __device__ __forceinline__ ushort cvt(const uchar3& src)\r
+ {\r
+ return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~3) << 3) | (((&src.x)[bidx^2] & ~7) << 8));\r
+ }\r
+ static __device__ __forceinline__ ushort cvt(uint src)\r
+ {\r
+ uint b = 0xffu & (src >> (bidx * 8));\r
+ uint g = 0xffu & (src >> 8);\r
+ uint r = 0xffu & (src >> ((bidx ^ 2) * 8));\r
+ return (ushort)((b >> 3) | ((g & ~3) << 3) | ((r & ~7) << 8));\r
+ }\r
+ };\r
+ template<int bidx> struct RGB2RGB5x5Converter<5, bidx> \r
{\r
- uint b = 0xffu & (src >> (bidx * 8));\r
- uint g = 0xffu & (src >> 8);\r
- uint r = 0xffu & (src >> ((bidx ^ 2) * 8));\r
- uint a = 0xffu & (src >> 24);\r
- return (ushort)((b >> 3) | ((g & ~7) << 2) | ((r & ~7) << 7) | (a * 0x8000));\r
- }\r
- };\r
-\r
- template<int scn, int bidx, int green_bits> struct RGB2RGB5x5;\r
- template<int bidx, int green_bits> struct RGB2RGB5x5<3, bidx,green_bits> : unary_function<uchar3, ushort>\r
- {\r
- __device__ __forceinline__ ushort operator()(const uchar3& src) const\r
+ static __device__ __forceinline__ ushort cvt(const uchar3& src)\r
+ {\r
+ return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7));\r
+ }\r
+ static __device__ __forceinline__ ushort cvt(uint src)\r
+ {\r
+ uint b = 0xffu & (src >> (bidx * 8));\r
+ uint g = 0xffu & (src >> 8);\r
+ uint r = 0xffu & (src >> ((bidx ^ 2) * 8));\r
+ uint a = 0xffu & (src >> 24);\r
+ return (ushort)((b >> 3) | ((g & ~7) << 2) | ((r & ~7) << 7) | (a * 0x8000));\r
+ }\r
+ };\r
+\r
+ template<int scn, int bidx, int green_bits> struct RGB2RGB5x5;\r
+ template<int bidx, int green_bits> struct RGB2RGB5x5<3, bidx,green_bits> : unary_function<uchar3, ushort>\r
{\r
- return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);\r
- }\r
- };\r
- template<int bidx, int green_bits> struct RGB2RGB5x5<4, bidx,green_bits> : unary_function<uint, ushort>\r
- {\r
- __device__ __forceinline__ ushort operator()(uint src) const\r
+ __device__ __forceinline__ ushort operator()(const uchar3& src) const\r
+ {\r
+ return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);\r
+ }\r
+ };\r
+ template<int bidx, int green_bits> struct RGB2RGB5x5<4, bidx,green_bits> : unary_function<uint, ushort>\r
{\r
- return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);\r
- }\r
- };\r
-}\r
+ __device__ __forceinline__ ushort operator()(uint src) const\r
+ {\r
+ return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(name, scn, bidx, green_bits) \\r
struct name ## _traits \\r
{ \\r
- typedef detail::RGB2RGB5x5<scn, bidx, green_bits> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::RGB2RGB5x5<scn, bidx, green_bits> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
} \\r
};\r
\r
-namespace detail\r
-{\r
- template <int green_bits, int bidx> struct RGB5x52RGBConverter; \r
- template <int bidx> struct RGB5x52RGBConverter<5, bidx>\r
+ namespace color_detail\r
{\r
- static __device__ __forceinline__ void cvt(uint src, uchar3& dst)\r
- { \r
- (&dst.x)[bidx] = src << 3;\r
- dst.y = (src >> 2) & ~7;\r
- (&dst.x)[bidx ^ 2] = (src >> 7) & ~7;\r
- }\r
- static __device__ __forceinline__ void cvt(uint src, uint& dst)\r
- { \r
- dst = 0;\r
-\r
- dst |= (0xffu & (src << 3)) << (bidx * 8);\r
- dst |= (0xffu & ((src >> 2) & ~7)) << 8;\r
- dst |= (0xffu & ((src >> 7) & ~7)) << ((bidx ^ 2) * 8);\r
- dst |= ((src & 0x8000) * 0xffu) << 24;\r
- }\r
- };\r
- template <int bidx> struct RGB5x52RGBConverter<6, bidx>\r
- {\r
- static __device__ __forceinline__ void cvt(uint src, uchar3& dst)\r
- { \r
- (&dst.x)[bidx] = src << 3;\r
- dst.y = (src >> 3) & ~3;\r
- (&dst.x)[bidx ^ 2] = (src >> 8) & ~7;\r
- }\r
- static __device__ __forceinline__ void cvt(uint src, uint& dst)\r
- { \r
- dst = 0xffu << 24;\r
-\r
- dst |= (0xffu & (src << 3)) << (bidx * 8);\r
- dst |= (0xffu &((src >> 3) & ~3)) << 8;\r
- dst |= (0xffu & ((src >> 8) & ~7)) << ((bidx ^ 2) * 8);\r
- }\r
- };\r
-\r
- template <int dcn, int bidx, int green_bits> struct RGB5x52RGB;\r
- template <int bidx, int green_bits> struct RGB5x52RGB<3, bidx, green_bits> : unary_function<ushort, uchar3>\r
- {\r
- __device__ __forceinline__ uchar3 operator()(ushort src) const\r
+ template <int green_bits, int bidx> struct RGB5x52RGBConverter; \r
+ template <int bidx> struct RGB5x52RGBConverter<5, bidx>\r
{\r
- uchar3 dst;\r
- RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst);\r
- return dst;\r
- }\r
- };\r
- template <int bidx, int green_bits> struct RGB5x52RGB<4, bidx, green_bits> : unary_function<ushort, uint>\r
- {\r
- __device__ __forceinline__ uint operator()(ushort src) const\r
+ static __device__ __forceinline__ void cvt(uint src, uchar3& dst)\r
+ { \r
+ (&dst.x)[bidx] = src << 3;\r
+ dst.y = (src >> 2) & ~7;\r
+ (&dst.x)[bidx ^ 2] = (src >> 7) & ~7;\r
+ }\r
+ static __device__ __forceinline__ void cvt(uint src, uint& dst)\r
+ { \r
+ dst = 0;\r
+\r
+ dst |= (0xffu & (src << 3)) << (bidx * 8);\r
+ dst |= (0xffu & ((src >> 2) & ~7)) << 8;\r
+ dst |= (0xffu & ((src >> 7) & ~7)) << ((bidx ^ 2) * 8);\r
+ dst |= ((src & 0x8000) * 0xffu) << 24;\r
+ }\r
+ };\r
+ template <int bidx> struct RGB5x52RGBConverter<6, bidx>\r
{\r
- uint dst;\r
- RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst);\r
- return dst;\r
- }\r
- };\r
-}\r
+ static __device__ __forceinline__ void cvt(uint src, uchar3& dst)\r
+ { \r
+ (&dst.x)[bidx] = src << 3;\r
+ dst.y = (src >> 3) & ~3;\r
+ (&dst.x)[bidx ^ 2] = (src >> 8) & ~7;\r
+ }\r
+ static __device__ __forceinline__ void cvt(uint src, uint& dst)\r
+ { \r
+ dst = 0xffu << 24;\r
+\r
+ dst |= (0xffu & (src << 3)) << (bidx * 8);\r
+ dst |= (0xffu &((src >> 3) & ~3)) << 8;\r
+ dst |= (0xffu & ((src >> 8) & ~7)) << ((bidx ^ 2) * 8);\r
+ }\r
+ };\r
+\r
+ template <int dcn, int bidx, int green_bits> struct RGB5x52RGB;\r
+ template <int bidx, int green_bits> struct RGB5x52RGB<3, bidx, green_bits> : unary_function<ushort, uchar3>\r
+ {\r
+ __device__ __forceinline__ uchar3 operator()(ushort src) const\r
+ {\r
+ uchar3 dst;\r
+ RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst);\r
+ return dst;\r
+ }\r
+ };\r
+ template <int bidx, int green_bits> struct RGB5x52RGB<4, bidx, green_bits> : unary_function<ushort, uint>\r
+ {\r
+ __device__ __forceinline__ uint operator()(ushort src) const\r
+ {\r
+ uint dst;\r
+ RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst);\r
+ return dst;\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(name, dcn, bidx, green_bits) \\r
struct name ## _traits \\r
{ \\r
- typedef detail::RGB5x52RGB<dcn, bidx, green_bits> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::RGB5x52RGB<dcn, bidx, green_bits> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
\r
///////////////////////////////// Grayscale to Color ////////////////////////////////\r
\r
-namespace detail\r
-{\r
- template <typename T, int dcn> struct Gray2RGB : unary_function<T, typename TypeVec<T, dcn>::vec_type>\r
+ namespace color_detail\r
{\r
- __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(T src) const\r
+ template <typename T, int dcn> struct Gray2RGB : unary_function<T, typename TypeVec<T, dcn>::vec_type>\r
{\r
- typename TypeVec<T, dcn>::vec_type dst;\r
+ __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(T src) const\r
+ {\r
+ typename TypeVec<T, dcn>::vec_type dst;\r
\r
- dst.z = dst.y = dst.x = src; \r
- setAlpha(dst, ColorChannel<T>::max());\r
+ dst.z = dst.y = dst.x = src; \r
+ setAlpha(dst, ColorChannel<T>::max());\r
\r
- return dst;\r
- }\r
- };\r
- template <> struct Gray2RGB<uchar, 4> : unary_function<uchar, uint>\r
- {\r
- __device__ __forceinline__ uint operator()(uint src) const\r
+ return dst;\r
+ }\r
+ };\r
+ template <> struct Gray2RGB<uchar, 4> : unary_function<uchar, uint>\r
{\r
- uint dst = 0xffu << 24;\r
+ __device__ __forceinline__ uint operator()(uint src) const\r
+ {\r
+ uint dst = 0xffu << 24;\r
\r
- dst |= src;\r
- dst |= src << 8;\r
- dst |= src << 16;\r
+ dst |= src;\r
+ dst |= src << 8;\r
+ dst |= src << 16;\r
\r
- return dst;\r
- }\r
- };\r
-}\r
+ return dst;\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(name, dcn) \\r
template <typename T> struct name ## _traits \\r
{ \\r
- typedef detail::Gray2RGB<T, dcn> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::Gray2RGB<T, dcn> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
} \\r
};\r
\r
-namespace detail\r
-{\r
- template <int green_bits> struct Gray2RGB5x5Converter;\r
- template<> struct Gray2RGB5x5Converter<6> \r
+ namespace color_detail\r
{\r
- static __device__ __forceinline__ ushort cvt(uint t)\r
+ template <int green_bits> struct Gray2RGB5x5Converter;\r
+ template<> struct Gray2RGB5x5Converter<6> \r
{\r
- return (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));\r
- }\r
- };\r
- template<> struct Gray2RGB5x5Converter<5> \r
- {\r
- static __device__ __forceinline__ ushort cvt(uint t)\r
+ static __device__ __forceinline__ ushort cvt(uint t)\r
+ {\r
+ return (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));\r
+ }\r
+ };\r
+ template<> struct Gray2RGB5x5Converter<5> \r
{\r
- t >>= 3;\r
- return (ushort)(t | (t << 5) | (t << 10));\r
- }\r
- };\r
-\r
- template<int green_bits> struct Gray2RGB5x5 : unary_function<uchar, ushort>\r
- {\r
- __device__ __forceinline__ ushort operator()(uint src) const\r
+ static __device__ __forceinline__ ushort cvt(uint t)\r
+ {\r
+ t >>= 3;\r
+ return (ushort)(t | (t << 5) | (t << 10));\r
+ }\r
+ };\r
+\r
+ template<int green_bits> struct Gray2RGB5x5 : unary_function<uchar, ushort>\r
{\r
- return Gray2RGB5x5Converter<green_bits>::cvt(src);\r
- }\r
- };\r
-}\r
+ __device__ __forceinline__ ushort operator()(uint src) const\r
+ {\r
+ return Gray2RGB5x5Converter<green_bits>::cvt(src);\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(name, green_bits) \\r
struct name ## _traits \\r
{ \\r
- typedef detail::Gray2RGB5x5<green_bits> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::Gray2RGB5x5<green_bits> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
\r
///////////////////////////////// Color to Grayscale ////////////////////////////////\r
\r
-namespace detail\r
-{\r
- template <int green_bits> struct RGB5x52GrayConverter;\r
- template <> struct RGB5x52GrayConverter<6> \r
+ namespace color_detail\r
{\r
- static __device__ __forceinline__ uchar cvt(uint t)\r
+ template <int green_bits> struct RGB5x52GrayConverter;\r
+ template <> struct RGB5x52GrayConverter<6> \r
{\r
- return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 3) & 0xfc) * G2Y + ((t >> 8) & 0xf8) * R2Y, yuv_shift);\r
- }\r
- };\r
- template <> struct RGB5x52GrayConverter<5> \r
- {\r
- static __device__ __forceinline__ uchar cvt(uint t)\r
+ static __device__ __forceinline__ uchar cvt(uint t)\r
+ {\r
+ return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 3) & 0xfc) * G2Y + ((t >> 8) & 0xf8) * R2Y, yuv_shift);\r
+ }\r
+ };\r
+ template <> struct RGB5x52GrayConverter<5> \r
{\r
- return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 2) & 0xf8) * G2Y + ((t >> 7) & 0xf8) * R2Y, yuv_shift);\r
- }\r
- }; \r
+ static __device__ __forceinline__ uchar cvt(uint t)\r
+ {\r
+ return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 2) & 0xf8) * G2Y + ((t >> 7) & 0xf8) * R2Y, yuv_shift);\r
+ }\r
+ }; \r
\r
- template<int green_bits> struct RGB5x52Gray : unary_function<ushort, uchar>\r
- {\r
- __device__ __forceinline__ uchar operator()(uint src) const\r
+ template<int green_bits> struct RGB5x52Gray : unary_function<ushort, uchar>\r
{\r
- return RGB5x52GrayConverter<green_bits>::cvt(src);\r
- }\r
- };\r
-}\r
+ __device__ __forceinline__ uchar operator()(uint src) const\r
+ {\r
+ return RGB5x52GrayConverter<green_bits>::cvt(src);\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(name, green_bits) \\r
struct name ## _traits \\r
{ \\r
- typedef detail::RGB5x52Gray<green_bits> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::RGB5x52Gray<green_bits> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
} \\r
};\r
\r
-namespace detail\r
-{\r
- template <int bidx, typename T> static __device__ __forceinline__ T RGB2GrayConvert(const T* src)\r
- {\r
- return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift);\r
- }\r
- template <int bidx> static __device__ __forceinline__ uchar RGB2GrayConvert(uint src)\r
+ namespace color_detail\r
{\r
- uint b = 0xffu & (src >> (bidx * 8));\r
- uint g = 0xffu & (src >> 8);\r
- uint r = 0xffu & (src >> ((bidx ^ 2) * 8));\r
- return CV_DESCALE((uint)(b * B2Y + g * G2Y + r * R2Y), yuv_shift);\r
- }\r
- template <int bidx> static __device__ __forceinline__ float RGB2GrayConvert(const float* src)\r
- {\r
- return src[bidx] * 0.114f + src[1] * 0.587f + src[bidx^2] * 0.299f;\r
- }\r
-\r
- template <typename T, int scn, int bidx> struct RGB2Gray : unary_function<typename TypeVec<T, scn>::vec_type, T>\r
- {\r
- __device__ __forceinline__ T operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ template <int bidx, typename T> static __device__ __forceinline__ T RGB2GrayConvert(const T* src)\r
{\r
- return RGB2GrayConvert<bidx>(&src.x);\r
+ return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift);\r
}\r
- };\r
- template <int bidx> struct RGB2Gray<uchar, 4, bidx> : unary_function<uint, uchar>\r
- {\r
- __device__ __forceinline__ uchar operator()(uint src) const\r
+ template <int bidx> static __device__ __forceinline__ uchar RGB2GrayConvert(uint src)\r
{\r
- return RGB2GrayConvert<bidx>(src);\r
+ uint b = 0xffu & (src >> (bidx * 8));\r
+ uint g = 0xffu & (src >> 8);\r
+ uint r = 0xffu & (src >> ((bidx ^ 2) * 8));\r
+ return CV_DESCALE((uint)(b * B2Y + g * G2Y + r * R2Y), yuv_shift);\r
}\r
- };\r
-}\r
+ template <int bidx> static __device__ __forceinline__ float RGB2GrayConvert(const float* src)\r
+ {\r
+ return src[bidx] * 0.114f + src[1] * 0.587f + src[bidx^2] * 0.299f;\r
+ }\r
+\r
+ template <typename T, int scn, int bidx> struct RGB2Gray : unary_function<typename TypeVec<T, scn>::vec_type, T>\r
+ {\r
+ __device__ __forceinline__ T operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ {\r
+ return RGB2GrayConvert<bidx>(&src.x);\r
+ }\r
+ };\r
+ template <int bidx> struct RGB2Gray<uchar, 4, bidx> : unary_function<uint, uchar>\r
+ {\r
+ __device__ __forceinline__ uchar operator()(uint src) const\r
+ {\r
+ return RGB2GrayConvert<bidx>(src);\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(name, scn, bidx) \\r
template <typename T> struct name ## _traits \\r
{ \\r
- typedef detail::RGB2Gray<T, scn, bidx> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::RGB2Gray<T, scn, bidx> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
\r
///////////////////////////////////// RGB <-> YUV //////////////////////////////////////\r
\r
-namespace detail\r
-{\r
- __constant__ float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };\r
- __constant__ int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 };\r
-\r
- template <int bidx, typename T, typename D> static __device__ void RGB2YUVConvert(const T* src, D& dst)\r
+ namespace color_detail\r
{\r
- const int delta = ColorChannel<T>::half() * (1 << yuv_shift);\r
+ __constant__ float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };\r
+ __constant__ int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 };\r
\r
- const int Y = CV_DESCALE(src[0] * c_RGB2YUVCoeffs_i[bidx^2] + src[1] * c_RGB2YUVCoeffs_i[1] + src[2] * c_RGB2YUVCoeffs_i[bidx], yuv_shift);\r
- const int Cr = CV_DESCALE((src[bidx^2] - Y) * c_RGB2YUVCoeffs_i[3] + delta, yuv_shift);\r
- const int Cb = CV_DESCALE((src[bidx] - Y) * c_RGB2YUVCoeffs_i[4] + delta, yuv_shift);\r
+ template <int bidx, typename T, typename D> static __device__ void RGB2YUVConvert(const T* src, D& dst)\r
+ {\r
+ const int delta = ColorChannel<T>::half() * (1 << yuv_shift);\r
\r
- dst.x = saturate_cast<T>(Y);\r
- dst.y = saturate_cast<T>(Cr);\r
- dst.z = saturate_cast<T>(Cb);\r
- }\r
- template <int bidx> static __device__ uint RGB2YUVConvert(uint src)\r
- {\r
- const uint delta = ColorChannel<uchar>::half() * (1 << yuv_shift);\r
+ const int Y = CV_DESCALE(src[0] * c_RGB2YUVCoeffs_i[bidx^2] + src[1] * c_RGB2YUVCoeffs_i[1] + src[2] * c_RGB2YUVCoeffs_i[bidx], yuv_shift);\r
+ const int Cr = CV_DESCALE((src[bidx^2] - Y) * c_RGB2YUVCoeffs_i[3] + delta, yuv_shift);\r
+ const int Cb = CV_DESCALE((src[bidx] - Y) * c_RGB2YUVCoeffs_i[4] + delta, yuv_shift);\r
\r
- const uint Y = CV_DESCALE((0xffu & src) * c_RGB2YUVCoeffs_i[bidx^2] + (0xffu & (src >> 8)) * c_RGB2YUVCoeffs_i[1] + (0xffu & (src >> 16)) * c_RGB2YUVCoeffs_i[bidx], yuv_shift);\r
- const uint Cr = CV_DESCALE(((0xffu & (src >> ((bidx ^ 2) * 8))) - Y) * c_RGB2YUVCoeffs_i[3] + delta, yuv_shift);\r
- const uint Cb = CV_DESCALE(((0xffu & (src >> (bidx * 8))) - Y) * c_RGB2YUVCoeffs_i[4] + delta, yuv_shift);\r
+ dst.x = saturate_cast<T>(Y);\r
+ dst.y = saturate_cast<T>(Cr);\r
+ dst.z = saturate_cast<T>(Cb);\r
+ }\r
+ template <int bidx> static __device__ uint RGB2YUVConvert(uint src)\r
+ {\r
+ const uint delta = ColorChannel<uchar>::half() * (1 << yuv_shift);\r
\r
- uint dst = 0;\r
+ const uint Y = CV_DESCALE((0xffu & src) * c_RGB2YUVCoeffs_i[bidx^2] + (0xffu & (src >> 8)) * c_RGB2YUVCoeffs_i[1] + (0xffu & (src >> 16)) * c_RGB2YUVCoeffs_i[bidx], yuv_shift);\r
+ const uint Cr = CV_DESCALE(((0xffu & (src >> ((bidx ^ 2) * 8))) - Y) * c_RGB2YUVCoeffs_i[3] + delta, yuv_shift);\r
+ const uint Cb = CV_DESCALE(((0xffu & (src >> (bidx * 8))) - Y) * c_RGB2YUVCoeffs_i[4] + delta, yuv_shift);\r
\r
- dst |= saturate_cast<uchar>(Y);\r
- dst |= saturate_cast<uchar>(Cr) << 8;\r
- dst |= saturate_cast<uchar>(Cb) << 16;\r
+ uint dst = 0;\r
\r
- return dst;\r
- }\r
- template <int bidx, typename D> static __device__ __forceinline__ void RGB2YUVConvert(const float* src, D& dst)\r
- {\r
- dst.x = src[0] * c_RGB2YUVCoeffs_f[bidx^2] + src[1] * c_RGB2YUVCoeffs_f[1] + src[2] * c_RGB2YUVCoeffs_f[bidx];\r
- dst.y = (src[bidx^2] - dst.x) * c_RGB2YUVCoeffs_f[3] + ColorChannel<float>::half();\r
- dst.z = (src[bidx] - dst.x) * c_RGB2YUVCoeffs_f[4] + ColorChannel<float>::half();\r
- }\r
+ dst |= saturate_cast<uchar>(Y);\r
+ dst |= saturate_cast<uchar>(Cr) << 8;\r
+ dst |= saturate_cast<uchar>(Cb) << 16;\r
\r
- template <typename T, int scn, int dcn, int bidx> struct RGB2YUV : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
- {\r
- __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const\r
- {\r
- typename TypeVec<T, dcn>::vec_type dst;\r
- RGB2YUVConvert<bidx>(&src.x, dst);\r
return dst;\r
}\r
- };\r
- template <int bidx> struct RGB2YUV<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
- {\r
- __device__ __forceinline__ uint operator ()(uint src) const\r
+ template <int bidx, typename D> static __device__ __forceinline__ void RGB2YUVConvert(const float* src, D& dst)\r
{\r
- return RGB2YUVConvert<bidx>(src);\r
+ dst.x = src[0] * c_RGB2YUVCoeffs_f[bidx^2] + src[1] * c_RGB2YUVCoeffs_f[1] + src[2] * c_RGB2YUVCoeffs_f[bidx];\r
+ dst.y = (src[bidx^2] - dst.x) * c_RGB2YUVCoeffs_f[3] + ColorChannel<float>::half();\r
+ dst.z = (src[bidx] - dst.x) * c_RGB2YUVCoeffs_f[4] + ColorChannel<float>::half();\r
}\r
- };\r
-}\r
+\r
+ template <typename T, int scn, int dcn, int bidx> struct RGB2YUV : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+ {\r
+ __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ {\r
+ typename TypeVec<T, dcn>::vec_type dst;\r
+ RGB2YUVConvert<bidx>(&src.x, dst);\r
+ return dst;\r
+ }\r
+ };\r
+ template <int bidx> struct RGB2YUV<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
+ {\r
+ __device__ __forceinline__ uint operator ()(uint src) const\r
+ {\r
+ return RGB2YUVConvert<bidx>(src);\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(name, scn, dcn, bidx) \\r
template <typename T> struct name ## _traits \\r
{ \\r
- typedef detail::RGB2YUV<T, scn, dcn, bidx> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::RGB2YUV<T, scn, dcn, bidx> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
} \\r
};\r
\r
-namespace detail\r
-{\r
- __constant__ float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f };\r
- __constant__ int c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 }; \r
-\r
- template <int bidx, typename T, typename D> static __device__ void YUV2RGBConvert(const T& src, D* dst)\r
- {\r
- const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[3], yuv_shift);\r
- const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[2] + (src.y - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[1], yuv_shift);\r
- const int r = src.x + CV_DESCALE((src.y - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[0], yuv_shift);\r
-\r
- dst[bidx] = saturate_cast<D>(b);\r
- dst[1] = saturate_cast<D>(g);\r
- dst[bidx^2] = saturate_cast<D>(r);\r
- }\r
- template <int bidx> static __device__ uint YUV2RGBConvert(uint src)\r
+ namespace color_detail\r
{\r
- const int x = 0xff & (src);\r
- const int y = 0xff & (src >> 8);\r
- const int z = 0xff & (src >> 16);\r
- \r
- const uint b = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[3], yuv_shift);\r
- const uint g = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[2] + (y - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[1], yuv_shift);\r
- const uint r = x + CV_DESCALE((y - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[0], yuv_shift);\r
-\r
- uint dst = 0xffu << 24;\r
-\r
- dst |= saturate_cast<uchar>(b) << (bidx * 8);\r
- dst |= saturate_cast<uchar>(g) << 8;\r
- dst |= saturate_cast<uchar>(r) << ((bidx ^ 2) * 8);\r
+ __constant__ float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f };\r
+ __constant__ int c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 }; \r
\r
- return dst;\r
- }\r
- template <int bidx, typename T> static __device__ __forceinline__ void YUV2RGBConvert(const T& src, float* dst)\r
- {\r
- dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[3];\r
- dst[1] = src.x + (src.z - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[2] + (src.y - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[1];\r
- dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[0];\r
- }\r
+ template <int bidx, typename T, typename D> static __device__ void YUV2RGBConvert(const T& src, D* dst)\r
+ {\r
+ const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[3], yuv_shift);\r
+ const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[2] + (src.y - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[1], yuv_shift);\r
+ const int r = src.x + CV_DESCALE((src.y - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[0], yuv_shift);\r
\r
- template <typename T, int scn, int dcn, int bidx> struct YUV2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
- {\r
- __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ dst[bidx] = saturate_cast<D>(b);\r
+ dst[1] = saturate_cast<D>(g);\r
+ dst[bidx^2] = saturate_cast<D>(r);\r
+ }\r
+ template <int bidx> static __device__ uint YUV2RGBConvert(uint src)\r
{\r
- typename TypeVec<T, dcn>::vec_type dst;\r
+ const int x = 0xff & (src);\r
+ const int y = 0xff & (src >> 8);\r
+ const int z = 0xff & (src >> 16);\r
+ \r
+ const uint b = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[3], yuv_shift);\r
+ const uint g = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[2] + (y - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[1], yuv_shift);\r
+ const uint r = x + CV_DESCALE((y - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[0], yuv_shift);\r
\r
- YUV2RGBConvert<bidx>(src, &dst.x);\r
- setAlpha(dst, ColorChannel<T>::max());\r
+ uint dst = 0xffu << 24;\r
+\r
+ dst |= saturate_cast<uchar>(b) << (bidx * 8);\r
+ dst |= saturate_cast<uchar>(g) << 8;\r
+ dst |= saturate_cast<uchar>(r) << ((bidx ^ 2) * 8);\r
\r
return dst;\r
}\r
- };\r
- template <int bidx> struct YUV2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
- {\r
- __device__ __forceinline__ uint operator ()(uint src) const\r
+ template <int bidx, typename T> static __device__ __forceinline__ void YUV2RGBConvert(const T& src, float* dst)\r
{\r
- return YUV2RGBConvert<bidx>(src);\r
+ dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[3];\r
+ dst[1] = src.x + (src.z - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[2] + (src.y - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[1];\r
+ dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[0];\r
}\r
- };\r
-}\r
+\r
+ template <typename T, int scn, int dcn, int bidx> struct YUV2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+ {\r
+ __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ {\r
+ typename TypeVec<T, dcn>::vec_type dst;\r
+\r
+ YUV2RGBConvert<bidx>(src, &dst.x);\r
+ setAlpha(dst, ColorChannel<T>::max());\r
+\r
+ return dst;\r
+ }\r
+ };\r
+ template <int bidx> struct YUV2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
+ {\r
+ __device__ __forceinline__ uint operator ()(uint src) const\r
+ {\r
+ return YUV2RGBConvert<bidx>(src);\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(name, scn, dcn, bidx) \\r
template <typename T> struct name ## _traits \\r
{ \\r
- typedef detail::YUV2RGB<T, scn, dcn, bidx> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::YUV2RGB<T, scn, dcn, bidx> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
\r
///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////\r
\r
-namespace detail\r
-{\r
- __constant__ float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};\r
- __constant__ int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};\r
-\r
- template <int bidx, typename T, typename D> static __device__ void RGB2YCrCbConvert(const T* src, D& dst)\r
+ namespace color_detail\r
{\r
- const int delta = ColorChannel<T>::half() * (1 << yuv_shift);\r
+ __constant__ float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};\r
+ __constant__ int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};\r
\r
- const int Y = CV_DESCALE(src[0] * c_RGB2YCrCbCoeffs_i[bidx^2] + src[1] * c_RGB2YCrCbCoeffs_i[1] + src[2] * c_RGB2YCrCbCoeffs_i[bidx], yuv_shift);\r
- const int Cr = CV_DESCALE((src[bidx^2] - Y) * c_RGB2YCrCbCoeffs_i[3] + delta, yuv_shift);\r
- const int Cb = CV_DESCALE((src[bidx] - Y) * c_RGB2YCrCbCoeffs_i[4] + delta, yuv_shift);\r
+ template <int bidx, typename T, typename D> static __device__ void RGB2YCrCbConvert(const T* src, D& dst)\r
+ {\r
+ const int delta = ColorChannel<T>::half() * (1 << yuv_shift);\r
\r
- dst.x = saturate_cast<T>(Y);\r
- dst.y = saturate_cast<T>(Cr);\r
- dst.z = saturate_cast<T>(Cb);\r
- }\r
- template <int bidx> static __device__ uint RGB2YCrCbConvert(uint src)\r
- {\r
- const int delta = ColorChannel<uchar>::half() * (1 << yuv_shift);\r
+ const int Y = CV_DESCALE(src[0] * c_RGB2YCrCbCoeffs_i[bidx^2] + src[1] * c_RGB2YCrCbCoeffs_i[1] + src[2] * c_RGB2YCrCbCoeffs_i[bidx], yuv_shift);\r
+ const int Cr = CV_DESCALE((src[bidx^2] - Y) * c_RGB2YCrCbCoeffs_i[3] + delta, yuv_shift);\r
+ const int Cb = CV_DESCALE((src[bidx] - Y) * c_RGB2YCrCbCoeffs_i[4] + delta, yuv_shift);\r
\r
- const uint Y = CV_DESCALE((0xffu & src) * c_RGB2YCrCbCoeffs_i[bidx^2] + (0xffu & (src >> 8)) * c_RGB2YCrCbCoeffs_i[1] + (0xffu & (src >> 16)) * c_RGB2YCrCbCoeffs_i[bidx], yuv_shift);\r
- const uint Cr = CV_DESCALE(((0xffu & (src >> ((bidx ^ 2) * 8))) - Y) * c_RGB2YCrCbCoeffs_i[3] + delta, yuv_shift);\r
- const uint Cb = CV_DESCALE(((0xffu & (src >> (bidx * 8))) - Y) * c_RGB2YCrCbCoeffs_i[4] + delta, yuv_shift);\r
+ dst.x = saturate_cast<T>(Y);\r
+ dst.y = saturate_cast<T>(Cr);\r
+ dst.z = saturate_cast<T>(Cb);\r
+ }\r
+ template <int bidx> static __device__ uint RGB2YCrCbConvert(uint src)\r
+ {\r
+ const int delta = ColorChannel<uchar>::half() * (1 << yuv_shift);\r
\r
- uint dst = 0;\r
+ const uint Y = CV_DESCALE((0xffu & src) * c_RGB2YCrCbCoeffs_i[bidx^2] + (0xffu & (src >> 8)) * c_RGB2YCrCbCoeffs_i[1] + (0xffu & (src >> 16)) * c_RGB2YCrCbCoeffs_i[bidx], yuv_shift);\r
+ const uint Cr = CV_DESCALE(((0xffu & (src >> ((bidx ^ 2) * 8))) - Y) * c_RGB2YCrCbCoeffs_i[3] + delta, yuv_shift);\r
+ const uint Cb = CV_DESCALE(((0xffu & (src >> (bidx * 8))) - Y) * c_RGB2YCrCbCoeffs_i[4] + delta, yuv_shift);\r
\r
- dst |= saturate_cast<uchar>(Y);\r
- dst |= saturate_cast<uchar>(Cr) << 8;\r
- dst |= saturate_cast<uchar>(Cb) << 16;\r
+ uint dst = 0;\r
\r
- return dst;\r
- }\r
- template <int bidx, typename D> static __device__ __forceinline__ void RGB2YCrCbConvert(const float* src, D& dst)\r
- {\r
- dst.x = src[0] * c_RGB2YCrCbCoeffs_f[bidx^2] + src[1] * c_RGB2YCrCbCoeffs_f[1] + src[2] * c_RGB2YCrCbCoeffs_f[bidx];\r
- dst.y = (src[bidx^2] - dst.x) * c_RGB2YCrCbCoeffs_f[3] + ColorChannel<float>::half();\r
- dst.z = (src[bidx] - dst.x) * c_RGB2YCrCbCoeffs_f[4] + ColorChannel<float>::half();\r
- }\r
+ dst |= saturate_cast<uchar>(Y);\r
+ dst |= saturate_cast<uchar>(Cr) << 8;\r
+ dst |= saturate_cast<uchar>(Cb) << 16;\r
\r
- template <typename T, int scn, int dcn, int bidx> struct RGB2YCrCb : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
- {\r
- __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const\r
- {\r
- typename TypeVec<T, dcn>::vec_type dst;\r
- RGB2YCrCbConvert<bidx>(&src.x, dst);\r
return dst;\r
}\r
- };\r
- template <int bidx> struct RGB2YCrCb<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
- {\r
- __device__ __forceinline__ uint operator ()(uint src) const\r
+ template <int bidx, typename D> static __device__ __forceinline__ void RGB2YCrCbConvert(const float* src, D& dst)\r
{\r
- return RGB2YCrCbConvert<bidx>(src);\r
+ dst.x = src[0] * c_RGB2YCrCbCoeffs_f[bidx^2] + src[1] * c_RGB2YCrCbCoeffs_f[1] + src[2] * c_RGB2YCrCbCoeffs_f[bidx];\r
+ dst.y = (src[bidx^2] - dst.x) * c_RGB2YCrCbCoeffs_f[3] + ColorChannel<float>::half();\r
+ dst.z = (src[bidx] - dst.x) * c_RGB2YCrCbCoeffs_f[4] + ColorChannel<float>::half();\r
}\r
- };\r
-}\r
+\r
+ template <typename T, int scn, int dcn, int bidx> struct RGB2YCrCb : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+ {\r
+ __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ {\r
+ typename TypeVec<T, dcn>::vec_type dst;\r
+ RGB2YCrCbConvert<bidx>(&src.x, dst);\r
+ return dst;\r
+ }\r
+ };\r
+ template <int bidx> struct RGB2YCrCb<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
+ {\r
+ __device__ __forceinline__ uint operator ()(uint src) const\r
+ {\r
+ return RGB2YCrCbConvert<bidx>(src);\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(name, scn, dcn, bidx) \\r
template <typename T> struct name ## _traits \\r
{ \\r
- typedef detail::RGB2YCrCb<T, scn, dcn, bidx> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::RGB2YCrCb<T, scn, dcn, bidx> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
} \\r
};\r
\r
-namespace detail\r
-{\r
- __constant__ float c_YCrCb2RGBCoeffs_f[5] = {1.403f, -0.714f, -0.344f, 1.773f};\r
- __constant__ int c_YCrCb2RGBCoeffs_i[5] = {22987, -11698, -5636, 29049};\r
-\r
- template <int bidx, typename T, typename D> static __device__ void YCrCb2RGBConvert(const T& src, D* dst)\r
- {\r
- const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[3], yuv_shift);\r
- const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[2] + (src.y - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[1], yuv_shift);\r
- const int r = src.x + CV_DESCALE((src.y - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[0], yuv_shift);\r
-\r
- dst[bidx] = saturate_cast<D>(b);\r
- dst[1] = saturate_cast<D>(g);\r
- dst[bidx^2] = saturate_cast<D>(r);\r
- }\r
- template <int bidx> static __device__ uint YCrCb2RGBConvert(uint src)\r
+ namespace color_detail\r
{\r
- const int x = 0xff & (src);\r
- const int y = 0xff & (src >> 8);\r
- const int z = 0xff & (src >> 16);\r
- \r
- const uint b = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[3], yuv_shift);\r
- const uint g = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[2] + (y - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[1], yuv_shift);\r
- const uint r = x + CV_DESCALE((y - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[0], yuv_shift);\r
-\r
- uint dst = 0xffu << 24;\r
-\r
- dst |= saturate_cast<uchar>(b) << (bidx * 8);\r
- dst |= saturate_cast<uchar>(g) << 8;\r
- dst |= saturate_cast<uchar>(r) << ((bidx ^ 2) * 8);\r
+ __constant__ float c_YCrCb2RGBCoeffs_f[5] = {1.403f, -0.714f, -0.344f, 1.773f};\r
+ __constant__ int c_YCrCb2RGBCoeffs_i[5] = {22987, -11698, -5636, 29049};\r
\r
- return dst;\r
- }\r
- template <int bidx, typename T> __device__ __forceinline__ void YCrCb2RGBConvert(const T& src, float* dst)\r
- {\r
- dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[3];\r
- dst[1] = src.x + (src.z - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[2] + (src.y - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[1];\r
- dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[0];\r
- }\r
+ template <int bidx, typename T, typename D> static __device__ void YCrCb2RGBConvert(const T& src, D* dst)\r
+ {\r
+ const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[3], yuv_shift);\r
+ const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[2] + (src.y - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[1], yuv_shift);\r
+ const int r = src.x + CV_DESCALE((src.y - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[0], yuv_shift);\r
\r
- template <typename T, int scn, int dcn, int bidx> struct YCrCb2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
- {\r
- __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ dst[bidx] = saturate_cast<D>(b);\r
+ dst[1] = saturate_cast<D>(g);\r
+ dst[bidx^2] = saturate_cast<D>(r);\r
+ }\r
+ template <int bidx> static __device__ uint YCrCb2RGBConvert(uint src)\r
{\r
- typename TypeVec<T, dcn>::vec_type dst;\r
+ const int x = 0xff & (src);\r
+ const int y = 0xff & (src >> 8);\r
+ const int z = 0xff & (src >> 16);\r
+ \r
+ const uint b = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[3], yuv_shift);\r
+ const uint g = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[2] + (y - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[1], yuv_shift);\r
+ const uint r = x + CV_DESCALE((y - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[0], yuv_shift);\r
\r
- YCrCb2RGBConvert<bidx>(src, &dst.x);\r
- setAlpha(dst, ColorChannel<T>::max());\r
+ uint dst = 0xffu << 24;\r
+\r
+ dst |= saturate_cast<uchar>(b) << (bidx * 8);\r
+ dst |= saturate_cast<uchar>(g) << 8;\r
+ dst |= saturate_cast<uchar>(r) << ((bidx ^ 2) * 8);\r
\r
return dst;\r
}\r
- };\r
- template <int bidx> struct YCrCb2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
- {\r
- __device__ __forceinline__ uint operator ()(uint src) const\r
+ template <int bidx, typename T> __device__ __forceinline__ void YCrCb2RGBConvert(const T& src, float* dst)\r
{\r
- return YCrCb2RGBConvert<bidx>(src);\r
+ dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[3];\r
+ dst[1] = src.x + (src.z - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[2] + (src.y - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[1];\r
+ dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[0];\r
}\r
- };\r
-}\r
+\r
+ template <typename T, int scn, int dcn, int bidx> struct YCrCb2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+ {\r
+ __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ {\r
+ typename TypeVec<T, dcn>::vec_type dst;\r
+\r
+ YCrCb2RGBConvert<bidx>(src, &dst.x);\r
+ setAlpha(dst, ColorChannel<T>::max());\r
+\r
+ return dst;\r
+ }\r
+ };\r
+ template <int bidx> struct YCrCb2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
+ {\r
+ __device__ __forceinline__ uint operator ()(uint src) const\r
+ {\r
+ return YCrCb2RGBConvert<bidx>(src);\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(name, scn, dcn, bidx) \\r
template <typename T> struct name ## _traits \\r
{ \\r
- typedef detail::YCrCb2RGB<T, scn, dcn, bidx> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::YCrCb2RGB<T, scn, dcn, bidx> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
\r
////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////\r
\r
-namespace detail\r
-{\r
- __constant__ float c_RGB2XYZ_D65f[9] = { 0.412453f, 0.357580f, 0.180423f, 0.212671f, 0.715160f, 0.072169f, 0.019334f, 0.119193f, 0.950227f };\r
- __constant__ int c_RGB2XYZ_D65i[9] = { 1689, 1465, 739, 871, 2929, 296, 79, 488, 3892 };\r
-\r
- template <int bidx, typename T, typename D> static __device__ __forceinline__ void RGB2XYZConvert(const T* src, D& dst)\r
- {\r
- dst.x = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[0] + src[1] * c_RGB2XYZ_D65i[1] + src[bidx] * c_RGB2XYZ_D65i[2], xyz_shift));\r
- dst.y = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[3] + src[1] * c_RGB2XYZ_D65i[4] + src[bidx] * c_RGB2XYZ_D65i[5], xyz_shift));\r
- dst.z = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[6] + src[1] * c_RGB2XYZ_D65i[7] + src[bidx] * c_RGB2XYZ_D65i[8], xyz_shift));\r
- }\r
- template <int bidx> static __device__ __forceinline__ uint RGB2XYZConvert(uint src)\r
+ namespace color_detail\r
{\r
- const uint b = 0xffu & (src >> (bidx * 8));\r
- const uint g = 0xffu & (src >> 8);\r
- const uint r = 0xffu & (src >> ((bidx ^ 2) * 8));\r
+ __constant__ float c_RGB2XYZ_D65f[9] = { 0.412453f, 0.357580f, 0.180423f, 0.212671f, 0.715160f, 0.072169f, 0.019334f, 0.119193f, 0.950227f };\r
+ __constant__ int c_RGB2XYZ_D65i[9] = { 1689, 1465, 739, 871, 2929, 296, 79, 488, 3892 };\r
\r
- const uint x = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[0] + g * c_RGB2XYZ_D65i[1] + b * c_RGB2XYZ_D65i[2], xyz_shift));\r
- const uint y = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[3] + g * c_RGB2XYZ_D65i[4] + b * c_RGB2XYZ_D65i[5], xyz_shift));\r
- const uint z = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[6] + g * c_RGB2XYZ_D65i[7] + b * c_RGB2XYZ_D65i[8], xyz_shift));\r
-\r
- uint dst = 0;\r
-\r
- dst |= x;\r
- dst |= y << 8;\r
- dst |= z << 16;\r
+ template <int bidx, typename T, typename D> static __device__ __forceinline__ void RGB2XYZConvert(const T* src, D& dst)\r
+ {\r
+ dst.x = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[0] + src[1] * c_RGB2XYZ_D65i[1] + src[bidx] * c_RGB2XYZ_D65i[2], xyz_shift));\r
+ dst.y = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[3] + src[1] * c_RGB2XYZ_D65i[4] + src[bidx] * c_RGB2XYZ_D65i[5], xyz_shift));\r
+ dst.z = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[6] + src[1] * c_RGB2XYZ_D65i[7] + src[bidx] * c_RGB2XYZ_D65i[8], xyz_shift));\r
+ }\r
+ template <int bidx> static __device__ __forceinline__ uint RGB2XYZConvert(uint src)\r
+ {\r
+ const uint b = 0xffu & (src >> (bidx * 8));\r
+ const uint g = 0xffu & (src >> 8);\r
+ const uint r = 0xffu & (src >> ((bidx ^ 2) * 8));\r
\r
- return dst;\r
- }\r
- template <int bidx, typename D> static __device__ __forceinline__ void RGB2XYZConvert(const float* src, D& dst)\r
- {\r
- dst.x = src[bidx^2] * c_RGB2XYZ_D65f[0] + src[1] * c_RGB2XYZ_D65f[1] + src[bidx] * c_RGB2XYZ_D65f[2];\r
- dst.y = src[bidx^2] * c_RGB2XYZ_D65f[3] + src[1] * c_RGB2XYZ_D65f[4] + src[bidx] * c_RGB2XYZ_D65f[5];\r
- dst.z = src[bidx^2] * c_RGB2XYZ_D65f[6] + src[1] * c_RGB2XYZ_D65f[7] + src[bidx] * c_RGB2XYZ_D65f[8];\r
- }\r
+ const uint x = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[0] + g * c_RGB2XYZ_D65i[1] + b * c_RGB2XYZ_D65i[2], xyz_shift));\r
+ const uint y = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[3] + g * c_RGB2XYZ_D65i[4] + b * c_RGB2XYZ_D65i[5], xyz_shift));\r
+ const uint z = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[6] + g * c_RGB2XYZ_D65i[7] + b * c_RGB2XYZ_D65i[8], xyz_shift));\r
\r
- template <typename T, int scn, int dcn, int bidx> struct RGB2XYZ : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
- {\r
- __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
- {\r
- typename TypeVec<T, dcn>::vec_type dst;\r
+ uint dst = 0;\r
\r
- RGB2XYZConvert<bidx>(&src.x, dst);\r
+ dst |= x;\r
+ dst |= y << 8;\r
+ dst |= z << 16;\r
\r
return dst;\r
}\r
- };\r
- template <int bidx> struct RGB2XYZ<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
- {\r
- __device__ __forceinline__ uint operator()(uint src) const\r
+ template <int bidx, typename D> static __device__ __forceinline__ void RGB2XYZConvert(const float* src, D& dst)\r
{\r
- return RGB2XYZConvert<bidx>(src);\r
+ dst.x = src[bidx^2] * c_RGB2XYZ_D65f[0] + src[1] * c_RGB2XYZ_D65f[1] + src[bidx] * c_RGB2XYZ_D65f[2];\r
+ dst.y = src[bidx^2] * c_RGB2XYZ_D65f[3] + src[1] * c_RGB2XYZ_D65f[4] + src[bidx] * c_RGB2XYZ_D65f[5];\r
+ dst.z = src[bidx^2] * c_RGB2XYZ_D65f[6] + src[1] * c_RGB2XYZ_D65f[7] + src[bidx] * c_RGB2XYZ_D65f[8];\r
}\r
- };\r
-}\r
+\r
+ template <typename T, int scn, int dcn, int bidx> struct RGB2XYZ : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+ {\r
+ __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ {\r
+ typename TypeVec<T, dcn>::vec_type dst;\r
+\r
+ RGB2XYZConvert<bidx>(&src.x, dst);\r
+\r
+ return dst;\r
+ }\r
+ };\r
+ template <int bidx> struct RGB2XYZ<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
+ {\r
+ __device__ __forceinline__ uint operator()(uint src) const\r
+ {\r
+ return RGB2XYZConvert<bidx>(src);\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(name, scn, dcn, bidx) \\r
template <typename T> struct name ## _traits \\r
{ \\r
- typedef detail::RGB2XYZ<T, scn, dcn, bidx> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::RGB2XYZ<T, scn, dcn, bidx> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
} \\r
};\r
\r
-namespace detail\r
-{\r
- __constant__ float c_XYZ2sRGB_D65f[9] = { 3.240479f, -1.53715f, -0.498535f, -0.969256f, 1.875991f, 0.041556f, 0.055648f, -0.204043f, 1.057311f };\r
- __constant__ int c_XYZ2sRGB_D65i[9] = { 13273, -6296, -2042, -3970, 7684, 170, 228, -836, 4331 };\r
-\r
- template <int bidx, typename T, typename D> static __device__ __forceinline__ void XYZ2RGBConvert(const T& src, D* dst)\r
- {\r
- dst[bidx^2] = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[0] + src.y * c_XYZ2sRGB_D65i[1] + src.z * c_XYZ2sRGB_D65i[2], xyz_shift));\r
- dst[1] = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[3] + src.y * c_XYZ2sRGB_D65i[4] + src.z * c_XYZ2sRGB_D65i[5], xyz_shift));\r
- dst[bidx] = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[6] + src.y * c_XYZ2sRGB_D65i[7] + src.z * c_XYZ2sRGB_D65i[8], xyz_shift));\r
- }\r
- template <int bidx> static __device__ __forceinline__ uint XYZ2RGBConvert(uint src)\r
+ namespace color_detail\r
{\r
- const int x = 0xff & src;\r
- const int y = 0xff & (src >> 8);\r
- const int z = 0xff & (src >> 16);\r
-\r
- const uint r = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[0] + y * c_XYZ2sRGB_D65i[1] + z * c_XYZ2sRGB_D65i[2], xyz_shift));\r
- const uint g = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[3] + y * c_XYZ2sRGB_D65i[4] + z * c_XYZ2sRGB_D65i[5], xyz_shift));\r
- const uint b = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[6] + y * c_XYZ2sRGB_D65i[7] + z * c_XYZ2sRGB_D65i[8], xyz_shift));\r
-\r
- uint dst = 0xffu << 24;\r
+ __constant__ float c_XYZ2sRGB_D65f[9] = { 3.240479f, -1.53715f, -0.498535f, -0.969256f, 1.875991f, 0.041556f, 0.055648f, -0.204043f, 1.057311f };\r
+ __constant__ int c_XYZ2sRGB_D65i[9] = { 13273, -6296, -2042, -3970, 7684, 170, 228, -836, 4331 };\r
\r
- dst |= b << (bidx * 8);\r
- dst |= g << 8;\r
- dst |= r << ((bidx ^ 2) * 8);\r
+ template <int bidx, typename T, typename D> static __device__ __forceinline__ void XYZ2RGBConvert(const T& src, D* dst)\r
+ {\r
+ dst[bidx^2] = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[0] + src.y * c_XYZ2sRGB_D65i[1] + src.z * c_XYZ2sRGB_D65i[2], xyz_shift));\r
+ dst[1] = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[3] + src.y * c_XYZ2sRGB_D65i[4] + src.z * c_XYZ2sRGB_D65i[5], xyz_shift));\r
+ dst[bidx] = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[6] + src.y * c_XYZ2sRGB_D65i[7] + src.z * c_XYZ2sRGB_D65i[8], xyz_shift));\r
+ }\r
+ template <int bidx> static __device__ __forceinline__ uint XYZ2RGBConvert(uint src)\r
+ {\r
+ const int x = 0xff & src;\r
+ const int y = 0xff & (src >> 8);\r
+ const int z = 0xff & (src >> 16);\r
\r
- return dst;\r
- }\r
- template <int bidx, typename T> static __device__ __forceinline__ void XYZ2RGBConvert(const T& src, float* dst)\r
- {\r
- dst[bidx^2] = src.x * c_XYZ2sRGB_D65f[0] + src.y * c_XYZ2sRGB_D65f[1] + src.z * c_XYZ2sRGB_D65f[2];\r
- dst[1] = src.x * c_XYZ2sRGB_D65f[3] + src.y * c_XYZ2sRGB_D65f[4] + src.z * c_XYZ2sRGB_D65f[5];\r
- dst[bidx] = src.x * c_XYZ2sRGB_D65f[6] + src.y * c_XYZ2sRGB_D65f[7] + src.z * c_XYZ2sRGB_D65f[8];\r
- }\r
+ const uint r = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[0] + y * c_XYZ2sRGB_D65i[1] + z * c_XYZ2sRGB_D65i[2], xyz_shift));\r
+ const uint g = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[3] + y * c_XYZ2sRGB_D65i[4] + z * c_XYZ2sRGB_D65i[5], xyz_shift));\r
+ const uint b = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[6] + y * c_XYZ2sRGB_D65i[7] + z * c_XYZ2sRGB_D65i[8], xyz_shift));\r
\r
- template <typename T, int scn, int dcn, int bidx> struct XYZ2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
- {\r
- __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
- {\r
- typename TypeVec<T, dcn>::vec_type dst;\r
+ uint dst = 0xffu << 24;\r
\r
- XYZ2RGBConvert<bidx>(src, &dst.x);\r
- setAlpha(dst, ColorChannel<T>::max());\r
+ dst |= b << (bidx * 8);\r
+ dst |= g << 8;\r
+ dst |= r << ((bidx ^ 2) * 8);\r
\r
return dst;\r
}\r
- };\r
- template <int bidx> struct XYZ2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
- {\r
- __device__ __forceinline__ uint operator()(uint src) const\r
+ template <int bidx, typename T> static __device__ __forceinline__ void XYZ2RGBConvert(const T& src, float* dst)\r
{\r
- return XYZ2RGBConvert<bidx>(src);\r
+ dst[bidx^2] = src.x * c_XYZ2sRGB_D65f[0] + src.y * c_XYZ2sRGB_D65f[1] + src.z * c_XYZ2sRGB_D65f[2];\r
+ dst[1] = src.x * c_XYZ2sRGB_D65f[3] + src.y * c_XYZ2sRGB_D65f[4] + src.z * c_XYZ2sRGB_D65f[5];\r
+ dst[bidx] = src.x * c_XYZ2sRGB_D65f[6] + src.y * c_XYZ2sRGB_D65f[7] + src.z * c_XYZ2sRGB_D65f[8];\r
}\r
- };\r
-}\r
+\r
+ template <typename T, int scn, int dcn, int bidx> struct XYZ2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+ {\r
+ __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ {\r
+ typename TypeVec<T, dcn>::vec_type dst;\r
+\r
+ XYZ2RGBConvert<bidx>(src, &dst.x);\r
+ setAlpha(dst, ColorChannel<T>::max());\r
+\r
+ return dst;\r
+ }\r
+ };\r
+ template <int bidx> struct XYZ2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
+ {\r
+ __device__ __forceinline__ uint operator()(uint src) const\r
+ {\r
+ return XYZ2RGBConvert<bidx>(src);\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(name, scn, dcn, bidx) \\r
template <typename T> struct name ## _traits \\r
{ \\r
- typedef detail::XYZ2RGB<T, scn, dcn, bidx> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::XYZ2RGB<T, scn, dcn, bidx> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
\r
////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////\r
\r
-namespace detail\r
-{\r
- __constant__ int c_HsvDivTable [256] = {0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211, 130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632, 65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412, 43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693, 32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782, 26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223, 21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991, 18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579, 16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711, 14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221, 13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006, 11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995, 10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141, 10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410, 9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777, 8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224, 8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737, 7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304, 7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917, 6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569, 6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254, 6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968, 5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708, 5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468, 5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249, 5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046, 5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858, 4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684, 4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522, 4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370, 4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229, 4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096};\r
- __constant__ int c_HsvDivTable180[256] = {0, 122880, 61440, 40960, 30720, 24576, 20480, 17554, 15360, 13653, 12288, 11171, 10240, 9452, 8777, 8192, 7680, 7228, 6827, 6467, 6144, 5851, 5585, 5343, 5120, 4915, 4726, 4551, 4389, 4237, 4096, 3964, 3840, 3724, 3614, 3511, 3413, 3321, 3234, 3151, 3072, 2997, 2926, 2858, 2793, 2731, 2671, 2614, 2560, 2508, 2458, 2409, 2363, 2318, 2276, 2234, 2194, 2156, 2119, 2083, 2048, 2014, 1982, 1950, 1920, 1890, 1862, 1834, 1807, 1781, 1755, 1731, 1707, 1683, 1661, 1638, 1617, 1596, 1575, 1555, 1536, 1517, 1499, 1480, 1463, 1446, 1429, 1412, 1396, 1381, 1365, 1350, 1336, 1321, 1307, 1293, 1280, 1267, 1254, 1241, 1229, 1217, 1205, 1193, 1182, 1170, 1159, 1148, 1138, 1127, 1117, 1107, 1097, 1087, 1078, 1069, 1059, 1050, 1041, 1033, 1024, 1016, 1007, 999, 991, 983, 975, 968, 960, 953, 945, 938, 931, 924, 917, 910, 904, 897, 890, 884, 878, 871, 865, 859, 853, 847, 842, 836, 830, 825, 819, 814, 808, 803, 798, 793, 788, 783, 778, 773, 768, 763, 759, 754, 749, 745, 740, 736, 731, 727, 723, 719, 714, 710, 706, 702, 698, 694, 690, 686, 683, 679, 675, 671, 668, 664, 661, 657, 654, 650, 647, 643, 640, 637, 633, 630, 627, 624, 621, 617, 614, 611, 608, 605, 602, 599, 597, 594, 591, 588, 585, 582, 580, 577, 574, 572, 569, 566, 564, 561, 559, 556, 554, 551, 549, 546, 544, 541, 539, 537, 534, 532, 530, 527, 525, 523, 521, 518, 516, 514, 512, 510, 508, 506, 504, 502, 500, 497, 495, 493, 492, 490, 488, 486, 484, 482};\r
- __constant__ int c_HsvDivTable256[256] = {0, 174763, 87381, 58254, 43691, 34953, 29127, 24966, 21845, 19418, 17476, 15888, 14564, 13443, 12483, 11651, 10923, 10280, 9709, 9198, 8738, 8322, 7944, 7598, 7282, 6991, 6722, 6473, 6242, 6026, 5825, 5638, 5461, 5296, 5140, 4993, 4855, 4723, 4599, 4481, 4369, 4263, 4161, 4064, 3972, 3884, 3799, 3718, 3641, 3567, 3495, 3427, 3361, 3297, 3236, 3178, 3121, 3066, 3013, 2962, 2913, 2865, 2819, 2774, 2731, 2689, 2648, 2608, 2570, 2533, 2497, 2461, 2427, 2394, 2362, 2330, 2300, 2270, 2241, 2212, 2185, 2158, 2131, 2106, 2081, 2056, 2032, 2009, 1986, 1964, 1942, 1920, 1900, 1879, 1859, 1840, 1820, 1802, 1783, 1765, 1748, 1730, 1713, 1697, 1680, 1664, 1649, 1633, 1618, 1603, 1589, 1574, 1560, 1547, 1533, 1520, 1507, 1494, 1481, 1469, 1456, 1444, 1432, 1421, 1409, 1398, 1387, 1376, 1365, 1355, 1344, 1334, 1324, 1314, 1304, 1295, 1285, 1276, 1266, 1257, 1248, 1239, 1231, 1222, 1214, 1205, 1197, 1189, 1181, 1173, 1165, 1157, 1150, 1142, 1135, 1128, 1120, 1113, 1106, 1099, 1092, 1085, 1079, 1072, 1066, 1059, 1053, 1046, 1040, 1034, 1028, 1022, 1016, 1010, 1004, 999, 993, 987, 982, 976, 971, 966, 960, 955, 950, 945, 940, 935, 930, 925, 920, 915, 910, 906, 901, 896, 892, 887, 883, 878, 874, 869, 865, 861, 857, 853, 848, 844, 840, 836, 832, 828, 824, 820, 817, 813, 809, 805, 802, 798, 794, 791, 787, 784, 780, 777, 773, 770, 767, 763, 760, 757, 753, 750, 747, 744, 741, 737, 734, 731, 728, 725, 722, 719, 716, 713, 710, 708, 705, 702, 699, 696, 694, 691, 688, 685};\r
-\r
- template <int bidx, int hr, typename D> static __device__ void RGB2HSVConvert(const uchar* src, D& dst)\r
- {\r
- const int hsv_shift = 12;\r
- const int* hdiv_table = hr == 180 ? c_HsvDivTable180 : c_HsvDivTable256;\r
-\r
- int b = src[bidx], g = src[1], r = src[bidx^2];\r
- int h, s, v = b;\r
- int vmin = b, diff;\r
- int vr, vg;\r
-\r
- v = ::max(v, g);\r
- v = ::max(v, r);\r
- vmin = ::min(vmin, g);\r
- vmin = ::min(vmin, r);\r
-\r
- diff = v - vmin;\r
- vr = (v == r) * -1;\r
- vg = (v == g) * -1;\r
-\r
- s = (diff * c_HsvDivTable[v] + (1 << (hsv_shift-1))) >> hsv_shift;\r
- h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));\r
- h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;\r
- h += (h < 0) * hr;\r
-\r
- dst.x = saturate_cast<uchar>(h);\r
- dst.y = (uchar)s;\r
- dst.z = (uchar)v;\r
- }\r
- template <int bidx, int hr> static __device__ uint RGB2HSVConvert(uint src)\r
- {\r
- const int hsv_shift = 12;\r
- const int* hdiv_table = hr == 180 ? c_HsvDivTable180 : c_HsvDivTable256;\r
-\r
- const int b = 0xff & (src >> (bidx * 8));\r
- const int g = 0xff & (src >> 8);\r
- const int r = 0xff & (src >> ((bidx ^ 2) * 8));\r
- \r
- int h, s, v = b;\r
- int vmin = b, diff;\r
- int vr, vg;\r
-\r
- v = ::max(v, g);\r
- v = ::max(v, r);\r
- vmin = ::min(vmin, g);\r
- vmin = ::min(vmin, r);\r
-\r
- diff = v - vmin;\r
- vr = (v == r) * -1;\r
- vg = (v == g) * -1;\r
-\r
- s = (diff * c_HsvDivTable[v] + (1 << (hsv_shift-1))) >> hsv_shift;\r
- h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));\r
- h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;\r
- h += (h < 0) * hr;\r
-\r
- uint dst = 0;\r
-\r
- dst |= saturate_cast<uchar>(h);\r
- dst |= (0xffu & s) << 8;\r
- dst |= (0xffu & v) << 16;\r
-\r
- return dst;\r
- }\r
- template <int bidx, int hr, typename D> static __device__ void RGB2HSVConvert(const float* src, D& dst)\r
+ namespace color_detail\r
{\r
- const float hscale = hr * (1.f / 360.f);\r
+ __constant__ int c_HsvDivTable [256] = {0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211, 130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632, 65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412, 43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693, 32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782, 26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223, 21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991, 18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579, 16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711, 14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221, 13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006, 11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995, 10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141, 10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410, 9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777, 8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224, 8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737, 7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304, 7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917, 6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569, 6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254, 6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968, 5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708, 5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468, 5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249, 5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046, 5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858, 4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684, 4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522, 4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370, 4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229, 4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096};\r
+ __constant__ int c_HsvDivTable180[256] = {0, 122880, 61440, 40960, 30720, 24576, 20480, 17554, 15360, 13653, 12288, 11171, 10240, 9452, 8777, 8192, 7680, 7228, 6827, 6467, 6144, 5851, 5585, 5343, 5120, 4915, 4726, 4551, 4389, 4237, 4096, 3964, 3840, 3724, 3614, 3511, 3413, 3321, 3234, 3151, 3072, 2997, 2926, 2858, 2793, 2731, 2671, 2614, 2560, 2508, 2458, 2409, 2363, 2318, 2276, 2234, 2194, 2156, 2119, 2083, 2048, 2014, 1982, 1950, 1920, 1890, 1862, 1834, 1807, 1781, 1755, 1731, 1707, 1683, 1661, 1638, 1617, 1596, 1575, 1555, 1536, 1517, 1499, 1480, 1463, 1446, 1429, 1412, 1396, 1381, 1365, 1350, 1336, 1321, 1307, 1293, 1280, 1267, 1254, 1241, 1229, 1217, 1205, 1193, 1182, 1170, 1159, 1148, 1138, 1127, 1117, 1107, 1097, 1087, 1078, 1069, 1059, 1050, 1041, 1033, 1024, 1016, 1007, 999, 991, 983, 975, 968, 960, 953, 945, 938, 931, 924, 917, 910, 904, 897, 890, 884, 878, 871, 865, 859, 853, 847, 842, 836, 830, 825, 819, 814, 808, 803, 798, 793, 788, 783, 778, 773, 768, 763, 759, 754, 749, 745, 740, 736, 731, 727, 723, 719, 714, 710, 706, 702, 698, 694, 690, 686, 683, 679, 675, 671, 668, 664, 661, 657, 654, 650, 647, 643, 640, 637, 633, 630, 627, 624, 621, 617, 614, 611, 608, 605, 602, 599, 597, 594, 591, 588, 585, 582, 580, 577, 574, 572, 569, 566, 564, 561, 559, 556, 554, 551, 549, 546, 544, 541, 539, 537, 534, 532, 530, 527, 525, 523, 521, 518, 516, 514, 512, 510, 508, 506, 504, 502, 500, 497, 495, 493, 492, 490, 488, 486, 484, 482};\r
+ __constant__ int c_HsvDivTable256[256] = {0, 174763, 87381, 58254, 43691, 34953, 29127, 24966, 21845, 19418, 17476, 15888, 14564, 13443, 12483, 11651, 10923, 10280, 9709, 9198, 8738, 8322, 7944, 7598, 7282, 6991, 6722, 6473, 6242, 6026, 5825, 5638, 5461, 5296, 5140, 4993, 4855, 4723, 4599, 4481, 4369, 4263, 4161, 4064, 3972, 3884, 3799, 3718, 3641, 3567, 3495, 3427, 3361, 3297, 3236, 3178, 3121, 3066, 3013, 2962, 2913, 2865, 2819, 2774, 2731, 2689, 2648, 2608, 2570, 2533, 2497, 2461, 2427, 2394, 2362, 2330, 2300, 2270, 2241, 2212, 2185, 2158, 2131, 2106, 2081, 2056, 2032, 2009, 1986, 1964, 1942, 1920, 1900, 1879, 1859, 1840, 1820, 1802, 1783, 1765, 1748, 1730, 1713, 1697, 1680, 1664, 1649, 1633, 1618, 1603, 1589, 1574, 1560, 1547, 1533, 1520, 1507, 1494, 1481, 1469, 1456, 1444, 1432, 1421, 1409, 1398, 1387, 1376, 1365, 1355, 1344, 1334, 1324, 1314, 1304, 1295, 1285, 1276, 1266, 1257, 1248, 1239, 1231, 1222, 1214, 1205, 1197, 1189, 1181, 1173, 1165, 1157, 1150, 1142, 1135, 1128, 1120, 1113, 1106, 1099, 1092, 1085, 1079, 1072, 1066, 1059, 1053, 1046, 1040, 1034, 1028, 1022, 1016, 1010, 1004, 999, 993, 987, 982, 976, 971, 966, 960, 955, 950, 945, 940, 935, 930, 925, 920, 915, 910, 906, 901, 896, 892, 887, 883, 878, 874, 869, 865, 861, 857, 853, 848, 844, 840, 836, 832, 828, 824, 820, 817, 813, 809, 805, 802, 798, 794, 791, 787, 784, 780, 777, 773, 770, 767, 763, 760, 757, 753, 750, 747, 744, 741, 737, 734, 731, 728, 725, 722, 719, 716, 713, 710, 708, 705, 702, 699, 696, 694, 691, 688, 685};\r
\r
- float b = src[bidx], g = src[1], r = src[bidx^2];\r
- float h, s, v;\r
+ template <int bidx, int hr, typename D> static __device__ void RGB2HSVConvert(const uchar* src, D& dst)\r
+ {\r
+ const int hsv_shift = 12;\r
+ const int* hdiv_table = hr == 180 ? c_HsvDivTable180 : c_HsvDivTable256;\r
+\r
+ int b = src[bidx], g = src[1], r = src[bidx^2];\r
+ int h, s, v = b;\r
+ int vmin = b, diff;\r
+ int vr, vg;\r
+\r
+ v = ::max(v, g);\r
+ v = ::max(v, r);\r
+ vmin = ::min(vmin, g);\r
+ vmin = ::min(vmin, r);\r
+\r
+ diff = v - vmin;\r
+ vr = (v == r) * -1;\r
+ vg = (v == g) * -1;\r
+\r
+ s = (diff * c_HsvDivTable[v] + (1 << (hsv_shift-1))) >> hsv_shift;\r
+ h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));\r
+ h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;\r
+ h += (h < 0) * hr;\r
+\r
+ dst.x = saturate_cast<uchar>(h);\r
+ dst.y = (uchar)s;\r
+ dst.z = (uchar)v;\r
+ }\r
+ template <int bidx, int hr> static __device__ uint RGB2HSVConvert(uint src)\r
+ {\r
+ const int hsv_shift = 12;\r
+ const int* hdiv_table = hr == 180 ? c_HsvDivTable180 : c_HsvDivTable256;\r
+\r
+ const int b = 0xff & (src >> (bidx * 8));\r
+ const int g = 0xff & (src >> 8);\r
+ const int r = 0xff & (src >> ((bidx ^ 2) * 8));\r
+ \r
+ int h, s, v = b;\r
+ int vmin = b, diff;\r
+ int vr, vg;\r
+\r
+ v = ::max(v, g);\r
+ v = ::max(v, r);\r
+ vmin = ::min(vmin, g);\r
+ vmin = ::min(vmin, r);\r
+\r
+ diff = v - vmin;\r
+ vr = (v == r) * -1;\r
+ vg = (v == g) * -1;\r
+\r
+ s = (diff * c_HsvDivTable[v] + (1 << (hsv_shift-1))) >> hsv_shift;\r
+ h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));\r
+ h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;\r
+ h += (h < 0) * hr;\r
\r
- float vmin, diff;\r
+ uint dst = 0;\r
\r
- v = vmin = r;\r
- v = fmax(v, g);\r
- v = fmax(v, b);\r
- vmin = fmin(vmin, g);\r
- vmin = fmin(vmin, b);\r
+ dst |= saturate_cast<uchar>(h);\r
+ dst |= (0xffu & s) << 8;\r
+ dst |= (0xffu & v) << 16;\r
\r
- diff = v - vmin;\r
- s = diff / (float)(::fabs(v) + numeric_limits<float>::epsilon());\r
- diff = (float)(60. / (diff + numeric_limits<float>::epsilon()));\r
+ return dst;\r
+ }\r
+ template <int bidx, int hr, typename D> static __device__ void RGB2HSVConvert(const float* src, D& dst)\r
+ {\r
+ const float hscale = hr * (1.f / 360.f);\r
\r
- h = (v == r) * (g - b) * diff;\r
- h += (v != r && v == g) * ((b - r) * diff + 120.f);\r
- h += (v != r && v != g) * ((r - g) * diff + 240.f);\r
- h += (h < 0) * 360.f;\r
+ float b = src[bidx], g = src[1], r = src[bidx^2];\r
+ float h, s, v;\r
\r
- dst.x = h * hscale;\r
- dst.y = s;\r
- dst.z = v;\r
- }\r
+ float vmin, diff;\r
\r
- template <typename T, int scn, int dcn, int bidx, int hr> struct RGB2HSV : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
- {\r
- __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
- {\r
- typename TypeVec<T, dcn>::vec_type dst;\r
+ v = vmin = r;\r
+ v = fmax(v, g);\r
+ v = fmax(v, b);\r
+ vmin = fmin(vmin, g);\r
+ vmin = fmin(vmin, b);\r
\r
- RGB2HSVConvert<bidx, hr>(&src.x, dst);\r
+ diff = v - vmin;\r
+ s = diff / (float)(::fabs(v) + numeric_limits<float>::epsilon());\r
+ diff = (float)(60. / (diff + numeric_limits<float>::epsilon()));\r
\r
- return dst;\r
+ h = (v == r) * (g - b) * diff;\r
+ h += (v != r && v == g) * ((b - r) * diff + 120.f);\r
+ h += (v != r && v != g) * ((r - g) * diff + 240.f);\r
+ h += (h < 0) * 360.f;\r
+\r
+ dst.x = h * hscale;\r
+ dst.y = s;\r
+ dst.z = v;\r
}\r
- };\r
- template <int bidx, int hr> struct RGB2HSV<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>\r
- {\r
- __device__ __forceinline__ uint operator()(uint src) const\r
+\r
+ template <typename T, int scn, int dcn, int bidx, int hr> struct RGB2HSV : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
{\r
- return RGB2HSVConvert<bidx, hr>(src);\r
- }\r
- };\r
-}\r
+ __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ {\r
+ typename TypeVec<T, dcn>::vec_type dst;\r
+\r
+ RGB2HSVConvert<bidx, hr>(&src.x, dst);\r
+\r
+ return dst;\r
+ }\r
+ };\r
+ template <int bidx, int hr> struct RGB2HSV<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>\r
+ {\r
+ __device__ __forceinline__ uint operator()(uint src) const\r
+ {\r
+ return RGB2HSVConvert<bidx, hr>(src);\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(name, scn, dcn, bidx) \\r
template <typename T> struct name ## _traits \\r
{ \\r
- typedef detail::RGB2HSV<T, scn, dcn, bidx, 180> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::RGB2HSV<T, scn, dcn, bidx, 180> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
}; \\r
template <typename T> struct name ## _full_traits \\r
{ \\r
- typedef detail::RGB2HSV<T, scn, dcn, bidx, 256> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::RGB2HSV<T, scn, dcn, bidx, 256> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
}; \\r
template <> struct name ## _traits<float> \\r
{ \\r
- typedef detail::RGB2HSV<float, scn, dcn, bidx, 360> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::RGB2HSV<float, scn, dcn, bidx, 360> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
}; \\r
template <> struct name ## _full_traits<float> \\r
{ \\r
- typedef detail::RGB2HSV<float, scn, dcn, bidx, 360> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::RGB2HSV<float, scn, dcn, bidx, 360> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
} \\r
};\r
\r
-namespace detail\r
-{\r
- __constant__ int c_HsvSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} };\r
-\r
- template <int bidx, int hr, typename T> static __device__ void HSV2RGBConvert(const T& src, float* dst)\r
+ namespace color_detail\r
{\r
- const float hscale = 6.f / hr;\r
- \r
- float h = src.x, s = src.y, v = src.z;\r
- float b = v, g = v, r = v;\r
+ __constant__ int c_HsvSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} };\r
\r
- if (s != 0)\r
+ template <int bidx, int hr, typename T> static __device__ void HSV2RGBConvert(const T& src, float* dst)\r
{\r
- h *= hscale;\r
-\r
- if( h < 0 )\r
- do h += 6; while( h < 0 );\r
- else if( h >= 6 )\r
- do h -= 6; while( h >= 6 );\r
-\r
- int sector = __float2int_rd(h);\r
- h -= sector;\r
-\r
- float tab[4];\r
- tab[0] = v;\r
- tab[1] = v * (1.f - s);\r
- tab[2] = v * (1.f - s * h);\r
- tab[3] = v * (1.f - s * (1.f - h));\r
-\r
- b = tab[c_HsvSectorData[sector][0]];\r
- g = tab[c_HsvSectorData[sector][1]];\r
- r = tab[c_HsvSectorData[sector][2]];\r
+ const float hscale = 6.f / hr;\r
+ \r
+ float h = src.x, s = src.y, v = src.z;\r
+ float b = v, g = v, r = v;\r
+\r
+ if (s != 0)\r
+ {\r
+ h *= hscale;\r
+\r
+ if( h < 0 )\r
+ do h += 6; while( h < 0 );\r
+ else if( h >= 6 )\r
+ do h -= 6; while( h >= 6 );\r
+\r
+ int sector = __float2int_rd(h);\r
+ h -= sector;\r
+\r
+ float tab[4];\r
+ tab[0] = v;\r
+ tab[1] = v * (1.f - s);\r
+ tab[2] = v * (1.f - s * h);\r
+ tab[3] = v * (1.f - s * (1.f - h));\r
+\r
+ b = tab[c_HsvSectorData[sector][0]];\r
+ g = tab[c_HsvSectorData[sector][1]];\r
+ r = tab[c_HsvSectorData[sector][2]];\r
+ }\r
+\r
+ dst[bidx] = b;\r
+ dst[1] = g;\r
+ dst[bidx^2] = r;\r
}\r
+ template <int bidx, int HR, typename T> static __device__ void HSV2RGBConvert(const T& src, uchar* dst)\r
+ {\r
+ float3 buf;\r
\r
- dst[bidx] = b;\r
- dst[1] = g;\r
- dst[bidx^2] = r;\r
- }\r
- template <int bidx, int HR, typename T> static __device__ void HSV2RGBConvert(const T& src, uchar* dst)\r
- {\r
- float3 buf;\r
-\r
- buf.x = src.x;\r
- buf.y = src.y * (1.f / 255.f);\r
- buf.z = src.z * (1.f / 255.f);\r
+ buf.x = src.x;\r
+ buf.y = src.y * (1.f / 255.f);\r
+ buf.z = src.z * (1.f / 255.f);\r
\r
- HSV2RGBConvert<bidx, HR>(buf, &buf.x);\r
+ HSV2RGBConvert<bidx, HR>(buf, &buf.x);\r
\r
- dst[0] = saturate_cast<uchar>(buf.x * 255.f);\r
- dst[1] = saturate_cast<uchar>(buf.y * 255.f);\r
- dst[2] = saturate_cast<uchar>(buf.z * 255.f);\r
- }\r
- template <int bidx, int hr> static __device__ uint HSV2RGBConvert(uint src)\r
- {\r
- float3 buf;\r
+ dst[0] = saturate_cast<uchar>(buf.x * 255.f);\r
+ dst[1] = saturate_cast<uchar>(buf.y * 255.f);\r
+ dst[2] = saturate_cast<uchar>(buf.z * 255.f);\r
+ }\r
+ template <int bidx, int hr> static __device__ uint HSV2RGBConvert(uint src)\r
+ {\r
+ float3 buf;\r
\r
- buf.x = src & 0xff;\r
- buf.y = ((src >> 8) & 0xff) * (1.f/255.f);\r
- buf.z = ((src >> 16) & 0xff) * (1.f/255.f);\r
+ buf.x = src & 0xff;\r
+ buf.y = ((src >> 8) & 0xff) * (1.f/255.f);\r
+ buf.z = ((src >> 16) & 0xff) * (1.f/255.f);\r
\r
- HSV2RGBConvert<bidx, hr>(buf, &buf.x);\r
+ HSV2RGBConvert<bidx, hr>(buf, &buf.x);\r
\r
- uint dst = 0xffu << 24;\r
+ uint dst = 0xffu << 24;\r
\r
- dst |= saturate_cast<uchar>(buf.x * 255.f);\r
- dst |= saturate_cast<uchar>(buf.y * 255.f) << 8;\r
- dst |= saturate_cast<uchar>(buf.z * 255.f) << 16;\r
+ dst |= saturate_cast<uchar>(buf.x * 255.f);\r
+ dst |= saturate_cast<uchar>(buf.y * 255.f) << 8;\r
+ dst |= saturate_cast<uchar>(buf.z * 255.f) << 16;\r
\r
- return dst;\r
- }\r
+ return dst;\r
+ }\r
\r
- template <typename T, int scn, int dcn, int bidx, int hr> struct HSV2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
- {\r
- __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ template <typename T, int scn, int dcn, int bidx, int hr> struct HSV2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
{\r
- typename TypeVec<T, dcn>::vec_type dst;\r
+ __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ {\r
+ typename TypeVec<T, dcn>::vec_type dst;\r
\r
- HSV2RGBConvert<bidx, hr>(src, &dst.x);\r
- setAlpha(dst, ColorChannel<T>::max());\r
+ HSV2RGBConvert<bidx, hr>(src, &dst.x);\r
+ setAlpha(dst, ColorChannel<T>::max());\r
\r
- return dst;\r
- }\r
- };\r
- template <int bidx, int hr> struct HSV2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>\r
- {\r
- __device__ __forceinline__ uint operator()(uint src) const\r
+ return dst;\r
+ }\r
+ };\r
+ template <int bidx, int hr> struct HSV2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>\r
{\r
- return HSV2RGBConvert<bidx, hr>(src);\r
- }\r
- };\r
-}\r
+ __device__ __forceinline__ uint operator()(uint src) const\r
+ {\r
+ return HSV2RGBConvert<bidx, hr>(src);\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(name, scn, dcn, bidx) \\r
template <typename T> struct name ## _traits \\r
{ \\r
- typedef detail::HSV2RGB<T, scn, dcn, bidx, 180> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::HSV2RGB<T, scn, dcn, bidx, 180> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
}; \\r
template <typename T> struct name ## _full_traits \\r
{ \\r
- typedef detail::HSV2RGB<T, scn, dcn, bidx, 255> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::HSV2RGB<T, scn, dcn, bidx, 255> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
}; \\r
template <> struct name ## _traits<float> \\r
{ \\r
- typedef detail::HSV2RGB<float, scn, dcn, bidx, 360> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::HSV2RGB<float, scn, dcn, bidx, 360> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
}; \\r
template <> struct name ## _full_traits<float> \\r
{ \\r
- typedef detail::HSV2RGB<float, scn, dcn, bidx, 360> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::HSV2RGB<float, scn, dcn, bidx, 360> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
\r
/////////////////////////////////////// RGB <-> HLS ////////////////////////////////////////\r
\r
-namespace detail\r
-{\r
- template <int bidx, int hr, typename D> static __device__ void RGB2HLSConvert(const float* src, D& dst)\r
+ namespace color_detail\r
{\r
- const float hscale = hr * (1.f / 360.f);\r
+ template <int bidx, int hr, typename D> static __device__ void RGB2HLSConvert(const float* src, D& dst)\r
+ {\r
+ const float hscale = hr * (1.f / 360.f);\r
\r
- float b = src[bidx], g = src[1], r = src[bidx^2];\r
- float h = 0.f, s = 0.f, l;\r
- float vmin, vmax, diff;\r
+ float b = src[bidx], g = src[1], r = src[bidx^2];\r
+ float h = 0.f, s = 0.f, l;\r
+ float vmin, vmax, diff;\r
\r
- vmax = vmin = r;\r
- vmax = fmax(vmax, g);\r
- vmax = fmax(vmax, b);\r
- vmin = fmin(vmin, g);\r
- vmin = fmin(vmin, b);\r
+ vmax = vmin = r;\r
+ vmax = fmax(vmax, g);\r
+ vmax = fmax(vmax, b);\r
+ vmin = fmin(vmin, g);\r
+ vmin = fmin(vmin, b);\r
\r
- diff = vmax - vmin;\r
- l = (vmax + vmin) * 0.5f;\r
+ diff = vmax - vmin;\r
+ l = (vmax + vmin) * 0.5f;\r
\r
- if (diff > numeric_limits<float>::epsilon())\r
- {\r
- s = (l < 0.5f) * diff / (vmax + vmin);\r
- s += (l >= 0.5f) * diff / (2.0f - vmax - vmin);\r
+ if (diff > numeric_limits<float>::epsilon())\r
+ {\r
+ s = (l < 0.5f) * diff / (vmax + vmin);\r
+ s += (l >= 0.5f) * diff / (2.0f - vmax - vmin);\r
\r
- diff = 60.f / diff;\r
+ diff = 60.f / diff;\r
\r
- h = (vmax == r) * (g - b) * diff;\r
- h += (vmax != r && vmax == g) * ((b - r) * diff + 120.f);\r
- h += (vmax != r && vmax != g) * ((r - g) * diff + 240.f);\r
- h += (h < 0.f) * 360.f;\r
- }\r
+ h = (vmax == r) * (g - b) * diff;\r
+ h += (vmax != r && vmax == g) * ((b - r) * diff + 120.f);\r
+ h += (vmax != r && vmax != g) * ((r - g) * diff + 240.f);\r
+ h += (h < 0.f) * 360.f;\r
+ }\r
\r
- dst.x = h * hscale;\r
- dst.y = l;\r
- dst.z = s;\r
- }\r
- template <int bidx, int hr, typename D> static __device__ void RGB2HLSConvert(const uchar* src, D& dst)\r
- {\r
- float3 buf;\r
+ dst.x = h * hscale;\r
+ dst.y = l;\r
+ dst.z = s;\r
+ }\r
+ template <int bidx, int hr, typename D> static __device__ void RGB2HLSConvert(const uchar* src, D& dst)\r
+ {\r
+ float3 buf;\r
\r
- buf.x = src[0] * (1.f / 255.f);\r
- buf.y = src[1] * (1.f / 255.f);\r
- buf.z = src[2] * (1.f / 255.f);\r
+ buf.x = src[0] * (1.f / 255.f);\r
+ buf.y = src[1] * (1.f / 255.f);\r
+ buf.z = src[2] * (1.f / 255.f);\r
\r
- RGB2HLSConvert<bidx, hr>(&buf.x, buf);\r
+ RGB2HLSConvert<bidx, hr>(&buf.x, buf);\r
\r
- dst.x = saturate_cast<uchar>(buf.x);\r
- dst.y = saturate_cast<uchar>(buf.y*255.f);\r
- dst.z = saturate_cast<uchar>(buf.z*255.f);\r
- }\r
- template <int bidx, int hr> static __device__ uint RGB2HLSConvert(uint src)\r
- {\r
- float3 buf;\r
+ dst.x = saturate_cast<uchar>(buf.x);\r
+ dst.y = saturate_cast<uchar>(buf.y*255.f);\r
+ dst.z = saturate_cast<uchar>(buf.z*255.f);\r
+ }\r
+ template <int bidx, int hr> static __device__ uint RGB2HLSConvert(uint src)\r
+ {\r
+ float3 buf;\r
\r
- buf.x = (0xff & src) * (1.f / 255.f);\r
- buf.y = (0xff & (src >> 8)) * (1.f / 255.f);\r
- buf.z = (0xff & (src >> 16)) * (1.f / 255.f);\r
+ buf.x = (0xff & src) * (1.f / 255.f);\r
+ buf.y = (0xff & (src >> 8)) * (1.f / 255.f);\r
+ buf.z = (0xff & (src >> 16)) * (1.f / 255.f);\r
\r
- RGB2HLSConvert<bidx, hr>(&buf.x, buf);\r
+ RGB2HLSConvert<bidx, hr>(&buf.x, buf);\r
\r
- uint dst = 0xffu << 24;\r
+ uint dst = 0xffu << 24;\r
\r
- dst |= saturate_cast<uchar>(buf.x);\r
- dst |= saturate_cast<uchar>(buf.y * 255.f) << 8;\r
- dst |= saturate_cast<uchar>(buf.z * 255.f) << 16;\r
+ dst |= saturate_cast<uchar>(buf.x);\r
+ dst |= saturate_cast<uchar>(buf.y * 255.f) << 8;\r
+ dst |= saturate_cast<uchar>(buf.z * 255.f) << 16;\r
\r
- return dst;\r
- }\r
+ return dst;\r
+ }\r
\r
- template <typename T, int scn, int dcn, int bidx, int hr> struct RGB2HLS : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
- {\r
- __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ template <typename T, int scn, int dcn, int bidx, int hr> struct RGB2HLS : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
{\r
- typename TypeVec<T, dcn>::vec_type dst;\r
+ __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ {\r
+ typename TypeVec<T, dcn>::vec_type dst;\r
\r
- RGB2HLSConvert<bidx, hr>(&src.x, dst);\r
+ RGB2HLSConvert<bidx, hr>(&src.x, dst);\r
\r
- return dst;\r
- }\r
- };\r
- template <int bidx, int hr> struct RGB2HLS<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>\r
- {\r
- __device__ __forceinline__ uint operator()(uint src) const\r
+ return dst;\r
+ }\r
+ };\r
+ template <int bidx, int hr> struct RGB2HLS<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>\r
{\r
- return RGB2HLSConvert<bidx, hr>(src);\r
- }\r
- };\r
-}\r
+ __device__ __forceinline__ uint operator()(uint src) const\r
+ {\r
+ return RGB2HLSConvert<bidx, hr>(src);\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(name, scn, dcn, bidx) \\r
template <typename T> struct name ## _traits \\r
{ \\r
- typedef detail::RGB2HLS<T, scn, dcn, bidx, 180> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::RGB2HLS<T, scn, dcn, bidx, 180> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
}; \\r
template <typename T> struct name ## _full_traits \\r
{ \\r
- typedef detail::RGB2HLS<T, scn, dcn, bidx, 256> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::RGB2HLS<T, scn, dcn, bidx, 256> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
}; \\r
template <> struct name ## _traits<float> \\r
{ \\r
- typedef detail::RGB2HLS<float, scn, dcn, bidx, 360> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::RGB2HLS<float, scn, dcn, bidx, 360> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
}; \\r
template <> struct name ## _full_traits<float> \\r
{ \\r
- typedef detail::RGB2HLS<float, scn, dcn, bidx, 360> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::RGB2HLS<float, scn, dcn, bidx, 360> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
} \\r
};\r
\r
-namespace detail\r
-{\r
- __constant__ int c_HlsSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} };\r
-\r
- template <int bidx, int hr, typename T> static __device__ void HLS2RGBConvert(const T& src, float* dst)\r
+ namespace color_detail\r
{\r
- const float hscale = 6.0f / hr;\r
+ __constant__ int c_HlsSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} };\r
\r
- float h = src.x, l = src.y, s = src.z;\r
- float b = l, g = l, r = l;\r
-\r
- if (s != 0)\r
+ template <int bidx, int hr, typename T> static __device__ void HLS2RGBConvert(const T& src, float* dst)\r
{\r
- float p2 = (l <= 0.5f) * l * (1 + s);\r
- p2 += (l > 0.5f) * (l + s - l * s);\r
- float p1 = 2 * l - p2;\r
-\r
- h *= hscale;\r
+ const float hscale = 6.0f / hr;\r
\r
- if( h < 0 )\r
- do h += 6; while( h < 0 );\r
- else if( h >= 6 )\r
- do h -= 6; while( h >= 6 );\r
+ float h = src.x, l = src.y, s = src.z;\r
+ float b = l, g = l, r = l;\r
\r
- int sector;\r
- sector = __float2int_rd(h);\r
+ if (s != 0)\r
+ {\r
+ float p2 = (l <= 0.5f) * l * (1 + s);\r
+ p2 += (l > 0.5f) * (l + s - l * s);\r
+ float p1 = 2 * l - p2;\r
\r
- h -= sector;\r
+ h *= hscale;\r
\r
- float tab[4];\r
- tab[0] = p2;\r
- tab[1] = p1;\r
- tab[2] = p1 + (p2 - p1) * (1 - h);\r
- tab[3] = p1 + (p2 - p1) * h;\r
+ if( h < 0 )\r
+ do h += 6; while( h < 0 );\r
+ else if( h >= 6 )\r
+ do h -= 6; while( h >= 6 );\r
\r
- b = tab[c_HlsSectorData[sector][0]];\r
- g = tab[c_HlsSectorData[sector][1]];\r
- r = tab[c_HlsSectorData[sector][2]];\r
- }\r
+ int sector;\r
+ sector = __float2int_rd(h);\r
\r
- dst[bidx] = b;\r
- dst[1] = g;\r
- dst[bidx^2] = r;\r
- }\r
- template <int bidx, int hr, typename T> static __device__ void HLS2RGBConvert(const T& src, uchar* dst)\r
- {\r
- float3 buf;\r
+ h -= sector;\r
\r
- buf.x = src.x;\r
- buf.y = src.y * (1.f / 255.f);\r
- buf.z = src.z * (1.f / 255.f);\r
+ float tab[4];\r
+ tab[0] = p2;\r
+ tab[1] = p1;\r
+ tab[2] = p1 + (p2 - p1) * (1 - h);\r
+ tab[3] = p1 + (p2 - p1) * h;\r
\r
- HLS2RGBConvert<bidx, hr>(buf, &buf.x);\r
+ b = tab[c_HlsSectorData[sector][0]];\r
+ g = tab[c_HlsSectorData[sector][1]];\r
+ r = tab[c_HlsSectorData[sector][2]];\r
+ }\r
\r
- dst[0] = saturate_cast<uchar>(buf.x * 255.f);\r
- dst[1] = saturate_cast<uchar>(buf.y * 255.f);\r
- dst[2] = saturate_cast<uchar>(buf.z * 255.f);\r
- }\r
- template <int bidx, int hr> static __device__ uint HLS2RGBConvert(uint src)\r
- {\r
- float3 buf;\r
+ dst[bidx] = b;\r
+ dst[1] = g;\r
+ dst[bidx^2] = r;\r
+ }\r
+ template <int bidx, int hr, typename T> static __device__ void HLS2RGBConvert(const T& src, uchar* dst)\r
+ {\r
+ float3 buf;\r
\r
- buf.x = 0xff & src;\r
- buf.y = (0xff & (src >> 8)) * (1.f / 255.f);\r
- buf.z = (0xff & (src >> 16)) * (1.f / 255.f);\r
+ buf.x = src.x;\r
+ buf.y = src.y * (1.f / 255.f);\r
+ buf.z = src.z * (1.f / 255.f);\r
\r
- HLS2RGBConvert<bidx, hr>(buf, &buf.x);\r
+ HLS2RGBConvert<bidx, hr>(buf, &buf.x);\r
\r
- uint dst = 0xffu << 24;\r
+ dst[0] = saturate_cast<uchar>(buf.x * 255.f);\r
+ dst[1] = saturate_cast<uchar>(buf.y * 255.f);\r
+ dst[2] = saturate_cast<uchar>(buf.z * 255.f);\r
+ }\r
+ template <int bidx, int hr> static __device__ uint HLS2RGBConvert(uint src)\r
+ {\r
+ float3 buf;\r
\r
- dst |= saturate_cast<uchar>(buf.x * 255.f);\r
- dst |= saturate_cast<uchar>(buf.y * 255.f) << 8;\r
- dst |= saturate_cast<uchar>(buf.z * 255.f) << 16;\r
+ buf.x = 0xff & src;\r
+ buf.y = (0xff & (src >> 8)) * (1.f / 255.f);\r
+ buf.z = (0xff & (src >> 16)) * (1.f / 255.f);\r
\r
- return dst;\r
- }\r
+ HLS2RGBConvert<bidx, hr>(buf, &buf.x);\r
\r
- template <typename T, int scn, int dcn, int bidx, int hr> struct HLS2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
- {\r
- __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
- {\r
- typename TypeVec<T, dcn>::vec_type dst;\r
+ uint dst = 0xffu << 24;\r
\r
- HLS2RGBConvert<bidx, hr>(src, &dst.x);\r
- setAlpha(dst, ColorChannel<T>::max());\r
+ dst |= saturate_cast<uchar>(buf.x * 255.f);\r
+ dst |= saturate_cast<uchar>(buf.y * 255.f) << 8;\r
+ dst |= saturate_cast<uchar>(buf.z * 255.f) << 16;\r
\r
return dst;\r
}\r
- };\r
- template <int bidx, int hr> struct HLS2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>\r
- {\r
- __device__ __forceinline__ uint operator()(uint src) const\r
+\r
+ template <typename T, int scn, int dcn, int bidx, int hr> struct HLS2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
{\r
- return HLS2RGBConvert<bidx, hr>(src);\r
- }\r
- };\r
-}\r
+ __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
+ {\r
+ typename TypeVec<T, dcn>::vec_type dst;\r
+\r
+ HLS2RGBConvert<bidx, hr>(src, &dst.x);\r
+ setAlpha(dst, ColorChannel<T>::max());\r
+\r
+ return dst;\r
+ }\r
+ };\r
+ template <int bidx, int hr> struct HLS2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>\r
+ {\r
+ __device__ __forceinline__ uint operator()(uint src) const\r
+ {\r
+ return HLS2RGBConvert<bidx, hr>(src);\r
+ }\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(name, scn, dcn, bidx) \\r
template <typename T> struct name ## _traits \\r
{ \\r
- typedef detail::HLS2RGB<T, scn, dcn, bidx, 180> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::HLS2RGB<T, scn, dcn, bidx, 180> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
}; \\r
template <typename T> struct name ## _full_traits \\r
{ \\r
- typedef detail::HLS2RGB<T, scn, dcn, bidx, 255> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::HLS2RGB<T, scn, dcn, bidx, 255> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
}; \\r
template <> struct name ## _traits<float> \\r
{ \\r
- typedef detail::HLS2RGB<float, scn, dcn, bidx, 360> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::HLS2RGB<float, scn, dcn, bidx, 360> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
}; \\r
template <> struct name ## _full_traits<float> \\r
{ \\r
- typedef detail::HLS2RGB<float, scn, dcn, bidx, 360> functor_type; \\r
+ typedef ::cv::gpu::device::color_detail::HLS2RGB<float, scn, dcn, bidx, 360> functor_type; \\r
static __host__ __device__ __forceinline__ functor_type create_functor() \\r
{ \\r
return functor_type(); \\r
} \\r
};\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif // __OPENCV_GPU_COLOR_DETAIL_HPP__\r
#include "../vec_traits.hpp"\r
#include "../functional.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace detail\r
+namespace cv { namespace gpu { namespace device \r
{\r
- //! Mask accessor\r
-\r
- struct MaskReader\r
- {\r
- explicit MaskReader(const PtrStepb& mask_): mask(mask_) {}\r
-\r
- __device__ __forceinline__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; }\r
-\r
- const PtrStepb mask;\r
- };\r
-\r
- struct NoMask \r
+ namespace transform_detail\r
{\r
- __device__ __forceinline__ bool operator()(int y, int x) const { return true; } \r
- };\r
+ //! Read Write Traits\r
\r
- //! Read Write Traits\r
-\r
- template <typename T, typename D, int shift> struct UnaryReadWriteTraits\r
- {\r
- typedef typename TypeVec<T, shift>::vec_type read_type;\r
- typedef typename TypeVec<D, shift>::vec_type write_type;\r
- };\r
+ template <typename T, typename D, int shift> struct UnaryReadWriteTraits\r
+ {\r
+ typedef typename TypeVec<T, shift>::vec_type read_type;\r
+ typedef typename TypeVec<D, shift>::vec_type write_type;\r
+ };\r
\r
- template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits\r
- {\r
- typedef typename TypeVec<T1, shift>::vec_type read_type1;\r
- typedef typename TypeVec<T2, shift>::vec_type read_type2;\r
- typedef typename TypeVec<D, shift>::vec_type write_type;\r
- };\r
+ template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits\r
+ {\r
+ typedef typename TypeVec<T1, shift>::vec_type read_type1;\r
+ typedef typename TypeVec<T2, shift>::vec_type read_type2;\r
+ typedef typename TypeVec<D, shift>::vec_type write_type;\r
+ };\r
\r
- //! Transform kernels\r
+ //! Transform kernels\r
\r
- template <int shift> struct OpUnroller;\r
- template <> struct OpUnroller<1>\r
- {\r
- template <typename T, typename D, typename UnOp, typename Mask>\r
- static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)\r
+ template <int shift> struct OpUnroller;\r
+ template <> struct OpUnroller<1>\r
{\r
- if (mask(y, x_shifted))\r
- dst.x = op(src.x);\r
- }\r
+ template <typename T, typename D, typename UnOp, typename Mask>\r
+ static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)\r
+ {\r
+ if (mask(y, x_shifted))\r
+ dst.x = op(src.x);\r
+ }\r
\r
- template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
- static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)\r
- {\r
- if (mask(y, x_shifted))\r
- dst.x = op(src1.x, src2.x);\r
- }\r
- };\r
- template <> struct OpUnroller<2>\r
- {\r
- template <typename T, typename D, typename UnOp, typename Mask>\r
- static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)\r
+ template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
+ static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)\r
+ {\r
+ if (mask(y, x_shifted))\r
+ dst.x = op(src1.x, src2.x);\r
+ }\r
+ };\r
+ template <> struct OpUnroller<2>\r
{\r
- if (mask(y, x_shifted))\r
- dst.x = op(src.x);\r
- if (mask(y, x_shifted + 1))\r
- dst.y = op(src.y);\r
- }\r
+ template <typename T, typename D, typename UnOp, typename Mask>\r
+ static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)\r
+ {\r
+ if (mask(y, x_shifted))\r
+ dst.x = op(src.x);\r
+ if (mask(y, x_shifted + 1))\r
+ dst.y = op(src.y);\r
+ }\r
\r
- template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
- static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)\r
+ template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
+ static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)\r
+ {\r
+ if (mask(y, x_shifted))\r
+ dst.x = op(src1.x, src2.x);\r
+ if (mask(y, x_shifted + 1))\r
+ dst.y = op(src1.y, src2.y);\r
+ }\r
+ };\r
+ template <> struct OpUnroller<3>\r
{\r
- if (mask(y, x_shifted))\r
- dst.x = op(src1.x, src2.x);\r
- if (mask(y, x_shifted + 1))\r
- dst.y = op(src1.y, src2.y);\r
- }\r
- };\r
- template <> struct OpUnroller<3>\r
- {\r
- template <typename T, typename D, typename UnOp, typename Mask>\r
- static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)\r
+ template <typename T, typename D, typename UnOp, typename Mask>\r
+ static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)\r
+ {\r
+ if (mask(y, x_shifted))\r
+ dst.x = op(src.x);\r
+ if (mask(y, x_shifted + 1))\r
+ dst.y = op(src.y);\r
+ if (mask(y, x_shifted + 2))\r
+ dst.z = op(src.z);\r
+ }\r
+\r
+ template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
+ static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)\r
+ {\r
+ if (mask(y, x_shifted))\r
+ dst.x = op(src1.x, src2.x);\r
+ if (mask(y, x_shifted + 1))\r
+ dst.y = op(src1.y, src2.y);\r
+ if (mask(y, x_shifted + 2))\r
+ dst.z = op(src1.z, src2.z);\r
+ }\r
+ };\r
+ template <> struct OpUnroller<4>\r
{\r
- if (mask(y, x_shifted))\r
- dst.x = op(src.x);\r
- if (mask(y, x_shifted + 1))\r
- dst.y = op(src.y);\r
- if (mask(y, x_shifted + 2))\r
- dst.z = op(src.z);\r
- }\r
+ template <typename T, typename D, typename UnOp, typename Mask>\r
+ static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)\r
+ {\r
+ if (mask(y, x_shifted))\r
+ dst.x = op(src.x);\r
+ if (mask(y, x_shifted + 1))\r
+ dst.y = op(src.y);\r
+ if (mask(y, x_shifted + 2))\r
+ dst.z = op(src.z);\r
+ if (mask(y, x_shifted + 3))\r
+ dst.w = op(src.w);\r
+ }\r
\r
- template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
- static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)\r
+ template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
+ static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)\r
+ {\r
+ if (mask(y, x_shifted))\r
+ dst.x = op(src1.x, src2.x);\r
+ if (mask(y, x_shifted + 1))\r
+ dst.y = op(src1.y, src2.y);\r
+ if (mask(y, x_shifted + 2))\r
+ dst.z = op(src1.z, src2.z);\r
+ if (mask(y, x_shifted + 3))\r
+ dst.w = op(src1.w, src2.w);\r
+ }\r
+ };\r
+ template <> struct OpUnroller<8>\r
{\r
- if (mask(y, x_shifted))\r
- dst.x = op(src1.x, src2.x);\r
- if (mask(y, x_shifted + 1))\r
- dst.y = op(src1.y, src2.y);\r
- if (mask(y, x_shifted + 2))\r
- dst.z = op(src1.z, src2.z);\r
- }\r
- };\r
- template <> struct OpUnroller<4>\r
- {\r
+ template <typename T, typename D, typename UnOp, typename Mask>\r
+ static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)\r
+ {\r
+ if (mask(y, x_shifted))\r
+ dst.a0 = op(src.a0);\r
+ if (mask(y, x_shifted + 1))\r
+ dst.a1 = op(src.a1);\r
+ if (mask(y, x_shifted + 2))\r
+ dst.a2 = op(src.a2);\r
+ if (mask(y, x_shifted + 3))\r
+ dst.a3 = op(src.a3);\r
+ if (mask(y, x_shifted + 4))\r
+ dst.a4 = op(src.a4);\r
+ if (mask(y, x_shifted + 5))\r
+ dst.a5 = op(src.a5);\r
+ if (mask(y, x_shifted + 6))\r
+ dst.a6 = op(src.a6);\r
+ if (mask(y, x_shifted + 7))\r
+ dst.a7 = op(src.a7);\r
+ }\r
+\r
+ template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
+ static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)\r
+ {\r
+ if (mask(y, x_shifted))\r
+ dst.a0 = op(src1.a0, src2.a0);\r
+ if (mask(y, x_shifted + 1))\r
+ dst.a1 = op(src1.a1, src2.a1);\r
+ if (mask(y, x_shifted + 2))\r
+ dst.a2 = op(src1.a2, src2.a2);\r
+ if (mask(y, x_shifted + 3))\r
+ dst.a3 = op(src1.a3, src2.a3);\r
+ if (mask(y, x_shifted + 4))\r
+ dst.a4 = op(src1.a4, src2.a4);\r
+ if (mask(y, x_shifted + 5))\r
+ dst.a5 = op(src1.a5, src2.a5);\r
+ if (mask(y, x_shifted + 6))\r
+ dst.a6 = op(src1.a6, src2.a6);\r
+ if (mask(y, x_shifted + 7))\r
+ dst.a7 = op(src1.a7, src2.a7);\r
+ }\r
+ };\r
+\r
template <typename T, typename D, typename UnOp, typename Mask>\r
- static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)\r
+ __global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)\r
{\r
- if (mask(y, x_shifted))\r
- dst.x = op(src.x);\r
- if (mask(y, x_shifted + 1))\r
- dst.y = op(src.y);\r
- if (mask(y, x_shifted + 2))\r
- dst.z = op(src.z);\r
- if (mask(y, x_shifted + 3))\r
- dst.w = op(src.w);\r
- }\r
+ typedef TransformFunctorTraits<UnOp> ft;\r
+ typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;\r
+ typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;\r
\r
- template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
- static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)\r
- {\r
- if (mask(y, x_shifted))\r
- dst.x = op(src1.x, src2.x);\r
- if (mask(y, x_shifted + 1))\r
- dst.y = op(src1.y, src2.y);\r
- if (mask(y, x_shifted + 2))\r
- dst.z = op(src1.z, src2.z);\r
- if (mask(y, x_shifted + 3))\r
- dst.w = op(src1.w, src2.w);\r
+ const int x = threadIdx.x + blockIdx.x * blockDim.x;\r
+ const int y = threadIdx.y + blockIdx.y * blockDim.y;\r
+ const int x_shifted = x * ft::smart_shift;\r
+\r
+ if (y < src_.rows)\r
+ {\r
+ const T* src = src_.ptr(y);\r
+ D* dst = dst_.ptr(y);\r
+\r
+ if (x_shifted + ft::smart_shift - 1 < src_.cols)\r
+ {\r
+ const read_type src_n_el = ((const read_type*)src)[x];\r
+ write_type dst_n_el;\r
+\r
+ OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);\r
+\r
+ ((write_type*)dst)[x] = dst_n_el;\r
+ }\r
+ else\r
+ {\r
+ for (int real_x = x_shifted; real_x < src_.cols; ++real_x)\r
+ {\r
+ if (mask(y, real_x))\r
+ dst[real_x] = op(src[real_x]);\r
+ }\r
+ }\r
+ }\r
}\r
- };\r
- template <> struct OpUnroller<8>\r
- {\r
+\r
template <typename T, typename D, typename UnOp, typename Mask>\r
- static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)\r
+ static __global__ void transformSimple(const DevMem2D_<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)\r
{\r
- if (mask(y, x_shifted))\r
- dst.a0 = op(src.a0);\r
- if (mask(y, x_shifted + 1))\r
- dst.a1 = op(src.a1);\r
- if (mask(y, x_shifted + 2))\r
- dst.a2 = op(src.a2);\r
- if (mask(y, x_shifted + 3))\r
- dst.a3 = op(src.a3);\r
- if (mask(y, x_shifted + 4))\r
- dst.a4 = op(src.a4);\r
- if (mask(y, x_shifted + 5))\r
- dst.a5 = op(src.a5);\r
- if (mask(y, x_shifted + 6))\r
- dst.a6 = op(src.a6);\r
- if (mask(y, x_shifted + 7))\r
- dst.a7 = op(src.a7);\r
+ const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+ const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+ if (x < src.cols && y < src.rows && mask(y, x))\r
+ {\r
+ dst.ptr(y)[x] = op(src.ptr(y)[x]);\r
+ }\r
}\r
\r
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
- static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)\r
+ __global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_, \r
+ const Mask mask, const BinOp op)\r
{\r
- if (mask(y, x_shifted))\r
- dst.a0 = op(src1.a0, src2.a0);\r
- if (mask(y, x_shifted + 1))\r
- dst.a1 = op(src1.a1, src2.a1);\r
- if (mask(y, x_shifted + 2))\r
- dst.a2 = op(src1.a2, src2.a2);\r
- if (mask(y, x_shifted + 3))\r
- dst.a3 = op(src1.a3, src2.a3);\r
- if (mask(y, x_shifted + 4))\r
- dst.a4 = op(src1.a4, src2.a4);\r
- if (mask(y, x_shifted + 5))\r
- dst.a5 = op(src1.a5, src2.a5);\r
- if (mask(y, x_shifted + 6))\r
- dst.a6 = op(src1.a6, src2.a6);\r
- if (mask(y, x_shifted + 7))\r
- dst.a7 = op(src1.a7, src2.a7);\r
- }\r
- };\r
-\r
- template <typename T, typename D, typename UnOp, typename Mask>\r
- __global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)\r
- {\r
- typedef TransformFunctorTraits<UnOp> ft;\r
- typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;\r
- typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;\r
-\r
- const int x = threadIdx.x + blockIdx.x * blockDim.x;\r
- const int y = threadIdx.y + blockIdx.y * blockDim.y;\r
- const int x_shifted = x * ft::smart_shift;\r
+ typedef TransformFunctorTraits<BinOp> ft;\r
+ typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;\r
+ typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;\r
+ typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;\r
\r
- if (y < src_.rows)\r
- {\r
- const T* src = src_.ptr(y);\r
- D* dst = dst_.ptr(y);\r
+ const int x = threadIdx.x + blockIdx.x * blockDim.x;\r
+ const int y = threadIdx.y + blockIdx.y * blockDim.y;\r
+ const int x_shifted = x * ft::smart_shift;\r
\r
- if (x_shifted + ft::smart_shift - 1 < src_.cols)\r
+ if (y < src1_.rows)\r
{\r
- const read_type src_n_el = ((const read_type*)src)[x];\r
- write_type dst_n_el;\r
+ const T1* src1 = src1_.ptr(y);\r
+ const T2* src2 = src2_.ptr(y);\r
+ D* dst = dst_.ptr(y);\r
\r
- OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);\r
+ if (x_shifted + ft::smart_shift - 1 < src1_.cols)\r
+ {\r
+ const read_type1 src1_n_el = ((const read_type1*)src1)[x];\r
+ const read_type2 src2_n_el = ((const read_type2*)src2)[x];\r
+ write_type dst_n_el;\r
+ \r
+ OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);\r
\r
- ((write_type*)dst)[x] = dst_n_el;\r
- }\r
- else\r
- {\r
- for (int real_x = x_shifted; real_x < src_.cols; ++real_x)\r
+ ((write_type*)dst)[x] = dst_n_el;\r
+ }\r
+ else\r
{\r
- if (mask(y, real_x))\r
- dst[real_x] = op(src[real_x]);\r
+ for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)\r
+ {\r
+ if (mask(y, real_x))\r
+ dst[real_x] = op(src1[real_x], src2[real_x]);\r
+ }\r
}\r
}\r
}\r
- }\r
\r
- template <typename T, typename D, typename UnOp, typename Mask>\r
- static __global__ void transformSimple(const DevMem2D_<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)\r
- {\r
- const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
- const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
-\r
- if (x < src.cols && y < src.rows && mask(y, x))\r
+ template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
+ static __global__ void transformSimple(const DevMem2D_<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst, \r
+ const Mask mask, const BinOp op)\r
{\r
- dst.ptr(y)[x] = op(src.ptr(y)[x]);\r
+ const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+ const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+ if (x < src1.cols && y < src1.rows && mask(y, x))\r
+ {\r
+ const T1 src1_data = src1.ptr(y)[x];\r
+ const T2 src2_data = src2.ptr(y)[x];\r
+ dst.ptr(y)[x] = op(src1_data, src2_data);\r
+ }\r
}\r
- }\r
\r
- template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
- __global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_, \r
- const Mask mask, const BinOp op)\r
- {\r
- typedef TransformFunctorTraits<BinOp> ft;\r
- typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;\r
- typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;\r
- typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;\r
+ template <bool UseSmart> struct TransformDispatcher;\r
+ template<> struct TransformDispatcher<false>\r
+ {\r
+ template <typename T, typename D, typename UnOp, typename Mask>\r
+ static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)\r
+ {\r
+ typedef TransformFunctorTraits<UnOp> ft;\r
\r
- const int x = threadIdx.x + blockIdx.x * blockDim.x;\r
- const int y = threadIdx.y + blockIdx.y * blockDim.y;\r
- const int x_shifted = x * ft::smart_shift;\r
+ const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);\r
+ const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1); \r
\r
- if (y < src1_.rows)\r
- {\r
- const T1* src1 = src1_.ptr(y);\r
- const T2* src2 = src2_.ptr(y);\r
- D* dst = dst_.ptr(y);\r
+ transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() ); \r
+ }\r
\r
- if (x_shifted + ft::smart_shift - 1 < src1_.cols)\r
+ template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
+ static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)\r
{\r
- const read_type1 src1_n_el = ((const read_type1*)src1)[x];\r
- const read_type2 src2_n_el = ((const read_type2*)src2)[x];\r
- write_type dst_n_el;\r
- \r
- OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);\r
+ typedef TransformFunctorTraits<BinOp> ft;\r
\r
- ((write_type*)dst)[x] = dst_n_el;\r
+ const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);\r
+ const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1); \r
+\r
+ transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() ); \r
}\r
- else\r
+ };\r
+ template<> struct TransformDispatcher<true>\r
+ {\r
+ template <typename T, typename D, typename UnOp, typename Mask>\r
+ static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)\r
{\r
- for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)\r
- {\r
- if (mask(y, real_x))\r
- dst[real_x] = op(src1[real_x], src2[real_x]);\r
- }\r
- }\r
- }\r
- }\r
+ typedef TransformFunctorTraits<UnOp> ft;\r
\r
- template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
- static __global__ void transformSimple(const DevMem2D_<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst, \r
- const Mask mask, const BinOp op)\r
- {\r
- const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
- const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+ StaticAssert<ft::smart_shift != 1>::check();\r
\r
- if (x < src1.cols && y < src1.rows && mask(y, x))\r
- {\r
- const T1 src1_data = src1.ptr(y)[x];\r
- const T2 src2_data = src2.ptr(y)[x];\r
- dst.ptr(y)[x] = op(src1_data, src2_data);\r
- }\r
- }\r
+ const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);\r
+ const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1); \r
\r
- template <bool UseSmart> struct TransformDispatcher;\r
- template<> struct TransformDispatcher<false>\r
- {\r
- template <typename T, typename D, typename UnOp, typename Mask>\r
- static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)\r
- {\r
- typedef TransformFunctorTraits<UnOp> ft;\r
+ transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);\r
- const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1); \r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
\r
- transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);\r
- cudaSafeCall( cudaGetLastError() );\r
+ template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
+ static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)\r
+ {\r
+ typedef TransformFunctorTraits<BinOp> ft;\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() ); \r
- }\r
+ StaticAssert<ft::smart_shift != 1>::check();\r
\r
- template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
- static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)\r
- {\r
- typedef TransformFunctorTraits<BinOp> ft;\r
+ const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);\r
+ const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1); \r
\r
- const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);\r
- const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1); \r
+ transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);\r
+ cudaSafeCall( cudaGetLastError() );\r
\r
- transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);\r
- cudaSafeCall( cudaGetLastError() );\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() ); \r
+ }\r
+ }; \r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() ); \r
- }\r
- };\r
- template<> struct TransformDispatcher<true>\r
- {\r
template <typename T, typename D, typename UnOp, typename Mask>\r
- static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)\r
+ static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)\r
{\r
typedef TransformFunctorTraits<UnOp> ft;\r
-\r
- StaticAssert<ft::smart_shift != 1>::check();\r
-\r
- const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);\r
- const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1); \r
-\r
- transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream);\r
}\r
\r
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
- static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)\r
+ static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)\r
{\r
typedef TransformFunctorTraits<BinOp> ft;\r
-\r
- StaticAssert<ft::smart_shift != 1>::check();\r
-\r
- const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);\r
- const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1); \r
-\r
- transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);\r
- cudaSafeCall( cudaGetLastError() );\r
-\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() ); \r
+ TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream);\r
}\r
- }; \r
-\r
- template <typename T, typename D, typename UnOp, typename Mask>\r
- static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)\r
- {\r
- typedef TransformFunctorTraits<UnOp> ft;\r
- TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream);\r
- }\r
-\r
- template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
- static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)\r
- {\r
- typedef TransformFunctorTraits<BinOp> ft;\r
- TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream);\r
- }\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ } // namespace transform_detail\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif // __OPENCV_GPU_TRANSFORM_DETAIL_HPP__\r
#include "internal_shared.hpp"\r
#include "../vec_traits.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace detail\r
+namespace cv { namespace gpu { namespace device \r
{\r
- template <bool, typename T1, typename T2> struct Select { typedef T1 type; };\r
- template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; };\r
-\r
- template <typename T> struct IsSignedIntergral { enum {value = 0}; };\r
- template <> struct IsSignedIntergral<schar> { enum {value = 1}; };\r
- template <> struct IsSignedIntergral<char1> { enum {value = 1}; };\r
- template <> struct IsSignedIntergral<short> { enum {value = 1}; };\r
- template <> struct IsSignedIntergral<short1> { enum {value = 1}; };\r
- template <> struct IsSignedIntergral<int> { enum {value = 1}; };\r
- template <> struct IsSignedIntergral<int1> { enum {value = 1}; };\r
+ namespace type_traits_detail\r
+ {\r
+ template <bool, typename T1, typename T2> struct Select { typedef T1 type; };\r
+ template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; };\r
\r
- template <typename T> struct IsUnsignedIntegral { enum {value = 0}; };\r
- template <> struct IsUnsignedIntegral<uchar> { enum {value = 1}; };\r
- template <> struct IsUnsignedIntegral<uchar1> { enum {value = 1}; };\r
- template <> struct IsUnsignedIntegral<ushort> { enum {value = 1}; };\r
- template <> struct IsUnsignedIntegral<ushort1> { enum {value = 1}; };\r
- template <> struct IsUnsignedIntegral<uint> { enum {value = 1}; };\r
- template <> struct IsUnsignedIntegral<uint1> { enum {value = 1}; };\r
+ template <typename T> struct IsSignedIntergral { enum {value = 0}; };\r
+ template <> struct IsSignedIntergral<schar> { enum {value = 1}; };\r
+ template <> struct IsSignedIntergral<char1> { enum {value = 1}; };\r
+ template <> struct IsSignedIntergral<short> { enum {value = 1}; };\r
+ template <> struct IsSignedIntergral<short1> { enum {value = 1}; };\r
+ template <> struct IsSignedIntergral<int> { enum {value = 1}; };\r
+ template <> struct IsSignedIntergral<int1> { enum {value = 1}; };\r
\r
- template <typename T> struct IsIntegral { enum {value = IsSignedIntergral<T>::value || IsUnsignedIntegral<T>::value}; };\r
- template <> struct IsIntegral<char> { enum {value = 1}; };\r
- template <> struct IsIntegral<bool> { enum {value = 1}; };\r
+ template <typename T> struct IsUnsignedIntegral { enum {value = 0}; };\r
+ template <> struct IsUnsignedIntegral<uchar> { enum {value = 1}; };\r
+ template <> struct IsUnsignedIntegral<uchar1> { enum {value = 1}; };\r
+ template <> struct IsUnsignedIntegral<ushort> { enum {value = 1}; };\r
+ template <> struct IsUnsignedIntegral<ushort1> { enum {value = 1}; };\r
+ template <> struct IsUnsignedIntegral<uint> { enum {value = 1}; };\r
+ template <> struct IsUnsignedIntegral<uint1> { enum {value = 1}; };\r
\r
- template <typename T> struct IsFloat { enum {value = 0}; };\r
- template <> struct IsFloat<float> { enum {value = 1}; };\r
- template <> struct IsFloat<double> { enum {value = 1}; };\r
+ template <typename T> struct IsIntegral { enum {value = IsSignedIntergral<T>::value || IsUnsignedIntegral<T>::value}; };\r
+ template <> struct IsIntegral<char> { enum {value = 1}; };\r
+ template <> struct IsIntegral<bool> { enum {value = 1}; };\r
\r
- template <typename T> struct IsVec { enum {value = 0}; };\r
- template <> struct IsVec<uchar1> { enum {value = 1}; };\r
- template <> struct IsVec<uchar2> { enum {value = 1}; };\r
- template <> struct IsVec<uchar3> { enum {value = 1}; };\r
- template <> struct IsVec<uchar4> { enum {value = 1}; };\r
- template <> struct IsVec<uchar8> { enum {value = 1}; };\r
- template <> struct IsVec<char1> { enum {value = 1}; };\r
- template <> struct IsVec<char2> { enum {value = 1}; };\r
- template <> struct IsVec<char3> { enum {value = 1}; };\r
- template <> struct IsVec<char4> { enum {value = 1}; };\r
- template <> struct IsVec<char8> { enum {value = 1}; };\r
- template <> struct IsVec<ushort1> { enum {value = 1}; };\r
- template <> struct IsVec<ushort2> { enum {value = 1}; };\r
- template <> struct IsVec<ushort3> { enum {value = 1}; };\r
- template <> struct IsVec<ushort4> { enum {value = 1}; };\r
- template <> struct IsVec<ushort8> { enum {value = 1}; };\r
- template <> struct IsVec<short1> { enum {value = 1}; };\r
- template <> struct IsVec<short2> { enum {value = 1}; };\r
- template <> struct IsVec<short3> { enum {value = 1}; };\r
- template <> struct IsVec<short4> { enum {value = 1}; };\r
- template <> struct IsVec<short8> { enum {value = 1}; };\r
- template <> struct IsVec<uint1> { enum {value = 1}; };\r
- template <> struct IsVec<uint2> { enum {value = 1}; };\r
- template <> struct IsVec<uint3> { enum {value = 1}; };\r
- template <> struct IsVec<uint4> { enum {value = 1}; };\r
- template <> struct IsVec<uint8> { enum {value = 1}; };\r
- template <> struct IsVec<int1> { enum {value = 1}; };\r
- template <> struct IsVec<int2> { enum {value = 1}; };\r
- template <> struct IsVec<int3> { enum {value = 1}; };\r
- template <> struct IsVec<int4> { enum {value = 1}; };\r
- template <> struct IsVec<int8> { enum {value = 1}; };\r
- template <> struct IsVec<float1> { enum {value = 1}; };\r
- template <> struct IsVec<float2> { enum {value = 1}; };\r
- template <> struct IsVec<float3> { enum {value = 1}; };\r
- template <> struct IsVec<float4> { enum {value = 1}; };\r
- template <> struct IsVec<float8> { enum {value = 1}; };\r
- template <> struct IsVec<double1> { enum {value = 1}; };\r
- template <> struct IsVec<double2> { enum {value = 1}; };\r
- template <> struct IsVec<double3> { enum {value = 1}; };\r
- template <> struct IsVec<double4> { enum {value = 1}; };\r
- template <> struct IsVec<double8> { enum {value = 1}; };\r
+ template <typename T> struct IsFloat { enum {value = 0}; };\r
+ template <> struct IsFloat<float> { enum {value = 1}; };\r
+ template <> struct IsFloat<double> { enum {value = 1}; };\r
\r
- template <class U> struct AddParameterType { typedef const U& type; };\r
- template <class U> struct AddParameterType<U&> { typedef U& type; };\r
- template <> struct AddParameterType<void> { typedef void type; };\r
+ template <typename T> struct IsVec { enum {value = 0}; };\r
+ template <> struct IsVec<uchar1> { enum {value = 1}; };\r
+ template <> struct IsVec<uchar2> { enum {value = 1}; };\r
+ template <> struct IsVec<uchar3> { enum {value = 1}; };\r
+ template <> struct IsVec<uchar4> { enum {value = 1}; };\r
+ template <> struct IsVec<uchar8> { enum {value = 1}; };\r
+ template <> struct IsVec<char1> { enum {value = 1}; };\r
+ template <> struct IsVec<char2> { enum {value = 1}; };\r
+ template <> struct IsVec<char3> { enum {value = 1}; };\r
+ template <> struct IsVec<char4> { enum {value = 1}; };\r
+ template <> struct IsVec<char8> { enum {value = 1}; };\r
+ template <> struct IsVec<ushort1> { enum {value = 1}; };\r
+ template <> struct IsVec<ushort2> { enum {value = 1}; };\r
+ template <> struct IsVec<ushort3> { enum {value = 1}; };\r
+ template <> struct IsVec<ushort4> { enum {value = 1}; };\r
+ template <> struct IsVec<ushort8> { enum {value = 1}; };\r
+ template <> struct IsVec<short1> { enum {value = 1}; };\r
+ template <> struct IsVec<short2> { enum {value = 1}; };\r
+ template <> struct IsVec<short3> { enum {value = 1}; };\r
+ template <> struct IsVec<short4> { enum {value = 1}; };\r
+ template <> struct IsVec<short8> { enum {value = 1}; };\r
+ template <> struct IsVec<uint1> { enum {value = 1}; };\r
+ template <> struct IsVec<uint2> { enum {value = 1}; };\r
+ template <> struct IsVec<uint3> { enum {value = 1}; };\r
+ template <> struct IsVec<uint4> { enum {value = 1}; };\r
+ template <> struct IsVec<uint8> { enum {value = 1}; };\r
+ template <> struct IsVec<int1> { enum {value = 1}; };\r
+ template <> struct IsVec<int2> { enum {value = 1}; };\r
+ template <> struct IsVec<int3> { enum {value = 1}; };\r
+ template <> struct IsVec<int4> { enum {value = 1}; };\r
+ template <> struct IsVec<int8> { enum {value = 1}; };\r
+ template <> struct IsVec<float1> { enum {value = 1}; };\r
+ template <> struct IsVec<float2> { enum {value = 1}; };\r
+ template <> struct IsVec<float3> { enum {value = 1}; };\r
+ template <> struct IsVec<float4> { enum {value = 1}; };\r
+ template <> struct IsVec<float8> { enum {value = 1}; };\r
+ template <> struct IsVec<double1> { enum {value = 1}; };\r
+ template <> struct IsVec<double2> { enum {value = 1}; };\r
+ template <> struct IsVec<double3> { enum {value = 1}; };\r
+ template <> struct IsVec<double4> { enum {value = 1}; };\r
+ template <> struct IsVec<double8> { enum {value = 1}; };\r
\r
- template <class U> struct ReferenceTraits \r
- {\r
- enum { value = false };\r
- typedef U type;\r
- }; \r
- template <class U> struct ReferenceTraits<U&>\r
- {\r
- enum { value = true };\r
- typedef U type;\r
- };\r
- \r
- template <class U> struct PointerTraits\r
- {\r
- enum { value = false };\r
- typedef void type;\r
- }; \r
- template <class U> struct PointerTraits<U*>\r
- {\r
- enum { value = true };\r
- typedef U type;\r
- }; \r
- template <class U> struct PointerTraits<U*&>\r
- {\r
- enum { value = true };\r
- typedef U type;\r
- };\r
- \r
- template <class U> struct UnConst\r
- {\r
- typedef U type;\r
- enum { value = 0 };\r
- }; \r
- template <class U> struct UnConst<const U>\r
- {\r
- typedef U type;\r
- enum { value = 1 };\r
- };\r
- template <class U> struct UnConst<const U&>\r
- {\r
- typedef U& type;\r
- enum { value = 1 };\r
- };\r
+ template <class U> struct AddParameterType { typedef const U& type; };\r
+ template <class U> struct AddParameterType<U&> { typedef U& type; };\r
+ template <> struct AddParameterType<void> { typedef void type; };\r
\r
- template <class U> struct UnVolatile\r
- {\r
- typedef U type;\r
- enum { value = 0 };\r
- }; \r
- template <class U> struct UnVolatile<volatile U>\r
- {\r
- typedef U type;\r
- enum { value = 1 };\r
- };\r
- template <class U> struct UnVolatile<volatile U&>\r
- {\r
- typedef U& type;\r
- enum { value = 1 };\r
- };\r
-}\r
+ template <class U> struct ReferenceTraits \r
+ {\r
+ enum { value = false };\r
+ typedef U type;\r
+ }; \r
+ template <class U> struct ReferenceTraits<U&>\r
+ {\r
+ enum { value = true };\r
+ typedef U type;\r
+ };\r
+ \r
+ template <class U> struct PointerTraits\r
+ {\r
+ enum { value = false };\r
+ typedef void type;\r
+ }; \r
+ template <class U> struct PointerTraits<U*>\r
+ {\r
+ enum { value = true };\r
+ typedef U type;\r
+ }; \r
+ template <class U> struct PointerTraits<U*&>\r
+ {\r
+ enum { value = true };\r
+ typedef U type;\r
+ };\r
+ \r
+ template <class U> struct UnConst\r
+ {\r
+ typedef U type;\r
+ enum { value = 0 };\r
+ }; \r
+ template <class U> struct UnConst<const U>\r
+ {\r
+ typedef U type;\r
+ enum { value = 1 };\r
+ };\r
+ template <class U> struct UnConst<const U&>\r
+ {\r
+ typedef U& type;\r
+ enum { value = 1 };\r
+ };\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template <class U> struct UnVolatile\r
+ {\r
+ typedef U type;\r
+ enum { value = 0 };\r
+ }; \r
+ template <class U> struct UnVolatile<volatile U>\r
+ {\r
+ typedef U type;\r
+ enum { value = 1 };\r
+ };\r
+ template <class U> struct UnVolatile<volatile U&>\r
+ {\r
+ typedef U& type;\r
+ enum { value = 1 };\r
+ };\r
+ } // namespace type_traits_detail\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif // __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__\r
\r
#include "internal_shared.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace detail\r
+namespace cv { namespace gpu { namespace device \r
{\r
- ///////////////////////////////////////////////////////////////////////////////\r
- // Reduction\r
-\r
- template <int n> struct WarpReductor\r
+ namespace utility_detail\r
{\r
- template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
- {\r
- if (tid < n)\r
- data[tid] = partial_reduction; \r
- if (n > 32) __syncthreads();\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Reduction\r
\r
- if (n > 32)\r
+ template <int n> struct WarpReductor\r
+ {\r
+ template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
{\r
- if (tid < n - 32) \r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);\r
- if (tid < 16)\r
+ if (tid < n)\r
+ data[tid] = partial_reduction; \r
+ if (n > 32) __syncthreads();\r
+\r
+ if (n > 32)\r
{\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);\r
+ if (tid < n - 32) \r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);\r
+ if (tid < 16)\r
+ {\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);\r
+ }\r
}\r
- }\r
- else if (n > 16)\r
- {\r
- if (tid < n - 16) \r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
- if (tid < 8)\r
+ else if (n > 16)\r
{\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);\r
+ if (tid < n - 16) \r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
+ if (tid < 8)\r
+ {\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);\r
+ }\r
}\r
- }\r
- else if (n > 8)\r
- {\r
- if (tid < n - 8) \r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);\r
- if (tid < 4)\r
+ else if (n > 8)\r
{\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);\r
+ if (tid < n - 8) \r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);\r
+ if (tid < 4)\r
+ {\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);\r
+ }\r
}\r
- }\r
- else if (n > 4)\r
- {\r
- if (tid < n - 4) \r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);\r
- if (tid < 2)\r
+ else if (n > 4)\r
{\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);\r
- }\r
- } \r
- else if (n > 2)\r
- {\r
- if (tid < n - 2) \r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);\r
- if (tid < 2)\r
+ if (tid < n - 4) \r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);\r
+ if (tid < 2)\r
+ {\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);\r
+ }\r
+ } \r
+ else if (n > 2)\r
{\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);\r
- }\r
- } \r
- }\r
- };\r
- template <> struct WarpReductor<64>\r
- {\r
- template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
- {\r
- data[tid] = partial_reduction;\r
- __syncthreads();\r
- \r
- if (tid < 32) \r
- {\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); \r
- }\r
- }\r
- };\r
- template <> struct WarpReductor<32>\r
- {\r
- template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
- {\r
- data[tid] = partial_reduction;\r
- \r
- if (tid < 16) \r
- {\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); \r
- }\r
- }\r
- };\r
- template <> struct WarpReductor<16>\r
- {\r
- template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
- {\r
- data[tid] = partial_reduction;\r
- \r
- if (tid < 8) \r
- {\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); \r
+ if (tid < n - 2) \r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);\r
+ if (tid < 2)\r
+ {\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);\r
+ }\r
+ } \r
}\r
- }\r
- };\r
- template <> struct WarpReductor<8>\r
- {\r
- template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
+ };\r
+ template <> struct WarpReductor<64>\r
{\r
- data[tid] = partial_reduction;\r
- \r
- if (tid < 4) \r
+ template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
{\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); \r
- }\r
- }\r
- };\r
-\r
- template <bool warp> struct ReductionDispatcher;\r
- template <> struct ReductionDispatcher<true>\r
- {\r
- template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
- {\r
- WarpReductor<n>::reduce(data, partial_reduction, tid, op);\r
- }\r
- };\r
- template <> struct ReductionDispatcher<false>\r
- {\r
- template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
- {\r
- if (tid < n)\r
data[tid] = partial_reduction;\r
- __syncthreads();\r
-\r
-\r
- if (n == 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); }\r
- if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); }\r
- if (n >= 128) { if (tid < 64) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 64]); } __syncthreads(); }\r
-\r
- if (tid < 32)\r
- {\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);\r
- data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);\r
- }\r
- }\r
- };\r
-\r
- ///////////////////////////////////////////////////////////////////////////////\r
- // PredValWarpReductor\r
- \r
- template <int n> struct PredValWarpReductor;\r
- template <> struct PredValWarpReductor<64>\r
- {\r
- template <typename T, typename V, typename Pred> \r
- static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
- {\r
- if (tid < 32)\r
- {\r
- myData = sdata[tid];\r
- myVal = sval[tid];\r
-\r
- T reg = sdata[tid + 32];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 32];\r
- }\r
-\r
- reg = sdata[tid + 16];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 16];\r
- }\r
-\r
- reg = sdata[tid + 8];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 8];\r
- }\r
-\r
- reg = sdata[tid + 4];\r
- if (pred(reg, myData))\r
+ __syncthreads();\r
+ \r
+ if (tid < 32) \r
{\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 4];\r
- }\r
- \r
- reg = sdata[tid + 2];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 2];\r
- }\r
- \r
- reg = sdata[tid + 1];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 1];\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); \r
}\r
}\r
- }\r
- };\r
- template <> struct PredValWarpReductor<32>\r
- {\r
- template <typename T, typename V, typename Pred> \r
- static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
+ };\r
+ template <> struct WarpReductor<32>\r
{\r
- if (tid < 16)\r
+ template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
{\r
- myData = sdata[tid];\r
- myVal = sval[tid];\r
-\r
- T reg = sdata[tid + 16];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 16];\r
- }\r
-\r
- reg = sdata[tid + 8];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 8];\r
- }\r
-\r
- reg = sdata[tid + 4];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 4];\r
- }\r
- \r
- reg = sdata[tid + 2];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 2];\r
- }\r
- \r
- reg = sdata[tid + 1];\r
- if (pred(reg, myData))\r
+ data[tid] = partial_reduction;\r
+ \r
+ if (tid < 16) \r
{\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 1];\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); \r
}\r
}\r
- }\r
- };\r
-\r
- template <> struct PredValWarpReductor<16>\r
- {\r
- template <typename T, typename V, typename Pred> \r
- static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
+ };\r
+ template <> struct WarpReductor<16>\r
{\r
- if (tid < 8)\r
+ template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
{\r
- myData = sdata[tid];\r
- myVal = sval[tid];\r
-\r
- T reg = reg = sdata[tid + 8];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 8];\r
- }\r
-\r
- reg = sdata[tid + 4];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 4];\r
- }\r
- \r
- reg = sdata[tid + 2];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 2];\r
- }\r
- \r
- reg = sdata[tid + 1];\r
- if (pred(reg, myData))\r
+ data[tid] = partial_reduction;\r
+ \r
+ if (tid < 8) \r
{\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 1];\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); \r
}\r
}\r
- }\r
- };\r
- template <> struct PredValWarpReductor<8>\r
- {\r
- template <typename T, typename V, typename Pred> \r
- static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
+ };\r
+ template <> struct WarpReductor<8>\r
{\r
- if (tid < 4)\r
+ template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
{\r
- myData = sdata[tid];\r
- myVal = sval[tid];\r
-\r
- T reg = reg = sdata[tid + 4];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 4];\r
- }\r
- \r
- reg = sdata[tid + 2];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 2];\r
- }\r
- \r
- reg = sdata[tid + 1];\r
- if (pred(reg, myData))\r
+ data[tid] = partial_reduction;\r
+ \r
+ if (tid < 4) \r
{\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 1];\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); \r
}\r
}\r
- }\r
- };\r
+ };\r
\r
- template <bool warp> struct PredValReductionDispatcher;\r
- template <> struct PredValReductionDispatcher<true>\r
- {\r
- template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
- {\r
- PredValWarpReductor<n>::reduce(myData, myVal, sdata, sval, tid, pred);\r
- }\r
- };\r
- template <> struct PredValReductionDispatcher<false>\r
- {\r
- template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
+ template <bool warp> struct ReductionDispatcher;\r
+ template <> struct ReductionDispatcher<true>\r
{\r
- myData = sdata[tid];\r
- myVal = sval[tid];\r
-\r
- if (n >= 512 && tid < 256) \r
+ template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
{\r
- T reg = sdata[tid + 256];\r
-\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 256];\r
- }\r
- __syncthreads(); \r
+ WarpReductor<n>::reduce(data, partial_reduction, tid, op);\r
}\r
- if (n >= 256 && tid < 128) \r
+ };\r
+ template <> struct ReductionDispatcher<false>\r
+ {\r
+ template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
{\r
- T reg = sdata[tid + 128];\r
+ if (tid < n)\r
+ data[tid] = partial_reduction;\r
+ __syncthreads();\r
+\r
+\r
+ if (n == 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); }\r
+ if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); }\r
+ if (n >= 128) { if (tid < 64) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 64]); } __syncthreads(); }\r
\r
- if (pred(reg, myData))\r
+ if (tid < 32)\r
{\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 128];\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);\r
+ data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);\r
}\r
- __syncthreads(); \r
}\r
- if (n >= 128 && tid < 64) \r
- {\r
- T reg = sdata[tid + 64];\r
+ };\r
\r
- if (pred(reg, myData))\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // PredValWarpReductor\r
+ \r
+ template <int n> struct PredValWarpReductor;\r
+ template <> struct PredValWarpReductor<64>\r
+ {\r
+ template <typename T, typename V, typename Pred> \r
+ static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
+ {\r
+ if (tid < 32)\r
{\r
- sdata[tid] = myData = reg;\r
- sval[tid] = myVal = sval[tid + 64];\r
- }\r
- __syncthreads(); \r
- } \r
+ myData = sdata[tid];\r
+ myVal = sval[tid];\r
\r
- if (tid < 32)\r
- {\r
- if (n >= 64) \r
- { \r
T reg = sdata[tid + 32];\r
-\r
if (pred(reg, myData))\r
{\r
sdata[tid] = myData = reg;\r
sval[tid] = myVal = sval[tid + 32];\r
}\r
- }\r
- if (n >= 32) \r
- { \r
- T reg = sdata[tid + 16];\r
\r
+ reg = sdata[tid + 16];\r
if (pred(reg, myData))\r
{\r
sdata[tid] = myData = reg;\r
sval[tid] = myVal = sval[tid + 16];\r
}\r
- }\r
- if (n >= 16) \r
- { \r
- T reg = sdata[tid + 8];\r
\r
+ reg = sdata[tid + 8];\r
if (pred(reg, myData))\r
{\r
sdata[tid] = myData = reg;\r
sval[tid] = myVal = sval[tid + 8];\r
}\r
- }\r
- if (n >= 8) \r
- { \r
- T reg = sdata[tid + 4];\r
\r
+ reg = sdata[tid + 4];\r
if (pred(reg, myData))\r
{\r
sdata[tid] = myData = reg;\r
sval[tid] = myVal = sval[tid + 4];\r
}\r
- }\r
- if (n >= 4) \r
- { \r
- T reg = sdata[tid + 2];\r
-\r
+ \r
+ reg = sdata[tid + 2];\r
if (pred(reg, myData))\r
{\r
sdata[tid] = myData = reg;\r
sval[tid] = myVal = sval[tid + 2];\r
- } \r
- }\r
- if (n >= 2) \r
- { \r
- T reg = sdata[tid + 1];\r
-\r
+ }\r
+ \r
+ reg = sdata[tid + 1];\r
if (pred(reg, myData))\r
{\r
sdata[tid] = myData = reg;\r
}\r
}\r
}\r
- }\r
- };\r
-\r
- ///////////////////////////////////////////////////////////////////////////////\r
- // PredVal2WarpReductor\r
-\r
- template <int n> struct PredVal2WarpReductor;\r
- template <> struct PredVal2WarpReductor<64>\r
- {\r
- template <typename T, typename V1, typename V2, typename Pred> \r
- static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
+ };\r
+ template <> struct PredValWarpReductor<32>\r
{\r
- if (tid < 32)\r
+ template <typename T, typename V, typename Pred> \r
+ static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
{\r
- myData = sdata[tid];\r
- myVal1 = sval1[tid];\r
- myVal2 = sval2[tid];\r
-\r
- T reg = sdata[tid + 32];\r
- if (pred(reg, myData))\r
+ if (tid < 16)\r
{\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 32];\r
- sval2[tid] = myVal2 = sval2[tid + 32];\r
- }\r
+ myData = sdata[tid];\r
+ myVal = sval[tid];\r
\r
- reg = sdata[tid + 16];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 16];\r
- sval2[tid] = myVal2 = sval2[tid + 16];\r
- }\r
+ T reg = sdata[tid + 16];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 16];\r
+ }\r
\r
- reg = sdata[tid + 8];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 8];\r
- sval2[tid] = myVal2 = sval2[tid + 8];\r
- }\r
+ reg = sdata[tid + 8];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 8];\r
+ }\r
\r
- reg = sdata[tid + 4];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 4];\r
- sval2[tid] = myVal2 = sval2[tid + 4];\r
- }\r
- \r
- reg = sdata[tid + 2];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 2];\r
- sval2[tid] = myVal2 = sval2[tid + 2];\r
- }\r
- \r
- reg = sdata[tid + 1];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 1];\r
- sval2[tid] = myVal2 = sval2[tid + 1];\r
+ reg = sdata[tid + 4];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 4];\r
+ }\r
+ \r
+ reg = sdata[tid + 2];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 2];\r
+ }\r
+ \r
+ reg = sdata[tid + 1];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 1];\r
+ }\r
}\r
}\r
- }\r
- };\r
- template <> struct PredVal2WarpReductor<32>\r
- {\r
- template <typename T, typename V1, typename V2, typename Pred> \r
- static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
+ };\r
+\r
+ template <> struct PredValWarpReductor<16>\r
{\r
- if (tid < 16)\r
+ template <typename T, typename V, typename Pred> \r
+ static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
{\r
- myData = sdata[tid];\r
- myVal1 = sval1[tid];\r
- myVal2 = sval2[tid];\r
-\r
- T reg = sdata[tid + 16];\r
- if (pred(reg, myData))\r
+ if (tid < 8)\r
{\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 16];\r
- sval2[tid] = myVal2 = sval2[tid + 16];\r
- }\r
+ myData = sdata[tid];\r
+ myVal = sval[tid];\r
\r
- reg = sdata[tid + 8];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 8];\r
- sval2[tid] = myVal2 = sval2[tid + 8];\r
- }\r
+ T reg = reg = sdata[tid + 8];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 8];\r
+ }\r
\r
- reg = sdata[tid + 4];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 4];\r
- sval2[tid] = myVal2 = sval2[tid + 4];\r
- }\r
- \r
- reg = sdata[tid + 2];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 2];\r
- sval2[tid] = myVal2 = sval2[tid + 2];\r
- }\r
- \r
- reg = sdata[tid + 1];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 1];\r
- sval2[tid] = myVal2 = sval2[tid + 1];\r
+ reg = sdata[tid + 4];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 4];\r
+ }\r
+ \r
+ reg = sdata[tid + 2];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 2];\r
+ }\r
+ \r
+ reg = sdata[tid + 1];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 1];\r
+ }\r
}\r
}\r
- }\r
- };\r
-\r
- template <> struct PredVal2WarpReductor<16>\r
- {\r
- template <typename T, typename V1, typename V2, typename Pred> \r
- static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
+ };\r
+ template <> struct PredValWarpReductor<8>\r
{\r
- if (tid < 8)\r
+ template <typename T, typename V, typename Pred> \r
+ static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
{\r
- myData = sdata[tid];\r
- myVal1 = sval1[tid];\r
- myVal2 = sval2[tid];\r
-\r
- T reg = reg = sdata[tid + 8];\r
- if (pred(reg, myData))\r
+ if (tid < 4)\r
{\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 8];\r
- sval2[tid] = myVal2 = sval2[tid + 8];\r
- }\r
+ myData = sdata[tid];\r
+ myVal = sval[tid];\r
\r
- reg = sdata[tid + 4];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 4];\r
- sval2[tid] = myVal2 = sval2[tid + 4];\r
- }\r
- \r
- reg = sdata[tid + 2];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 2];\r
- sval2[tid] = myVal2 = sval2[tid + 2];\r
- }\r
- \r
- reg = sdata[tid + 1];\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 1];\r
- sval2[tid] = myVal2 = sval2[tid + 1];\r
+ T reg = reg = sdata[tid + 4];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 4];\r
+ }\r
+ \r
+ reg = sdata[tid + 2];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 2];\r
+ }\r
+ \r
+ reg = sdata[tid + 1];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 1];\r
+ }\r
}\r
}\r
- }\r
- };\r
- template <> struct PredVal2WarpReductor<8>\r
- {\r
- template <typename T, typename V1, typename V2, typename Pred> \r
- static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
+ };\r
+\r
+ template <bool warp> struct PredValReductionDispatcher;\r
+ template <> struct PredValReductionDispatcher<true>\r
{\r
- if (tid < 4)\r
+ template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
+ {\r
+ PredValWarpReductor<n>::reduce(myData, myVal, sdata, sval, tid, pred);\r
+ }\r
+ };\r
+ template <> struct PredValReductionDispatcher<false>\r
+ {\r
+ template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
{\r
myData = sdata[tid];\r
- myVal1 = sval1[tid];\r
- myVal2 = sval2[tid];\r
+ myVal = sval[tid];\r
\r
- T reg = reg = sdata[tid + 4];\r
- if (pred(reg, myData))\r
+ if (n >= 512 && tid < 256) \r
{\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 4];\r
- sval2[tid] = myVal2 = sval2[tid + 4];\r
+ T reg = sdata[tid + 256];\r
+\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 256];\r
+ }\r
+ __syncthreads(); \r
}\r
- \r
- reg = sdata[tid + 2];\r
- if (pred(reg, myData))\r
+ if (n >= 256 && tid < 128) \r
{\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 2];\r
- sval2[tid] = myVal2 = sval2[tid + 2];\r
+ T reg = sdata[tid + 128];\r
+\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 128];\r
+ }\r
+ __syncthreads(); \r
}\r
- \r
- reg = sdata[tid + 1];\r
- if (pred(reg, myData))\r
+ if (n >= 128 && tid < 64) \r
{\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 1];\r
- sval2[tid] = myVal2 = sval2[tid + 1];\r
- }\r
- }\r
- }\r
- };\r
-\r
- template <bool warp> struct PredVal2ReductionDispatcher;\r
- template <> struct PredVal2ReductionDispatcher<true>\r
- {\r
- template <int n, typename T, typename V1, typename V2, typename Pred> \r
- static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
- {\r
- PredVal2WarpReductor<n>::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);\r
- }\r
- };\r
- template <> struct PredVal2ReductionDispatcher<false>\r
- {\r
- template <int n, typename T, typename V1, typename V2, typename Pred> \r
- static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
- {\r
- myData = sdata[tid];\r
- myVal1 = sval1[tid];\r
- myVal2 = sval2[tid];\r
+ T reg = sdata[tid + 64];\r
\r
- if (n >= 512 && tid < 256) \r
- {\r
- T reg = sdata[tid + 256];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 64];\r
+ }\r
+ __syncthreads(); \r
+ } \r
\r
- if (pred(reg, myData))\r
+ if (tid < 32)\r
{\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 256];\r
- sval2[tid] = myVal2 = sval2[tid + 256];\r
- }\r
- __syncthreads(); \r
- }\r
- if (n >= 256 && tid < 128) \r
- {\r
- T reg = sdata[tid + 128];\r
+ if (n >= 64) \r
+ { \r
+ T reg = sdata[tid + 32];\r
\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 128];\r
- sval2[tid] = myVal2 = sval2[tid + 128];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 32];\r
+ }\r
+ }\r
+ if (n >= 32) \r
+ { \r
+ T reg = sdata[tid + 16];\r
+\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 16];\r
+ }\r
+ }\r
+ if (n >= 16) \r
+ { \r
+ T reg = sdata[tid + 8];\r
+\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 8];\r
+ }\r
+ }\r
+ if (n >= 8) \r
+ { \r
+ T reg = sdata[tid + 4];\r
+\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 4];\r
+ }\r
+ }\r
+ if (n >= 4) \r
+ { \r
+ T reg = sdata[tid + 2];\r
+\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 2];\r
+ } \r
+ }\r
+ if (n >= 2) \r
+ { \r
+ T reg = sdata[tid + 1];\r
+\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval[tid] = myVal = sval[tid + 1];\r
+ }\r
+ }\r
}\r
- __syncthreads(); \r
}\r
- if (n >= 128 && tid < 64) \r
- {\r
- T reg = sdata[tid + 64];\r
+ };\r
\r
- if (pred(reg, myData))\r
- {\r
- sdata[tid] = myData = reg;\r
- sval1[tid] = myVal1 = sval1[tid + 64];\r
- sval2[tid] = myVal2 = sval2[tid + 64];\r
- }\r
- __syncthreads(); \r
- } \r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // PredVal2WarpReductor\r
\r
- if (tid < 32)\r
+ template <int n> struct PredVal2WarpReductor;\r
+ template <> struct PredVal2WarpReductor<64>\r
+ {\r
+ template <typename T, typename V1, typename V2, typename Pred> \r
+ static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
{\r
- if (n >= 64) \r
- { \r
- T reg = sdata[tid + 32];\r
+ if (tid < 32)\r
+ {\r
+ myData = sdata[tid];\r
+ myVal1 = sval1[tid];\r
+ myVal2 = sval2[tid];\r
\r
+ T reg = sdata[tid + 32];\r
if (pred(reg, myData))\r
{\r
sdata[tid] = myData = reg;\r
sval1[tid] = myVal1 = sval1[tid + 32];\r
sval2[tid] = myVal2 = sval2[tid + 32];\r
}\r
- }\r
- if (n >= 32) \r
- { \r
- T reg = sdata[tid + 16];\r
\r
+ reg = sdata[tid + 16];\r
if (pred(reg, myData))\r
{\r
sdata[tid] = myData = reg;\r
sval1[tid] = myVal1 = sval1[tid + 16];\r
sval2[tid] = myVal2 = sval2[tid + 16];\r
}\r
- }\r
- if (n >= 16) \r
- { \r
- T reg = sdata[tid + 8];\r
\r
+ reg = sdata[tid + 8];\r
if (pred(reg, myData))\r
{\r
sdata[tid] = myData = reg;\r
sval1[tid] = myVal1 = sval1[tid + 8];\r
sval2[tid] = myVal2 = sval2[tid + 8];\r
}\r
+\r
+ reg = sdata[tid + 4];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 4];\r
+ sval2[tid] = myVal2 = sval2[tid + 4];\r
+ }\r
+ \r
+ reg = sdata[tid + 2];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 2];\r
+ sval2[tid] = myVal2 = sval2[tid + 2];\r
+ }\r
+ \r
+ reg = sdata[tid + 1];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 1];\r
+ sval2[tid] = myVal2 = sval2[tid + 1];\r
+ }\r
}\r
- if (n >= 8) \r
- { \r
- T reg = sdata[tid + 4];\r
+ }\r
+ };\r
+ template <> struct PredVal2WarpReductor<32>\r
+ {\r
+ template <typename T, typename V1, typename V2, typename Pred> \r
+ static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
+ {\r
+ if (tid < 16)\r
+ {\r
+ myData = sdata[tid];\r
+ myVal1 = sval1[tid];\r
+ myVal2 = sval2[tid];\r
+\r
+ T reg = sdata[tid + 16];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 16];\r
+ sval2[tid] = myVal2 = sval2[tid + 16];\r
+ }\r
+\r
+ reg = sdata[tid + 8];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 8];\r
+ sval2[tid] = myVal2 = sval2[tid + 8];\r
+ }\r
\r
+ reg = sdata[tid + 4];\r
if (pred(reg, myData))\r
{\r
sdata[tid] = myData = reg;\r
sval1[tid] = myVal1 = sval1[tid + 4];\r
sval2[tid] = myVal2 = sval2[tid + 4];\r
}\r
+ \r
+ reg = sdata[tid + 2];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 2];\r
+ sval2[tid] = myVal2 = sval2[tid + 2];\r
+ }\r
+ \r
+ reg = sdata[tid + 1];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 1];\r
+ sval2[tid] = myVal2 = sval2[tid + 1];\r
+ }\r
}\r
- if (n >= 4) \r
- { \r
- T reg = sdata[tid + 2];\r
+ }\r
+ };\r
+\r
+ template <> struct PredVal2WarpReductor<16>\r
+ {\r
+ template <typename T, typename V1, typename V2, typename Pred> \r
+ static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
+ {\r
+ if (tid < 8)\r
+ {\r
+ myData = sdata[tid];\r
+ myVal1 = sval1[tid];\r
+ myVal2 = sval2[tid];\r
+\r
+ T reg = reg = sdata[tid + 8];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 8];\r
+ sval2[tid] = myVal2 = sval2[tid + 8];\r
+ }\r
\r
+ reg = sdata[tid + 4];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 4];\r
+ sval2[tid] = myVal2 = sval2[tid + 4];\r
+ }\r
+ \r
+ reg = sdata[tid + 2];\r
if (pred(reg, myData))\r
{\r
sdata[tid] = myData = reg;\r
sval1[tid] = myVal1 = sval1[tid + 2];\r
sval2[tid] = myVal2 = sval2[tid + 2];\r
- } \r
+ }\r
+ \r
+ reg = sdata[tid + 1];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 1];\r
+ sval2[tid] = myVal2 = sval2[tid + 1];\r
+ }\r
}\r
- if (n >= 2) \r
- { \r
- T reg = sdata[tid + 1];\r
+ }\r
+ };\r
+ template <> struct PredVal2WarpReductor<8>\r
+ {\r
+ template <typename T, typename V1, typename V2, typename Pred> \r
+ static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
+ {\r
+ if (tid < 4)\r
+ {\r
+ myData = sdata[tid];\r
+ myVal1 = sval1[tid];\r
+ myVal2 = sval2[tid];\r
\r
+ T reg = reg = sdata[tid + 4];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 4];\r
+ sval2[tid] = myVal2 = sval2[tid + 4];\r
+ }\r
+ \r
+ reg = sdata[tid + 2];\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 2];\r
+ sval2[tid] = myVal2 = sval2[tid + 2];\r
+ }\r
+ \r
+ reg = sdata[tid + 1];\r
if (pred(reg, myData))\r
{\r
sdata[tid] = myData = reg;\r
}\r
}\r
}\r
- }\r
- };\r
-}\r
+ };\r
+\r
+ template <bool warp> struct PredVal2ReductionDispatcher;\r
+ template <> struct PredVal2ReductionDispatcher<true>\r
+ {\r
+ template <int n, typename T, typename V1, typename V2, typename Pred> \r
+ static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
+ {\r
+ PredVal2WarpReductor<n>::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);\r
+ }\r
+ };\r
+ template <> struct PredVal2ReductionDispatcher<false>\r
+ {\r
+ template <int n, typename T, typename V1, typename V2, typename Pred> \r
+ static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
+ {\r
+ myData = sdata[tid];\r
+ myVal1 = sval1[tid];\r
+ myVal2 = sval2[tid];\r
+\r
+ if (n >= 512 && tid < 256) \r
+ {\r
+ T reg = sdata[tid + 256];\r
+\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 256];\r
+ sval2[tid] = myVal2 = sval2[tid + 256];\r
+ }\r
+ __syncthreads(); \r
+ }\r
+ if (n >= 256 && tid < 128) \r
+ {\r
+ T reg = sdata[tid + 128];\r
+\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 128];\r
+ sval2[tid] = myVal2 = sval2[tid + 128];\r
+ }\r
+ __syncthreads(); \r
+ }\r
+ if (n >= 128 && tid < 64) \r
+ {\r
+ T reg = sdata[tid + 64];\r
+\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 64];\r
+ sval2[tid] = myVal2 = sval2[tid + 64];\r
+ }\r
+ __syncthreads(); \r
+ } \r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ if (tid < 32)\r
+ {\r
+ if (n >= 64) \r
+ { \r
+ T reg = sdata[tid + 32];\r
+\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 32];\r
+ sval2[tid] = myVal2 = sval2[tid + 32];\r
+ }\r
+ }\r
+ if (n >= 32) \r
+ { \r
+ T reg = sdata[tid + 16];\r
+\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 16];\r
+ sval2[tid] = myVal2 = sval2[tid + 16];\r
+ }\r
+ }\r
+ if (n >= 16) \r
+ { \r
+ T reg = sdata[tid + 8];\r
+\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 8];\r
+ sval2[tid] = myVal2 = sval2[tid + 8];\r
+ }\r
+ }\r
+ if (n >= 8) \r
+ { \r
+ T reg = sdata[tid + 4];\r
+\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 4];\r
+ sval2[tid] = myVal2 = sval2[tid + 4];\r
+ }\r
+ }\r
+ if (n >= 4) \r
+ { \r
+ T reg = sdata[tid + 2];\r
+\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 2];\r
+ sval2[tid] = myVal2 = sval2[tid + 2];\r
+ } \r
+ }\r
+ if (n >= 2) \r
+ { \r
+ T reg = sdata[tid + 1];\r
+\r
+ if (pred(reg, myData))\r
+ {\r
+ sdata[tid] = myData = reg;\r
+ sval1[tid] = myVal1 = sval1[tid + 1];\r
+ sval2[tid] = myVal2 = sval2[tid + 1];\r
+ }\r
+ }\r
+ }\r
+ }\r
+ };\r
+ } // namespace utility_detail\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif // __OPENCV_GPU_UTILITY_DETAIL_HPP__\r
#include "internal_shared.hpp"\r
#include "../datamov_utils.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace detail\r
+namespace cv { namespace gpu { namespace device \r
{\r
- template <int THREAD_DIM, int N> struct UnrollVecDiffCached\r
+ namespace vec_distance_detail\r
{\r
- template <typename Dist, typename T1, typename T2>\r
- static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)\r
+ template <int THREAD_DIM, int N> struct UnrollVecDiffCached\r
{\r
- if (ind < len)\r
+ template <typename Dist, typename T1, typename T2>\r
+ static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)\r
{\r
- T1 val1 = *vecCached++;\r
+ if (ind < len)\r
+ {\r
+ T1 val1 = *vecCached++;\r
\r
- T2 val2;\r
- ForceGlob<T2>::Load(vecGlob, ind, val2);\r
+ T2 val2;\r
+ ForceGlob<T2>::Load(vecGlob, ind, val2);\r
\r
- dist.reduceIter(val1, val2);\r
+ dist.reduceIter(val1, val2);\r
\r
- UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);\r
+ UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);\r
+ }\r
}\r
- }\r
\r
- template <typename Dist, typename T1, typename T2>\r
- static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)\r
- {\r
- T1 val1 = *vecCached++;\r
+ template <typename Dist, typename T1, typename T2>\r
+ static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)\r
+ {\r
+ T1 val1 = *vecCached++;\r
\r
- T2 val2;\r
- ForceGlob<T2>::Load(vecGlob, 0, val2);\r
- vecGlob += THREAD_DIM;\r
+ T2 val2;\r
+ ForceGlob<T2>::Load(vecGlob, 0, val2);\r
+ vecGlob += THREAD_DIM;\r
\r
- dist.reduceIter(val1, val2);\r
+ dist.reduceIter(val1, val2);\r
\r
- UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);\r
- }\r
- };\r
- template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>\r
- {\r
- template <typename Dist, typename T1, typename T2>\r
- static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)\r
+ UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);\r
+ }\r
+ };\r
+ template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>\r
{\r
- }\r
+ template <typename Dist, typename T1, typename T2>\r
+ static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)\r
+ {\r
+ }\r
\r
- template <typename Dist, typename T1, typename T2>\r
- static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)\r
- {\r
- }\r
- };\r
+ template <typename Dist, typename T1, typename T2>\r
+ static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)\r
+ {\r
+ }\r
+ };\r
\r
- template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;\r
- template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>\r
- {\r
- template <typename Dist, typename T1, typename T2>\r
- static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)\r
+ template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;\r
+ template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>\r
{\r
- UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);\r
- }\r
- };\r
- template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>\r
- {\r
- template <typename Dist, typename T1, typename T2>\r
- static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)\r
+ template <typename Dist, typename T1, typename T2>\r
+ static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)\r
+ {\r
+ UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);\r
+ }\r
+ };\r
+ template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>\r
{\r
- UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);\r
- }\r
- };\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template <typename Dist, typename T1, typename T2>\r
+ static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)\r
+ {\r
+ UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);\r
+ }\r
+ };\r
+ } // namespace vec_distance_detail\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif // __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__\r
#include "internal_shared.hpp"\r
#include "warp_reduce.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-struct Emulation\r
+namespace cv { namespace gpu { namespace device \r
{\r
- static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)\r
- {\r
-#if __CUDA_ARCH__ >= 200\r
- (void)cta_buffer;\r
- return __ballot(predicate);\r
-#else\r
- int tid = threadIdx.x; \r
- cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;\r
- return warp_reduce(cta_buffer);\r
-#endif\r
- }\r
-};\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ struct Emulation\r
+ {\r
+ static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)\r
+ {\r
+ #if __CUDA_ARCH__ >= 200\r
+ (void)cta_buffer;\r
+ return __ballot(predicate);\r
+ #else\r
+ int tid = threadIdx.x; \r
+ cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;\r
+ return warp_reduce(cta_buffer);\r
+ #endif\r
+ }\r
+ };\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif /* OPENCV_GPU_EMULATION_HPP_ */
\ No newline at end of file
#include "vec_traits.hpp"\r
#include "vec_math.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template <typename Ptr2D> struct PointFilter\r
+namespace cv { namespace gpu { namespace device \r
{\r
- typedef typename Ptr2D::elem_type elem_type;\r
- typedef float index_type;\r
-\r
- explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_) : src(src_) {}\r
- \r
- __device__ __forceinline__ elem_type operator ()(float y, float x) const\r
+ template <typename Ptr2D> struct PointFilter\r
{\r
- return src(__float2int_rn(y), __float2int_rn(x));\r
- }\r
+ typedef typename Ptr2D::elem_type elem_type;\r
+ typedef float index_type;\r
\r
- const Ptr2D src;\r
-};\r
-\r
-template <typename Ptr2D> struct LinearFilter\r
-{\r
- typedef typename Ptr2D::elem_type elem_type;\r
- typedef float index_type;\r
+ explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_) : src(src_) {}\r
+ \r
+ __device__ __forceinline__ elem_type operator ()(float y, float x) const\r
+ {\r
+ return src(__float2int_rn(y), __float2int_rn(x));\r
+ }\r
\r
- explicit __host__ __device__ __forceinline__ LinearFilter(const Ptr2D& src_) : src(src_) {}\r
+ const Ptr2D src;\r
+ };\r
\r
- __device__ __forceinline__ elem_type operator ()(float y, float x) const\r
+ template <typename Ptr2D> struct LinearFilter\r
{\r
- typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;\r
+ typedef typename Ptr2D::elem_type elem_type;\r
+ typedef float index_type;\r
\r
- work_type out = VecTraits<work_type>::all(0);\r
+ explicit __host__ __device__ __forceinline__ LinearFilter(const Ptr2D& src_) : src(src_) {}\r
\r
- const int x1 = __float2int_rd(x);\r
- const int y1 = __float2int_rd(y);\r
- const int x2 = x1 + 1;\r
- const int y2 = y1 + 1;\r
+ __device__ __forceinline__ elem_type operator ()(float y, float x) const\r
+ {\r
+ typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;\r
\r
- elem_type src_reg = src(y1, x1);\r
- out = out + src_reg * ((x2 - x) * (y2 - y));\r
+ work_type out = VecTraits<work_type>::all(0);\r
\r
- src_reg = src(y1, x2);\r
- out = out + src_reg * ((x - x1) * (y2 - y));\r
+ const int x1 = __float2int_rd(x);\r
+ const int y1 = __float2int_rd(y);\r
+ const int x2 = x1 + 1;\r
+ const int y2 = y1 + 1;\r
\r
- src_reg = src(y2, x1);\r
- out = out + src_reg * ((x2 - x) * (y - y1));\r
+ elem_type src_reg = src(y1, x1);\r
+ out = out + src_reg * ((x2 - x) * (y2 - y));\r
\r
- src_reg = src(y2, x2);\r
- out = out + src_reg * ((x - x1) * (y - y1));\r
+ src_reg = src(y1, x2);\r
+ out = out + src_reg * ((x - x1) * (y2 - y));\r
\r
- return saturate_cast<elem_type>(out);\r
- }\r
+ src_reg = src(y2, x1);\r
+ out = out + src_reg * ((x2 - x) * (y - y1));\r
\r
- const Ptr2D src;\r
-};\r
+ src_reg = src(y2, x2);\r
+ out = out + src_reg * ((x - x1) * (y - y1));\r
\r
-template <typename Ptr2D> struct CubicFilter\r
-{\r
- typedef typename Ptr2D::elem_type elem_type;\r
- typedef float index_type;\r
- typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;\r
+ return saturate_cast<elem_type>(out);\r
+ }\r
\r
- explicit __host__ __device__ __forceinline__ CubicFilter(const Ptr2D& src_) : src(src_) {}\r
- \r
- static __device__ __forceinline__ work_type cubicInterpolate(const work_type& p0, const work_type& p1, const work_type& p2, const work_type& p3, float x) \r
- {\r
- return p1 + 0.5f * x * (p2 - p0 + x * (2.0f * p0 - 5.0f * p1 + 4.0f * p2 - p3 + x * (3.0f * (p1 - p2) + p3 - p0)));\r
- }\r
+ const Ptr2D src;\r
+ };\r
\r
- __device__ elem_type operator ()(float y, float x) const\r
+ template <typename Ptr2D> struct CubicFilter\r
{\r
- const int xi = __float2int_rn(x);\r
- const int yi = __float2int_rn(y);\r
- \r
- work_type arr[4];\r
- \r
- arr[0] = cubicInterpolate(saturate_cast<work_type>(src(yi - 1, xi - 1)), saturate_cast<work_type>(src(yi - 1, xi)), saturate_cast<work_type>(src(yi - 1, xi + 1)), saturate_cast<work_type>(src(yi - 1, xi + 2)), x - xi);\r
- arr[1] = cubicInterpolate(saturate_cast<work_type>(src(yi , xi - 1)), saturate_cast<work_type>(src(yi , xi)), saturate_cast<work_type>(src(yi , xi + 1)), saturate_cast<work_type>(src(yi , xi + 2)), x - xi);\r
- arr[2] = cubicInterpolate(saturate_cast<work_type>(src(yi + 1, xi - 1)), saturate_cast<work_type>(src(yi + 1, xi)), saturate_cast<work_type>(src(yi + 1, xi + 1)), saturate_cast<work_type>(src(yi + 1, xi + 2)), x - xi);\r
- arr[3] = cubicInterpolate(saturate_cast<work_type>(src(yi + 2, xi - 1)), saturate_cast<work_type>(src(yi + 2, xi)), saturate_cast<work_type>(src(yi + 2, xi + 1)), saturate_cast<work_type>(src(yi + 2, xi + 2)), x - xi);\r
- \r
- return saturate_cast<elem_type>(cubicInterpolate(arr[0], arr[1], arr[2], arr[3], y - yi));\r
- }\r
-\r
- const Ptr2D src;\r
-};\r
+ typedef typename Ptr2D::elem_type elem_type;\r
+ typedef float index_type;\r
+ typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ explicit __host__ __device__ __forceinline__ CubicFilter(const Ptr2D& src_) : src(src_) {}\r
+ \r
+ static __device__ __forceinline__ work_type cubicInterpolate(const work_type& p0, const work_type& p1, const work_type& p2, const work_type& p3, float x) \r
+ {\r
+ return p1 + 0.5f * x * (p2 - p0 + x * (2.0f * p0 - 5.0f * p1 + 4.0f * p2 - p3 + x * (3.0f * (p1 - p2) + p3 - p0)));\r
+ }\r
+\r
+ __device__ elem_type operator ()(float y, float x) const\r
+ {\r
+ const int xi = __float2int_rn(x);\r
+ const int yi = __float2int_rn(y);\r
+ \r
+ work_type arr[4];\r
+ \r
+ arr[0] = cubicInterpolate(saturate_cast<work_type>(src(yi - 1, xi - 1)), saturate_cast<work_type>(src(yi - 1, xi)), saturate_cast<work_type>(src(yi - 1, xi + 1)), saturate_cast<work_type>(src(yi - 1, xi + 2)), x - xi);\r
+ arr[1] = cubicInterpolate(saturate_cast<work_type>(src(yi , xi - 1)), saturate_cast<work_type>(src(yi , xi)), saturate_cast<work_type>(src(yi , xi + 1)), saturate_cast<work_type>(src(yi , xi + 2)), x - xi);\r
+ arr[2] = cubicInterpolate(saturate_cast<work_type>(src(yi + 1, xi - 1)), saturate_cast<work_type>(src(yi + 1, xi)), saturate_cast<work_type>(src(yi + 1, xi + 1)), saturate_cast<work_type>(src(yi + 1, xi + 2)), x - xi);\r
+ arr[3] = cubicInterpolate(saturate_cast<work_type>(src(yi + 2, xi - 1)), saturate_cast<work_type>(src(yi + 2, xi)), saturate_cast<work_type>(src(yi + 2, xi + 1)), saturate_cast<work_type>(src(yi + 2, xi + 2)), x - xi);\r
+ \r
+ return saturate_cast<elem_type>(cubicInterpolate(arr[0], arr[1], arr[2], arr[3], y - yi));\r
+ }\r
+\r
+ const Ptr2D src;\r
+ };\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif // __OPENCV_GPU_FILTERS_HPP__\r
#include <cstdio>\r
#include "internal_shared.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template<class Func> \r
-void printFuncAttrib(Func& func)\r
+namespace cv { namespace gpu { namespace device \r
{\r
+ template<class Func> \r
+ void printFuncAttrib(Func& func)\r
+ {\r
\r
- cudaFuncAttributes attrs;\r
- cudaFuncGetAttributes(&attrs, func); \r
-\r
- printf("=== Function stats ===\n");\r
- printf("Name: \n");\r
- printf("sharedSizeBytes = %d\n", attrs.sharedSizeBytes);\r
- printf("constSizeBytes = %d\n", attrs.constSizeBytes);\r
- printf("localSizeBytes = %d\n", attrs.localSizeBytes);\r
- printf("maxThreadsPerBlock = %d\n", attrs.maxThreadsPerBlock);\r
- printf("numRegs = %d\n", attrs.numRegs);\r
- printf("ptxVersion = %d\n", attrs.ptxVersion);\r
- printf("binaryVersion = %d\n", attrs.binaryVersion);\r
- printf("\n");\r
- fflush(stdout); \r
-}\r
+ cudaFuncAttributes attrs;\r
+ cudaFuncGetAttributes(&attrs, func); \r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ printf("=== Function stats ===\n");\r
+ printf("Name: \n");\r
+ printf("sharedSizeBytes = %d\n", attrs.sharedSizeBytes);\r
+ printf("constSizeBytes = %d\n", attrs.constSizeBytes);\r
+ printf("localSizeBytes = %d\n", attrs.localSizeBytes);\r
+ printf("maxThreadsPerBlock = %d\n", attrs.maxThreadsPerBlock);\r
+ printf("numRegs = %d\n", attrs.numRegs);\r
+ printf("ptxVersion = %d\n", attrs.ptxVersion);\r
+ printf("binaryVersion = %d\n", attrs.binaryVersion);\r
+ printf("\n");\r
+ fflush(stdout); \r
+ }\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif /* __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_ */
\ No newline at end of file
#include "vec_traits.hpp"\r
#include "type_traits.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-// Function Objects\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ // Function Objects\r
\r
-using thrust::unary_function;\r
-using thrust::binary_function;\r
+ using thrust::unary_function;\r
+ using thrust::binary_function;\r
\r
-// Arithmetic Operations\r
+ // Arithmetic Operations\r
\r
-template <typename T> struct plus : binary_function<T, T, T>\r
-{\r
- __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ template <typename T> struct plus : binary_function<T, T, T>\r
{\r
- return a + b;\r
- }\r
-};\r
-template <typename T> struct minus : binary_function<T, T, T>\r
-{\r
- __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ {\r
+ return a + b;\r
+ }\r
+ };\r
+ template <typename T> struct minus : binary_function<T, T, T>\r
{\r
- return a - b;\r
- }\r
-};\r
-template <typename T> struct multiplies : binary_function<T, T, T>\r
-{\r
- __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ {\r
+ return a - b;\r
+ }\r
+ };\r
+ template <typename T> struct multiplies : binary_function<T, T, T>\r
{\r
- return a * b;\r
- }\r
-};\r
-template <typename T> struct divides : binary_function<T, T, T>\r
-{\r
- __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ {\r
+ return a * b;\r
+ }\r
+ };\r
+ template <typename T> struct divides : binary_function<T, T, T>\r
{\r
- return a / b;\r
- }\r
-};\r
-template <typename T> struct modulus : binary_function<T, T, T>\r
-{\r
- __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ {\r
+ return a / b;\r
+ }\r
+ };\r
+ template <typename T> struct modulus : binary_function<T, T, T>\r
{\r
- return a % b;\r
- }\r
-};\r
-template <typename T> struct negate : unary_function<T, T>\r
-{\r
- __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a) const\r
+ __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ {\r
+ return a % b;\r
+ }\r
+ };\r
+ template <typename T> struct negate : unary_function<T, T>\r
{\r
- return -a;\r
- }\r
-};\r
+ __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a) const\r
+ {\r
+ return -a;\r
+ }\r
+ };\r
\r
-// Comparison Operations\r
+ // Comparison Operations\r
\r
-template <typename T> struct equal_to : binary_function<T, T, bool>\r
-{\r
- __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ template <typename T> struct equal_to : binary_function<T, T, bool>\r
{\r
- return a == b;\r
- }\r
-};\r
-template <typename T> struct not_equal_to : binary_function<T, T, bool>\r
-{\r
- __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ {\r
+ return a == b;\r
+ }\r
+ };\r
+ template <typename T> struct not_equal_to : binary_function<T, T, bool>\r
{\r
- return a != b;\r
- }\r
-};\r
-template <typename T> struct greater : binary_function<T, T, bool>\r
-{\r
- __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ {\r
+ return a != b;\r
+ }\r
+ };\r
+ template <typename T> struct greater : binary_function<T, T, bool>\r
{\r
- return a > b;\r
- }\r
-};\r
-template <typename T> struct less : binary_function<T, T, bool>\r
-{\r
- __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ {\r
+ return a > b;\r
+ }\r
+ };\r
+ template <typename T> struct less : binary_function<T, T, bool>\r
{\r
- return a < b;\r
- }\r
-};\r
-template <typename T> struct greater_equal : binary_function<T, T, bool>\r
-{\r
- __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ {\r
+ return a < b;\r
+ }\r
+ };\r
+ template <typename T> struct greater_equal : binary_function<T, T, bool>\r
{\r
- return a >= b;\r
- }\r
-};\r
-template <typename T> struct less_equal : binary_function<T, T, bool>\r
-{\r
- __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ {\r
+ return a >= b;\r
+ }\r
+ };\r
+ template <typename T> struct less_equal : binary_function<T, T, bool>\r
{\r
- return a <= b;\r
- }\r
-};\r
+ __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ {\r
+ return a <= b;\r
+ }\r
+ };\r
\r
-// Logical Operations\r
+ // Logical Operations\r
\r
-template <typename T> struct logical_and : binary_function<T, T, bool>\r
-{\r
- __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ template <typename T> struct logical_and : binary_function<T, T, bool>\r
{\r
- return a && b;\r
- }\r
-};\r
-template <typename T> struct logical_or : binary_function<T, T, bool>\r
-{\r
- __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ {\r
+ return a && b;\r
+ }\r
+ };\r
+ template <typename T> struct logical_or : binary_function<T, T, bool>\r
{\r
- return a || b;\r
- }\r
-};\r
-template <typename T> struct logical_not : unary_function<T, bool>\r
-{\r
- __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a) const\r
+ __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ {\r
+ return a || b;\r
+ }\r
+ };\r
+ template <typename T> struct logical_not : unary_function<T, bool>\r
{\r
- return !a;\r
- }\r
-};\r
+ __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a) const\r
+ {\r
+ return !a;\r
+ }\r
+ };\r
\r
-// Bitwise Operations\r
+ // Bitwise Operations\r
\r
-template <typename T> struct bit_and : binary_function<T, T, T>\r
-{\r
- __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ template <typename T> struct bit_and : binary_function<T, T, T>\r
{\r
- return a & b;\r
- }\r
-};\r
-template <typename T> struct bit_or : binary_function<T, T, T>\r
-{\r
- __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ {\r
+ return a & b;\r
+ }\r
+ };\r
+ template <typename T> struct bit_or : binary_function<T, T, T>\r
{\r
- return a | b;\r
- }\r
-};\r
-template <typename T> struct bit_xor : binary_function<T, T, T>\r
-{\r
- __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ {\r
+ return a | b;\r
+ }\r
+ };\r
+ template <typename T> struct bit_xor : binary_function<T, T, T>\r
{\r
- return a ^ b;\r
- }\r
-};\r
-template <typename T> struct bit_not : unary_function<T, T>\r
-{\r
- __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const \r
+ __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
+ {\r
+ return a ^ b;\r
+ }\r
+ };\r
+ template <typename T> struct bit_not : unary_function<T, T>\r
{\r
- return ~v;\r
- }\r
-};\r
+ __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const \r
+ {\r
+ return ~v;\r
+ }\r
+ };\r
\r
-// Generalized Identity Operations\r
+ // Generalized Identity Operations\r
\r
-template <typename T> struct identity : unary_function<T, T>\r
-{\r
- __device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const \r
+ template <typename T> struct identity : unary_function<T, T>\r
{\r
- return x;\r
- }\r
-};\r
+ __device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const \r
+ {\r
+ return x;\r
+ }\r
+ };\r
\r
-template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>\r
-{\r
- __device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const \r
+ template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>\r
{\r
- return lhs;\r
- }\r
-};\r
-template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>\r
-{\r
- __device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const \r
+ __device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const \r
+ {\r
+ return lhs;\r
+ }\r
+ };\r
+ template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>\r
{\r
- return rhs;\r
- }\r
-};\r
+ __device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const \r
+ {\r
+ return rhs;\r
+ }\r
+ };\r
\r
// Min/Max Operations\r
\r
__device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \\r
};\r
\r
-template <typename T> struct maximum : binary_function<T, T, T>\r
-{\r
- __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const \r
+ template <typename T> struct maximum : binary_function<T, T, T>\r
{\r
- return lhs < rhs ? rhs : lhs;\r
- }\r
-};\r
-OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)\r
-OPENCV_GPU_IMPLEMENT_MINMAX(maximum, schar, ::max)\r
-OPENCV_GPU_IMPLEMENT_MINMAX(maximum, char, ::max)\r
-OPENCV_GPU_IMPLEMENT_MINMAX(maximum, ushort, ::max)\r
-OPENCV_GPU_IMPLEMENT_MINMAX(maximum, short, ::max)\r
-OPENCV_GPU_IMPLEMENT_MINMAX(maximum, int, ::max)\r
-OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uint, ::max)\r
-OPENCV_GPU_IMPLEMENT_MINMAX(maximum, float, ::fmax)\r
-OPENCV_GPU_IMPLEMENT_MINMAX(maximum, double, ::fmax)\r
-\r
-template <typename T> struct minimum : binary_function<T, T, T>\r
-{\r
- __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const \r
- {\r
- return lhs < rhs ? lhs : rhs;\r
- }\r
-};\r
-OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)\r
-OPENCV_GPU_IMPLEMENT_MINMAX(minimum, schar, ::min)\r
-OPENCV_GPU_IMPLEMENT_MINMAX(minimum, char, ::min)\r
-OPENCV_GPU_IMPLEMENT_MINMAX(minimum, ushort, ::min)\r
-OPENCV_GPU_IMPLEMENT_MINMAX(minimum, short, ::min)\r
-OPENCV_GPU_IMPLEMENT_MINMAX(minimum, int, ::min)\r
-OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uint, ::min)\r
-OPENCV_GPU_IMPLEMENT_MINMAX(minimum, float, ::fmin)\r
-OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, ::fmin)\r
+ __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const \r
+ {\r
+ return lhs < rhs ? rhs : lhs;\r
+ }\r
+ };\r
+\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(maximum, schar, ::max)\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(maximum, char, ::max)\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(maximum, ushort, ::max)\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(maximum, short, ::max)\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(maximum, int, ::max)\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uint, ::max)\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(maximum, float, ::fmax)\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(maximum, double, ::fmax)\r
+\r
+ template <typename T> struct minimum : binary_function<T, T, T>\r
+ {\r
+ __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const \r
+ {\r
+ return lhs < rhs ? lhs : rhs;\r
+ }\r
+ };\r
+\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(minimum, schar, ::min)\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(minimum, char, ::min)\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(minimum, ushort, ::min)\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(minimum, short, ::min)\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(minimum, int, ::min)\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uint, ::min)\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(minimum, float, ::fmin)\r
+ OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, ::fmin)\r
\r
#undef OPENCV_GPU_IMPLEMENT_MINMAX\r
\r
return func(v); \\r
} \\r
};\r
+\r
#define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \\r
template <typename T> struct name ## _func : binary_function<T, T, float> \\r
{ \\r
} \\r
};\r
\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp10, ::exp10)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log, ::log)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log2, ::log2)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log10, ::log10)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sin, ::sin)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cos, ::cos)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tan, ::tan)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asin, ::asin)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acos, ::acos)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atan, ::atan)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sinh, ::sinh)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cosh, ::cosh)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tanh, ::tanh)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asinh, ::asinh)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acosh, ::acosh)\r
-OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atanh, ::atanh)\r
-\r
-OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(hypot, ::hypot)\r
-OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(atan2, ::atan2)\r
-OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(pow, ::pow)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR\r
-#undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR\r
-\r
-template<typename T> struct hypot_sqr_func : binary_function<T, T, float> \r
-{\r
- __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const\r
- {\r
- return src1 * src1 + src2 * src2;\r
- }\r
-};\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp10, ::exp10)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log, ::log)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log2, ::log2)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log10, ::log10)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sin, ::sin)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cos, ::cos)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tan, ::tan)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asin, ::asin)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acos, ::acos)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atan, ::atan)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sinh, ::sinh)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cosh, ::cosh)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tanh, ::tanh)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asinh, ::asinh)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acosh, ::acosh)\r
+ OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atanh, ::atanh)\r
+\r
+ OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(hypot, ::hypot)\r
+ OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(atan2, ::atan2)\r
+ OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(pow, ::pow)\r
+\r
+ #undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR\r
+ #undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR\r
+\r
+ template<typename T> struct hypot_sqr_func : binary_function<T, T, float> \r
+ {\r
+ __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const\r
+ {\r
+ return src1 * src1 + src2 * src2;\r
+ }\r
+ };\r
\r
-// Saturate Cast Functor\r
+ // Saturate Cast Functor\r
\r
-template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>\r
-{\r
- __device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType v) const\r
+ template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>\r
{\r
- return saturate_cast<D>(v);\r
- }\r
-};\r
-\r
-// Threshold Functors\r
+ __device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType v) const\r
+ {\r
+ return saturate_cast<D>(v);\r
+ }\r
+ };\r
\r
-template <typename T> struct thresh_binary_func : unary_function<T, T>\r
-{\r
- __host__ __device__ __forceinline__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}\r
+ // Threshold Functors\r
\r
- __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
+ template <typename T> struct thresh_binary_func : unary_function<T, T>\r
{\r
- return (src > thresh) * maxVal;\r
- }\r
+ __host__ __device__ __forceinline__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}\r
\r
- const T thresh;\r
- const T maxVal;\r
-};\r
-template <typename T> struct thresh_binary_inv_func : unary_function<T, T>\r
-{\r
- __host__ __device__ __forceinline__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}\r
+ __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
+ {\r
+ return (src > thresh) * maxVal;\r
+ }\r
\r
- __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
+ const T thresh;\r
+ const T maxVal;\r
+ };\r
+ template <typename T> struct thresh_binary_inv_func : unary_function<T, T>\r
{\r
- return (src <= thresh) * maxVal;\r
- }\r
+ __host__ __device__ __forceinline__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}\r
\r
- const T thresh;\r
- const T maxVal;\r
-};\r
-template <typename T> struct thresh_trunc_func : unary_function<T, T>\r
-{\r
- explicit __host__ __device__ __forceinline__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}\r
+ __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
+ {\r
+ return (src <= thresh) * maxVal;\r
+ }\r
\r
- __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
+ const T thresh;\r
+ const T maxVal;\r
+ };\r
+ template <typename T> struct thresh_trunc_func : unary_function<T, T>\r
{\r
- return minimum<T>()(src, thresh);\r
- }\r
+ explicit __host__ __device__ __forceinline__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}\r
\r
- const T thresh;\r
-};\r
-template <typename T> struct thresh_to_zero_func : unary_function<T, T>\r
-{\r
- explicit __host__ __device__ __forceinline__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}\r
+ __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
+ {\r
+ return minimum<T>()(src, thresh);\r
+ }\r
\r
- __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
+ const T thresh;\r
+ };\r
+ template <typename T> struct thresh_to_zero_func : unary_function<T, T>\r
{\r
- return (src > thresh) * src;\r
- }\r
+ explicit __host__ __device__ __forceinline__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}\r
\r
- const T thresh;\r
-};\r
-template <typename T> struct thresh_to_zero_inv_func : unary_function<T, T>\r
-{\r
- explicit __host__ __device__ __forceinline__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}\r
+ __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
+ {\r
+ return (src > thresh) * src;\r
+ }\r
\r
- __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
+ const T thresh;\r
+ };\r
+ template <typename T> struct thresh_to_zero_inv_func : unary_function<T, T>\r
{\r
- return (src <= thresh) * src;\r
- }\r
+ explicit __host__ __device__ __forceinline__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}\r
\r
- const T thresh;\r
-};\r
+ __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
+ {\r
+ return (src <= thresh) * src;\r
+ }\r
\r
-// Function Object Adaptors\r
-\r
-template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool>\r
-{\r
- explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}\r
+ const T thresh;\r
+ };\r
\r
- __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const\r
- { \r
- return !pred(x); \r
- }\r
+ // Function Object Adaptors\r
\r
- const Predicate pred;\r
-};\r
-template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)\r
-{\r
- return unary_negate<Predicate>(pred);\r
-}\r
+ template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool>\r
+ {\r
+ explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}\r
\r
-template <typename Predicate> struct binary_negate : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>\r
-{\r
- explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}\r
+ __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const\r
+ { \r
+ return !pred(x); \r
+ }\r
\r
- __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const\r
- { \r
- return !pred(x,y); \r
+ const Predicate pred;\r
+ };\r
+ template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)\r
+ {\r
+ return unary_negate<Predicate>(pred);\r
}\r
\r
- const Predicate pred;\r
-};\r
-template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)\r
-{\r
- return binary_negate<BinaryPredicate>(pred);\r
-}\r
+ template <typename Predicate> struct binary_negate : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>\r
+ {\r
+ explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}\r
\r
-template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type> \r
-{\r
- __host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}\r
+ __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const\r
+ { \r
+ return !pred(x,y); \r
+ }\r
\r
- __device__ __forceinline__ typename Op::result_type operator ()(typename TypeTraits<typename Op::second_argument_type>::ParameterType a) const\r
+ const Predicate pred;\r
+ };\r
+ template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)\r
{\r
- return op(arg1, a);\r
+ return binary_negate<BinaryPredicate>(pred);\r
}\r
\r
- const Op op;\r
- const typename Op::first_argument_type arg1;\r
-};\r
-template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)\r
-{\r
- return binder1st<Op>(op, typename Op::first_argument_type(x));\r
-}\r
+ template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type> \r
+ {\r
+ __host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}\r
\r
-template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type> \r
-{\r
- __host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}\r
+ __device__ __forceinline__ typename Op::result_type operator ()(typename TypeTraits<typename Op::second_argument_type>::ParameterType a) const\r
+ {\r
+ return op(arg1, a);\r
+ }\r
\r
- __forceinline__ __device__ typename Op::result_type operator ()(typename TypeTraits<typename Op::first_argument_type>::ParameterType a) const\r
+ const Op op;\r
+ const typename Op::first_argument_type arg1;\r
+ };\r
+ template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)\r
{\r
- return op(a, arg2);\r
+ return binder1st<Op>(op, typename Op::first_argument_type(x));\r
}\r
\r
- const Op op;\r
- const typename Op::second_argument_type arg2;\r
-};\r
-template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)\r
-{\r
- return binder2nd<Op>(op, typename Op::second_argument_type(x));\r
-}\r
+ template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type> \r
+ {\r
+ __host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}\r
\r
-// Functor Traits\r
+ __forceinline__ __device__ typename Op::result_type operator ()(typename TypeTraits<typename Op::first_argument_type>::ParameterType a) const\r
+ {\r
+ return op(a, arg2);\r
+ }\r
\r
-template <typename F> struct IsUnaryFunction\r
-{\r
- typedef char Yes;\r
- struct No {Yes a[2];};\r
+ const Op op;\r
+ const typename Op::second_argument_type arg2;\r
+ };\r
+ template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)\r
+ {\r
+ return binder2nd<Op>(op, typename Op::second_argument_type(x));\r
+ }\r
\r
- template <typename T, typename D> static Yes check(unary_function<T, D>);\r
- static No check(...);\r
+ // Functor Traits\r
\r
- static F makeF();\r
+ template <typename F> struct IsUnaryFunction\r
+ {\r
+ typedef char Yes;\r
+ struct No {Yes a[2];};\r
\r
- enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };\r
-};\r
+ template <typename T, typename D> static Yes check(unary_function<T, D>);\r
+ static No check(...);\r
\r
-template <typename F> struct IsBinaryFunction\r
-{\r
- typedef char Yes;\r
- struct No {Yes a[2];};\r
+ static F makeF();\r
\r
- template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>);\r
- static No check(...);\r
+ enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };\r
+ };\r
\r
- static F makeF();\r
+ template <typename F> struct IsBinaryFunction\r
+ {\r
+ typedef char Yes;\r
+ struct No {Yes a[2];};\r
\r
- enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };\r
-};\r
+ template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>);\r
+ static No check(...);\r
\r
-namespace detail\r
-{\r
- template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; };\r
- template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; };\r
- template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; };\r
+ static F makeF();\r
\r
- template <typename T, typename D> struct DefaultUnaryShift\r
- {\r
- enum { shift = detail::UnOpShift<sizeof(T), sizeof(D)>::shift };\r
+ enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };\r
};\r
- \r
- template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };\r
- template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };\r
- template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };\r
\r
- template <typename T1, typename T2, typename D> struct DefaultBinaryShift\r
+ namespace functional_detail\r
{\r
- enum { shift = detail::BinOpShift<sizeof(T1), sizeof(T2), sizeof(D)>::shift };\r
- };\r
+ template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; };\r
+ template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; };\r
+ template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; };\r
\r
- template <typename Func, bool unary = IsUnaryFunction<Func>::value> struct ShiftDispatcher;\r
- template <typename Func> struct ShiftDispatcher<Func, true>\r
- {\r
- enum { shift = DefaultUnaryShift<typename Func::argument_type, typename Func::result_type>::shift };\r
- };\r
- template <typename Func> struct ShiftDispatcher<Func, false>\r
+ template <typename T, typename D> struct DefaultUnaryShift\r
+ {\r
+ enum { shift = UnOpShift<sizeof(T), sizeof(D)>::shift };\r
+ };\r
+ \r
+ template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };\r
+ template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };\r
+ template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };\r
+\r
+ template <typename T1, typename T2, typename D> struct DefaultBinaryShift\r
+ {\r
+ enum { shift = BinOpShift<sizeof(T1), sizeof(T2), sizeof(D)>::shift };\r
+ };\r
+\r
+ template <typename Func, bool unary = IsUnaryFunction<Func>::value> struct ShiftDispatcher;\r
+ template <typename Func> struct ShiftDispatcher<Func, true>\r
+ {\r
+ enum { shift = DefaultUnaryShift<typename Func::argument_type, typename Func::result_type>::shift };\r
+ };\r
+ template <typename Func> struct ShiftDispatcher<Func, false>\r
+ {\r
+ enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift };\r
+ };\r
+ }\r
+\r
+ template <typename Func> struct DefaultTransformShift\r
{\r
- enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift };\r
+ enum { shift = functional_detail::ShiftDispatcher<Func>::shift };\r
};\r
-}\r
-\r
-template <typename Func> struct DefaultTransformShift\r
-{\r
- enum { shift = detail::ShiftDispatcher<Func>::shift };\r
-};\r
\r
-template <typename Func> struct DefaultTransformFunctorTraits\r
-{\r
- enum { simple_block_dim_x = 16 };\r
- enum { simple_block_dim_y = 16 };\r
+ template <typename Func> struct DefaultTransformFunctorTraits\r
+ {\r
+ enum { simple_block_dim_x = 16 };\r
+ enum { simple_block_dim_y = 16 };\r
\r
- enum { smart_block_dim_x = 16 };\r
- enum { smart_block_dim_y = 16 };\r
- enum { smart_shift = DefaultTransformShift<Func>::shift };\r
-};\r
+ enum { smart_block_dim_x = 16 };\r
+ enum { smart_block_dim_y = 16 };\r
+ enum { smart_shift = DefaultTransformShift<Func>::shift };\r
+ };\r
\r
-template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {};\r
+ template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {};\r
\r
-#define DEFINE_TRANSFORM_FUNCTOR_TRAITS(type) \\r
+#define OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(type) \\r
template <> struct TransformFunctorTraits< type > : DefaultTransformFunctorTraits< type >\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif // __OPENCV_GPU_FUNCTIONAL_HPP__\r
#ifndef __OPENCV_GPU_LIMITS_GPU_HPP__\r
#define __OPENCV_GPU_LIMITS_GPU_HPP__\r
\r
+#include <limits>\r
#include "internal_shared.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template<class T> struct numeric_limits\r
-{\r
- typedef T type;\r
- __device__ __forceinline__ static type min() { return type(); };\r
- __device__ __forceinline__ static type max() { return type(); };\r
- __device__ __forceinline__ static type epsilon() { return type(); }\r
- __device__ __forceinline__ static type round_error() { return type(); }\r
- __device__ __forceinline__ static type denorm_min() { return type(); }\r
- __device__ __forceinline__ static type infinity() { return type(); }\r
- __device__ __forceinline__ static type quiet_NaN() { return type(); }\r
- __device__ __forceinline__ static type signaling_NaN() { return T(); }\r
- static const bool is_signed;\r
-};\r
-\r
-template<> struct numeric_limits<bool>\r
-{\r
- typedef bool type;\r
- __device__ __forceinline__ static type min() { return false; };\r
- __device__ __forceinline__ static type max() { return true; };\r
- __device__ __forceinline__ static type epsilon();\r
- __device__ __forceinline__ static type round_error();\r
- __device__ __forceinline__ static type denorm_min();\r
- __device__ __forceinline__ static type infinity();\r
- __device__ __forceinline__ static type quiet_NaN();\r
- __device__ __forceinline__ static type signaling_NaN();\r
- static const bool is_signed = false;\r
-};\r
-\r
-template<> struct numeric_limits<char>\r
-{\r
- typedef char type;\r
- __device__ __forceinline__ static type min() { return CHAR_MIN; };\r
- __device__ __forceinline__ static type max() { return CHAR_MAX; };\r
- __device__ __forceinline__ static type epsilon();\r
- __device__ __forceinline__ static type round_error();\r
- __device__ __forceinline__ static type denorm_min();\r
- __device__ __forceinline__ static type infinity();\r
- __device__ __forceinline__ static type quiet_NaN();\r
- __device__ __forceinline__ static type signaling_NaN();\r
- static const bool is_signed = (char)-1 == -1;\r
-};\r
-\r
- template<> struct numeric_limits<signed char>\r
-{\r
- typedef char type;\r
- __device__ __forceinline__ static type min() { return CHAR_MIN; };\r
- __device__ __forceinline__ static type max() { return CHAR_MAX; };\r
- __device__ __forceinline__ static type epsilon();\r
- __device__ __forceinline__ static type round_error();\r
- __device__ __forceinline__ static type denorm_min();\r
- __device__ __forceinline__ static type infinity();\r
- __device__ __forceinline__ static type quiet_NaN();\r
- __device__ __forceinline__ static type signaling_NaN();\r
- static const bool is_signed = (signed char)-1 == -1;\r
-};\r
-\r
-template<> struct numeric_limits<unsigned char>\r
-{\r
- typedef unsigned char type;\r
- __device__ __forceinline__ static type min() { return 0; };\r
- __device__ __forceinline__ static type max() { return UCHAR_MAX; };\r
- __device__ __forceinline__ static type epsilon();\r
- __device__ __forceinline__ static type round_error();\r
- __device__ __forceinline__ static type denorm_min();\r
- __device__ __forceinline__ static type infinity();\r
- __device__ __forceinline__ static type quiet_NaN();\r
- __device__ __forceinline__ static type signaling_NaN();\r
- static const bool is_signed = false;\r
-};\r
-\r
-template<> struct numeric_limits<short>\r
-{\r
- typedef short type;\r
- __device__ __forceinline__ static type min() { return SHRT_MIN; };\r
- __device__ __forceinline__ static type max() { return SHRT_MAX; };\r
- __device__ __forceinline__ static type epsilon();\r
- __device__ __forceinline__ static type round_error();\r
- __device__ __forceinline__ static type denorm_min();\r
- __device__ __forceinline__ static type infinity();\r
- __device__ __forceinline__ static type quiet_NaN();\r
- __device__ __forceinline__ static type signaling_NaN();\r
- static const bool is_signed = true;\r
-};\r
-\r
-template<> struct numeric_limits<unsigned short>\r
-{\r
- typedef unsigned short type;\r
- __device__ __forceinline__ static type min() { return 0; };\r
- __device__ __forceinline__ static type max() { return USHRT_MAX; };\r
- __device__ __forceinline__ static type epsilon();\r
- __device__ __forceinline__ static type round_error();\r
- __device__ __forceinline__ static type denorm_min();\r
- __device__ __forceinline__ static type infinity();\r
- __device__ __forceinline__ static type quiet_NaN();\r
- __device__ __forceinline__ static type signaling_NaN();\r
- static const bool is_signed = false;\r
-};\r
-\r
-template<> struct numeric_limits<int>\r
-{\r
- typedef int type;\r
- __device__ __forceinline__ static type min() { return INT_MIN; };\r
- __device__ __forceinline__ static type max() { return INT_MAX; };\r
- __device__ __forceinline__ static type epsilon();\r
- __device__ __forceinline__ static type round_error();\r
- __device__ __forceinline__ static type denorm_min();\r
- __device__ __forceinline__ static type infinity();\r
- __device__ __forceinline__ static type quiet_NaN();\r
- __device__ __forceinline__ static type signaling_NaN();\r
- static const bool is_signed = true;\r
-};\r
-\r
-\r
-template<> struct numeric_limits<unsigned int>\r
-{\r
- typedef unsigned int type;\r
- __device__ __forceinline__ static type min() { return 0; };\r
- __device__ __forceinline__ static type max() { return UINT_MAX; };\r
- __device__ __forceinline__ static type epsilon();\r
- __device__ __forceinline__ static type round_error();\r
- __device__ __forceinline__ static type denorm_min();\r
- __device__ __forceinline__ static type infinity();\r
- __device__ __forceinline__ static type quiet_NaN();\r
- __device__ __forceinline__ static type signaling_NaN();\r
- static const bool is_signed = false;\r
-};\r
-\r
-template<> struct numeric_limits<long>\r
-{\r
- typedef long type;\r
- __device__ __forceinline__ static type min() { return LONG_MIN; };\r
- __device__ __forceinline__ static type max() { return LONG_MAX; };\r
- __device__ __forceinline__ static type epsilon();\r
- __device__ __forceinline__ static type round_error();\r
- __device__ __forceinline__ static type denorm_min();\r
- __device__ __forceinline__ static type infinity();\r
- __device__ __forceinline__ static type quiet_NaN();\r
- __device__ __forceinline__ static type signaling_NaN();\r
- static const bool is_signed = true;\r
-};\r
-\r
-template<> struct numeric_limits<unsigned long>\r
-{\r
- typedef unsigned long type;\r
- __device__ __forceinline__ static type min() { return 0; };\r
- __device__ __forceinline__ static type max() { return ULONG_MAX; };\r
- __device__ __forceinline__ static type epsilon();\r
- __device__ __forceinline__ static type round_error();\r
- __device__ __forceinline__ static type denorm_min();\r
- __device__ __forceinline__ static type infinity();\r
- __device__ __forceinline__ static type quiet_NaN();\r
- __device__ __forceinline__ static type signaling_NaN();\r
- static const bool is_signed = false;\r
-};\r
-\r
-template<> struct numeric_limits<float>\r
-{\r
- typedef float type;\r
- __device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };\r
- __device__ __forceinline__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; };\r
- __device__ __forceinline__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; };\r
- __device__ __forceinline__ static type round_error();\r
- __device__ __forceinline__ static type denorm_min();\r
- __device__ __forceinline__ static type infinity();\r
- __device__ __forceinline__ static type quiet_NaN();\r
- __device__ __forceinline__ static type signaling_NaN();\r
- static const bool is_signed = true;\r
-};\r
-\r
-template<> struct numeric_limits<double>\r
+namespace cv { namespace gpu { namespace device \r
{\r
- typedef double type;\r
- __device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; };\r
- __device__ __forceinline__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; };\r
- __device__ __forceinline__ static type epsilon();\r
- __device__ __forceinline__ static type round_error();\r
- __device__ __forceinline__ static type denorm_min();\r
- __device__ __forceinline__ static type infinity();\r
- __device__ __forceinline__ static type quiet_NaN();\r
- __device__ __forceinline__ static type signaling_NaN();\r
- static const bool is_signed = true;\r
-};\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template<class T> struct numeric_limits\r
+ {\r
+ typedef T type;\r
+ __device__ __forceinline__ static type min() { return type(); };\r
+ __device__ __forceinline__ static type max() { return type(); };\r
+ __device__ __forceinline__ static type epsilon() { return type(); }\r
+ __device__ __forceinline__ static type round_error() { return type(); }\r
+ __device__ __forceinline__ static type denorm_min() { return type(); }\r
+ __device__ __forceinline__ static type infinity() { return type(); }\r
+ __device__ __forceinline__ static type quiet_NaN() { return type(); }\r
+ __device__ __forceinline__ static type signaling_NaN() { return T(); }\r
+ static const bool is_signed;\r
+ };\r
+\r
+ template<> struct numeric_limits<bool>\r
+ {\r
+ typedef bool type;\r
+ __device__ __forceinline__ static type min() { return false; };\r
+ __device__ __forceinline__ static type max() { return true; };\r
+ __device__ __forceinline__ static type epsilon();\r
+ __device__ __forceinline__ static type round_error();\r
+ __device__ __forceinline__ static type denorm_min();\r
+ __device__ __forceinline__ static type infinity();\r
+ __device__ __forceinline__ static type quiet_NaN();\r
+ __device__ __forceinline__ static type signaling_NaN();\r
+ static const bool is_signed = false;\r
+ };\r
+\r
+ template<> struct numeric_limits<char>\r
+ {\r
+ typedef char type;\r
+ __device__ __forceinline__ static type min() { return CHAR_MIN; };\r
+ __device__ __forceinline__ static type max() { return CHAR_MAX; };\r
+ __device__ __forceinline__ static type epsilon();\r
+ __device__ __forceinline__ static type round_error();\r
+ __device__ __forceinline__ static type denorm_min();\r
+ __device__ __forceinline__ static type infinity();\r
+ __device__ __forceinline__ static type quiet_NaN();\r
+ __device__ __forceinline__ static type signaling_NaN();\r
+ static const bool is_signed = (char)-1 == -1;\r
+ };\r
+\r
+ template<> struct numeric_limits<signed char>\r
+ {\r
+ typedef char type;\r
+ __device__ __forceinline__ static type min() { return SCHAR_MIN; };\r
+ __device__ __forceinline__ static type max() { return SCHAR_MAX; };\r
+ __device__ __forceinline__ static type epsilon();\r
+ __device__ __forceinline__ static type round_error();\r
+ __device__ __forceinline__ static type denorm_min();\r
+ __device__ __forceinline__ static type infinity();\r
+ __device__ __forceinline__ static type quiet_NaN();\r
+ __device__ __forceinline__ static type signaling_NaN();\r
+ static const bool is_signed = (signed char)-1 == -1;\r
+ };\r
+\r
+ template<> struct numeric_limits<unsigned char>\r
+ {\r
+ typedef unsigned char type;\r
+ __device__ __forceinline__ static type min() { return 0; };\r
+ __device__ __forceinline__ static type max() { return UCHAR_MAX; };\r
+ __device__ __forceinline__ static type epsilon();\r
+ __device__ __forceinline__ static type round_error();\r
+ __device__ __forceinline__ static type denorm_min();\r
+ __device__ __forceinline__ static type infinity();\r
+ __device__ __forceinline__ static type quiet_NaN();\r
+ __device__ __forceinline__ static type signaling_NaN();\r
+ static const bool is_signed = false;\r
+ };\r
+\r
+ template<> struct numeric_limits<short>\r
+ {\r
+ typedef short type;\r
+ __device__ __forceinline__ static type min() { return SHRT_MIN; };\r
+ __device__ __forceinline__ static type max() { return SHRT_MAX; };\r
+ __device__ __forceinline__ static type epsilon();\r
+ __device__ __forceinline__ static type round_error();\r
+ __device__ __forceinline__ static type denorm_min();\r
+ __device__ __forceinline__ static type infinity();\r
+ __device__ __forceinline__ static type quiet_NaN();\r
+ __device__ __forceinline__ static type signaling_NaN();\r
+ static const bool is_signed = true;\r
+ };\r
+\r
+ template<> struct numeric_limits<unsigned short>\r
+ {\r
+ typedef unsigned short type;\r
+ __device__ __forceinline__ static type min() { return 0; };\r
+ __device__ __forceinline__ static type max() { return USHRT_MAX; };\r
+ __device__ __forceinline__ static type epsilon();\r
+ __device__ __forceinline__ static type round_error();\r
+ __device__ __forceinline__ static type denorm_min();\r
+ __device__ __forceinline__ static type infinity();\r
+ __device__ __forceinline__ static type quiet_NaN();\r
+ __device__ __forceinline__ static type signaling_NaN();\r
+ static const bool is_signed = false;\r
+ };\r
+\r
+ template<> struct numeric_limits<int>\r
+ {\r
+ typedef int type;\r
+ __device__ __forceinline__ static type min() { return INT_MIN; };\r
+ __device__ __forceinline__ static type max() { return INT_MAX; };\r
+ __device__ __forceinline__ static type epsilon();\r
+ __device__ __forceinline__ static type round_error();\r
+ __device__ __forceinline__ static type denorm_min();\r
+ __device__ __forceinline__ static type infinity();\r
+ __device__ __forceinline__ static type quiet_NaN();\r
+ __device__ __forceinline__ static type signaling_NaN();\r
+ static const bool is_signed = true;\r
+ };\r
+\r
+\r
+ template<> struct numeric_limits<unsigned int>\r
+ {\r
+ typedef unsigned int type;\r
+ __device__ __forceinline__ static type min() { return 0; };\r
+ __device__ __forceinline__ static type max() { return UINT_MAX; };\r
+ __device__ __forceinline__ static type epsilon();\r
+ __device__ __forceinline__ static type round_error();\r
+ __device__ __forceinline__ static type denorm_min();\r
+ __device__ __forceinline__ static type infinity();\r
+ __device__ __forceinline__ static type quiet_NaN();\r
+ __device__ __forceinline__ static type signaling_NaN();\r
+ static const bool is_signed = false;\r
+ };\r
+\r
+ template<> struct numeric_limits<long>\r
+ {\r
+ typedef long type;\r
+ __device__ __forceinline__ static type min() { return LONG_MIN; };\r
+ __device__ __forceinline__ static type max() { return LONG_MAX; };\r
+ __device__ __forceinline__ static type epsilon();\r
+ __device__ __forceinline__ static type round_error();\r
+ __device__ __forceinline__ static type denorm_min();\r
+ __device__ __forceinline__ static type infinity();\r
+ __device__ __forceinline__ static type quiet_NaN();\r
+ __device__ __forceinline__ static type signaling_NaN();\r
+ static const bool is_signed = true;\r
+ };\r
+\r
+ template<> struct numeric_limits<unsigned long>\r
+ {\r
+ typedef unsigned long type;\r
+ __device__ __forceinline__ static type min() { return 0; };\r
+ __device__ __forceinline__ static type max() { return ULONG_MAX; };\r
+ __device__ __forceinline__ static type epsilon();\r
+ __device__ __forceinline__ static type round_error();\r
+ __device__ __forceinline__ static type denorm_min();\r
+ __device__ __forceinline__ static type infinity();\r
+ __device__ __forceinline__ static type quiet_NaN();\r
+ __device__ __forceinline__ static type signaling_NaN();\r
+ static const bool is_signed = false;\r
+ };\r
+\r
+ template<> struct numeric_limits<float>\r
+ {\r
+ typedef float type;\r
+ __device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };\r
+ __device__ __forceinline__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; };\r
+ __device__ __forceinline__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; };\r
+ __device__ __forceinline__ static type round_error();\r
+ __device__ __forceinline__ static type denorm_min();\r
+ __device__ __forceinline__ static type infinity();\r
+ __device__ __forceinline__ static type quiet_NaN();\r
+ __device__ __forceinline__ static type signaling_NaN();\r
+ static const bool is_signed = true;\r
+ };\r
+\r
+ template<> struct numeric_limits<double>\r
+ {\r
+ typedef double type;\r
+ __device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; };\r
+ __device__ __forceinline__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; };\r
+ __device__ __forceinline__ static type epsilon();\r
+ __device__ __forceinline__ static type round_error();\r
+ __device__ __forceinline__ static type denorm_min();\r
+ __device__ __forceinline__ static type infinity();\r
+ __device__ __forceinline__ static type quiet_NaN();\r
+ __device__ __forceinline__ static type signaling_NaN();\r
+ static const bool is_signed = true;\r
+ };\r
+}}} // namespace cv { namespace gpu { namespace device {\r
\r
#endif // __OPENCV_GPU_LIMITS_GPU_HPP__\r
\r
#include "internal_shared.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }\r
-template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }\r
-template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }\r
-template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }\r
-template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }\r
-template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }\r
-template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }\r
-template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }\r
-\r
-template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)\r
-{ \r
- return (uchar) ::max((int)v, 0); \r
-}\r
-template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)\r
-{ \r
- return (uchar) ::min((uint)v, (uint)UCHAR_MAX); \r
-}\r
-template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)\r
-{ \r
- return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); \r
-}\r
-template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)\r
-{ \r
- return (uchar) ::min(v, (uint)UCHAR_MAX); \r
-}\r
-template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)\r
-{ \r
- return saturate_cast<uchar>((uint)v); \r
-}\r
-\r
-template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)\r
-{ \r
- int iv = __float2int_rn(v); \r
- return saturate_cast<uchar>(iv); \r
-}\r
-template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)\r
+namespace cv { namespace gpu { namespace device\r
{\r
-#if __CUDA_ARCH__ >= 130\r
- int iv = __double2int_rn(v); \r
- return saturate_cast<uchar>(iv);\r
-#else\r
- return saturate_cast<uchar>((float)v);\r
-#endif\r
-}\r
+ template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }\r
+ template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }\r
+ template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }\r
+ template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }\r
+ template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }\r
+ template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }\r
+ template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }\r
+ template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }\r
\r
-template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)\r
-{ \r
- return (schar) ::min((int)v, SCHAR_MAX); \r
-}\r
-template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)\r
-{ \r
- return (schar) ::min((uint)v, (uint)SCHAR_MAX); \r
-}\r
-template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)\r
-{\r
- return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);\r
-}\r
-template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)\r
-{ \r
- return saturate_cast<schar>((int)v); \r
-}\r
-template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)\r
-{ \r
- return (schar) ::min(v, (uint)SCHAR_MAX); \r
-}\r
+ template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)\r
+ { \r
+ return (uchar) ::max((int)v, 0); \r
+ }\r
+ template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)\r
+ { \r
+ return (uchar) ::min((uint)v, (uint)UCHAR_MAX); \r
+ }\r
+ template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)\r
+ { \r
+ return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); \r
+ }\r
+ template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)\r
+ { \r
+ return (uchar) ::min(v, (uint)UCHAR_MAX); \r
+ }\r
+ template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)\r
+ { \r
+ return saturate_cast<uchar>((uint)v); \r
+ }\r
\r
-template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)\r
-{ \r
- int iv = __float2int_rn(v); \r
- return saturate_cast<schar>(iv); \r
-}\r
-template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)\r
-{ \r
-#if __CUDA_ARCH__ >= 130\r
- int iv = __double2int_rn(v); \r
- return saturate_cast<schar>(iv);\r
-#else\r
- return saturate_cast<schar>((float)v);\r
-#endif\r
-}\r
+ template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)\r
+ { \r
+ int iv = __float2int_rn(v); \r
+ return saturate_cast<uchar>(iv); \r
+ }\r
+ template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)\r
+ {\r
+ #if __CUDA_ARCH__ >= 130\r
+ int iv = __double2int_rn(v); \r
+ return saturate_cast<uchar>(iv);\r
+ #else\r
+ return saturate_cast<uchar>((float)v);\r
+ #endif\r
+ }\r
\r
-template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)\r
-{ \r
- return (ushort) ::max((int)v, 0); \r
-}\r
-template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)\r
-{ \r
- return (ushort) ::max((int)v, 0); \r
-}\r
-template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)\r
-{ \r
- return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); \r
-}\r
-template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)\r
-{ \r
- return (ushort) ::min(v, (uint)USHRT_MAX); \r
-}\r
-template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)\r
-{\r
- int iv = __float2int_rn(v); \r
- return saturate_cast<ushort>(iv); \r
-}\r
-template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)\r
-{ \r
-#if __CUDA_ARCH__ >= 130\r
- int iv = __double2int_rn(v); \r
- return saturate_cast<ushort>(iv);\r
-#else\r
- return saturate_cast<ushort>((float)v);\r
-#endif\r
-}\r
+ template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)\r
+ { \r
+ return (schar) ::min((int)v, SCHAR_MAX); \r
+ }\r
+ template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)\r
+ { \r
+ return (schar) ::min((uint)v, (uint)SCHAR_MAX); \r
+ }\r
+ template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)\r
+ {\r
+ return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);\r
+ }\r
+ template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)\r
+ { \r
+ return saturate_cast<schar>((int)v); \r
+ }\r
+ template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)\r
+ { \r
+ return (schar) ::min(v, (uint)SCHAR_MAX); \r
+ }\r
\r
-template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)\r
-{ \r
- return (short) ::min((int)v, SHRT_MAX); \r
-}\r
-template<> __device__ __forceinline__ short saturate_cast<short>(int v)\r
-{\r
- return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);\r
-}\r
-template<> __device__ __forceinline__ short saturate_cast<short>(uint v)\r
-{ \r
- return (short) ::min(v, (uint)SHRT_MAX); \r
-}\r
-template<> __device__ __forceinline__ short saturate_cast<short>(float v)\r
-{ \r
- int iv = __float2int_rn(v); \r
- return saturate_cast<short>(iv); \r
-}\r
-template<> __device__ __forceinline__ short saturate_cast<short>(double v)\r
-{ \r
-#if __CUDA_ARCH__ >= 130\r
- int iv = __double2int_rn(v); \r
- return saturate_cast<short>(iv);\r
-#else\r
- return saturate_cast<short>((float)v);\r
-#endif\r
-}\r
+ template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)\r
+ { \r
+ int iv = __float2int_rn(v); \r
+ return saturate_cast<schar>(iv); \r
+ }\r
+ template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)\r
+ { \r
+ #if __CUDA_ARCH__ >= 130\r
+ int iv = __double2int_rn(v); \r
+ return saturate_cast<schar>(iv);\r
+ #else\r
+ return saturate_cast<schar>((float)v);\r
+ #endif\r
+ }\r
\r
-template<> __device__ __forceinline__ int saturate_cast<int>(float v) \r
-{ \r
- return __float2int_rn(v); \r
-}\r
-template<> __device__ __forceinline__ int saturate_cast<int>(double v) \r
-{\r
-#if __CUDA_ARCH__ >= 130 \r
- return __double2int_rn(v);\r
-#else\r
- return saturate_cast<int>((float)v);\r
-#endif\r
-}\r
+ template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)\r
+ { \r
+ return (ushort) ::max((int)v, 0); \r
+ }\r
+ template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)\r
+ { \r
+ return (ushort) ::max((int)v, 0); \r
+ }\r
+ template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)\r
+ { \r
+ return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); \r
+ }\r
+ template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)\r
+ { \r
+ return (ushort) ::min(v, (uint)USHRT_MAX); \r
+ }\r
+ template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)\r
+ {\r
+ int iv = __float2int_rn(v); \r
+ return saturate_cast<ushort>(iv); \r
+ }\r
+ template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)\r
+ { \r
+ #if __CUDA_ARCH__ >= 130\r
+ int iv = __double2int_rn(v); \r
+ return saturate_cast<ushort>(iv);\r
+ #else\r
+ return saturate_cast<ushort>((float)v);\r
+ #endif\r
+ }\r
+\r
+ template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)\r
+ { \r
+ return (short) ::min((int)v, SHRT_MAX); \r
+ }\r
+ template<> __device__ __forceinline__ short saturate_cast<short>(int v)\r
+ {\r
+ return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);\r
+ }\r
+ template<> __device__ __forceinline__ short saturate_cast<short>(uint v)\r
+ { \r
+ return (short) ::min(v, (uint)SHRT_MAX); \r
+ }\r
+ template<> __device__ __forceinline__ short saturate_cast<short>(float v)\r
+ { \r
+ int iv = __float2int_rn(v); \r
+ return saturate_cast<short>(iv); \r
+ }\r
+ template<> __device__ __forceinline__ short saturate_cast<short>(double v)\r
+ { \r
+ #if __CUDA_ARCH__ >= 130\r
+ int iv = __double2int_rn(v); \r
+ return saturate_cast<short>(iv);\r
+ #else\r
+ return saturate_cast<short>((float)v);\r
+ #endif\r
+ }\r
\r
-template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)\r
-{ \r
- return __float2uint_rn(v); \r
-}\r
-template<> __device__ __forceinline__ uint saturate_cast<uint>(double v) \r
-{ \r
-#if __CUDA_ARCH__ >= 130\r
- return __double2uint_rn(v);\r
-#else\r
- return saturate_cast<uint>((float)v);\r
-#endif\r
-}\r
+ template<> __device__ __forceinline__ int saturate_cast<int>(float v) \r
+ { \r
+ return __float2int_rn(v); \r
+ }\r
+ template<> __device__ __forceinline__ int saturate_cast<int>(double v) \r
+ {\r
+ #if __CUDA_ARCH__ >= 130 \r
+ return __double2int_rn(v);\r
+ #else\r
+ return saturate_cast<int>((float)v);\r
+ #endif\r
+ }\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)\r
+ { \r
+ return __float2uint_rn(v); \r
+ }\r
+ template<> __device__ __forceinline__ uint saturate_cast<uint>(double v) \r
+ { \r
+ #if __CUDA_ARCH__ >= 130\r
+ return __double2uint_rn(v);\r
+ #else\r
+ return saturate_cast<uint>((float)v);\r
+ #endif\r
+ }\r
+}}}\r
\r
#endif /* __OPENCV_GPU_SATURATE_CAST_HPP__ */
\ No newline at end of file
#define __OPENCV_GPU_HOST_DEVICE__\r
#endif \r
\r
-namespace cv\r
-{\r
- namespace gpu\r
+namespace cv { namespace gpu \r
+{ \r
+ namespace device\r
{\r
- namespace device\r
- {\r
- template<bool expr> struct Static {};\r
- \r
- template<> struct Static<true> \r
- { \r
- __OPENCV_GPU_HOST_DEVICE__ static void check() {}; \r
- };\r
- } \r
+ template<bool expr> struct Static {};\r
+ \r
+ template<> struct Static<true> \r
+ { \r
+ __OPENCV_GPU_HOST_DEVICE__ static void check() {}; \r
+ };\r
+ } \r
\r
- using cv::gpu::device::Static;\r
- }\r
-}\r
+ using ::cv::gpu::device::Static;\r
+}}\r
\r
-#undef __PCL_GPU_HOST_DEVICE__\r
+#undef __OPENCV_GPU_HOST_DEVICE__\r
\r
#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
\ No newline at end of file
#include "utility.hpp"\r
#include "detail/transform_detail.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template <typename T, typename D, typename UnOp>\r
-void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, cudaStream_t stream = 0)\r
-{\r
- detail::transform_caller(src, dst, op, WithOutMask(), stream);\r
-}\r
-template <typename T, typename D, typename UnOp>\r
-void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStepb& mask, const UnOp& op, cudaStream_t stream = 0)\r
+namespace cv { namespace gpu { namespace device \r
{\r
- detail::transform_caller(src, dst, op, SingleMask(mask), stream);\r
-}\r
+ template <typename T, typename D, typename UnOp>\r
+ void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, cudaStream_t stream = 0)\r
+ {\r
+ transform_detail::transform_caller(src, dst, op, WithOutMask(), stream);\r
+ }\r
\r
-template <typename T1, typename T2, typename D, typename BinOp>\r
-void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, cudaStream_t stream = 0)\r
-{\r
- detail::transform_caller(src1, src2, dst, op, WithOutMask(), stream);\r
-}\r
-template <typename T1, typename T2, typename D, typename BinOp>\r
-void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const PtrStepb& mask, const BinOp& op, cudaStream_t stream = 0)\r
-{\r
- detail::transform_caller(src1, src2, dst, op, SingleMask(mask), stream);\r
-}\r
+ template <typename T, typename D, typename UnOp>\r
+ void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStepb& mask, const UnOp& op, cudaStream_t stream = 0)\r
+ {\r
+ transform_detail::transform_caller(src, dst, op, SingleMask(mask), stream);\r
+ }\r
+\r
+ template <typename T1, typename T2, typename D, typename BinOp>\r
+ void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, cudaStream_t stream = 0)\r
+ {\r
+ transform_detail::transform_caller(src1, src2, dst, op, WithOutMask(), stream);\r
+ }\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template <typename T1, typename T2, typename D, typename BinOp>\r
+ void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const PtrStepb& mask, const BinOp& op, cudaStream_t stream = 0)\r
+ {\r
+ transform_detail::transform_caller(src1, src2, dst, op, SingleMask(mask), stream);\r
+ }\r
+}}}\r
\r
#endif // __OPENCV_GPU_TRANSFORM_HPP__\r
#include "internal_shared.hpp"\r
#include "detail/type_traits_detail.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template <typename T> struct IsSimpleParameter\r
-{\r
- enum {value = detail::IsIntegral<T>::value || detail::IsFloat<T>::value || detail::PointerTraits<typename detail::ReferenceTraits<T>::type>::value};\r
-};\r
-\r
-template <typename T> struct TypeTraits\r
+namespace cv { namespace gpu { namespace device \r
{\r
- typedef typename detail::UnConst<T>::type NonConstType;\r
- typedef typename detail::UnVolatile<T>::type NonVolatileType;\r
- typedef typename detail::UnVolatile<typename detail::UnConst<T>::type>::type UnqualifiedType;\r
- typedef typename detail::PointerTraits<UnqualifiedType>::type PointeeType;\r
- typedef typename detail::ReferenceTraits<T>::type ReferredType;\r
+ template <typename T> struct IsSimpleParameter\r
+ {\r
+ enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value || \r
+ type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<T>::type>::value};\r
+ };\r
\r
- enum { isConst = detail::UnConst<T>::value };\r
- enum { isVolatile = detail::UnVolatile<T>::value };\r
+ template <typename T> struct TypeTraits\r
+ {\r
+ typedef typename type_traits_detail::UnConst<T>::type NonConstType;\r
+ typedef typename type_traits_detail::UnVolatile<T>::type NonVolatileType;\r
+ typedef typename type_traits_detail::UnVolatile<typename type_traits_detail::UnConst<T>::type>::type UnqualifiedType;\r
+ typedef typename type_traits_detail::PointerTraits<UnqualifiedType>::type PointeeType;\r
+ typedef typename type_traits_detail::ReferenceTraits<T>::type ReferredType;\r
\r
- enum { isReference = detail::ReferenceTraits<UnqualifiedType>::value };\r
- enum { isPointer = detail::PointerTraits<typename detail::ReferenceTraits<UnqualifiedType>::type>::value }; \r
+ enum { isConst = type_traits_detail::UnConst<T>::value };\r
+ enum { isVolatile = type_traits_detail::UnVolatile<T>::value };\r
\r
- enum { isUnsignedInt = detail::IsUnsignedIntegral<UnqualifiedType>::value };\r
- enum { isSignedInt = detail::IsSignedIntergral<UnqualifiedType>::value };\r
- enum { isIntegral = detail::IsIntegral<UnqualifiedType>::value };\r
- enum { isFloat = detail::IsFloat<UnqualifiedType>::value };\r
- enum { isArith = isIntegral || isFloat };\r
- enum { isVec = detail::IsVec<UnqualifiedType>::value };\r
- \r
- typedef typename detail::Select<IsSimpleParameter<UnqualifiedType>::value, T, typename detail::AddParameterType<T>::type>::type ParameterType;\r
-};\r
+ enum { isReference = type_traits_detail::ReferenceTraits<UnqualifiedType>::value };\r
+ enum { isPointer = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value }; \r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };\r
+ enum { isSignedInt = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };\r
+ enum { isIntegral = type_traits_detail::IsIntegral<UnqualifiedType>::value };\r
+ enum { isFloat = type_traits_detail::IsFloat<UnqualifiedType>::value };\r
+ enum { isArith = isIntegral || isFloat };\r
+ enum { isVec = type_traits_detail::IsVec<UnqualifiedType>::value };\r
+ \r
+ typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value, \r
+ T, typename type_traits_detail::AddParameterType<T>::type>::type ParameterType;\r
+ };\r
+}}}\r
\r
#endif // __OPENCV_GPU_TYPE_TRAITS_HPP__\r
#include "datamov_utils.hpp"\r
#include "detail/utility_detail.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-#define OPENCV_GPU_LOG_WARP_SIZE (5)\r
-#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)\r
-#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla\r
-#define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS)\r
-\r
-///////////////////////////////////////////////////////////////////////////////\r
-// swap\r
-\r
-template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b) \r
+namespace cv { namespace gpu { namespace device \r
{\r
- const T temp = a;\r
- a = b;\r
- b = temp;\r
-}\r
+ #define OPENCV_GPU_LOG_WARP_SIZE (5)\r
+ #define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)\r
+ #define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla\r
+ #define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS)\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Mask Reader\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // swap\r
\r
-struct SingleMask\r
-{\r
- explicit __host__ __device__ __forceinline__ SingleMask(const PtrStepb& mask_) : mask(mask_) {}\r
- \r
- __device__ __forceinline__ bool operator()(int y, int x) const\r
- { \r
- return mask.ptr(y)[x] != 0;\r
+ template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b) \r
+ {\r
+ const T temp = a;\r
+ a = b;\r
+ b = temp;\r
}\r
\r
- const PtrStepb mask;\r
-};\r
-\r
-struct MaskCollection\r
-{\r
- explicit __host__ __device__ __forceinline__ MaskCollection(PtrStepb* maskCollection_) : maskCollection(maskCollection_) {}\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Mask Reader\r
\r
- __device__ __forceinline__ void next()\r
- {\r
- curMask = *maskCollection++;\r
- }\r
- __device__ __forceinline__ void setMask(int z)\r
+ struct SingleMask\r
{\r
- curMask = maskCollection[z];\r
- }\r
- \r
- __device__ __forceinline__ bool operator()(int y, int x) const\r
- {\r
- uchar val;\r
- return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(y), x, val), (val != 0));\r
- }\r
+ explicit __host__ __device__ __forceinline__ SingleMask(const PtrStepb& mask_) : mask(mask_) {}\r
+ \r
+ __device__ __forceinline__ bool operator()(int y, int x) const\r
+ { \r
+ return mask.ptr(y)[x] != 0;\r
+ }\r
\r
- const PtrStepb* maskCollection;\r
- PtrStepb curMask;\r
-};\r
+ const PtrStepb mask;\r
+ };\r
\r
-struct WithOutMask\r
-{\r
- __device__ __forceinline__ void next() const\r
+ struct MaskCollection\r
{\r
- }\r
- __device__ __forceinline__ void setMask(int) const\r
+ explicit __host__ __device__ __forceinline__ MaskCollection(PtrStepb* maskCollection_) : maskCollection(maskCollection_) {}\r
+\r
+ __device__ __forceinline__ void next()\r
+ {\r
+ curMask = *maskCollection++;\r
+ }\r
+ __device__ __forceinline__ void setMask(int z)\r
+ {\r
+ curMask = maskCollection[z];\r
+ }\r
+ \r
+ __device__ __forceinline__ bool operator()(int y, int x) const\r
+ {\r
+ uchar val;\r
+ return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(y), x, val), (val != 0));\r
+ }\r
+\r
+ const PtrStepb* maskCollection;\r
+ PtrStepb curMask;\r
+ };\r
+\r
+ struct WithOutMask\r
{\r
- }\r
-\r
- __device__ __forceinline__ bool operator()(int, int) const\r
+ __device__ __forceinline__ void next() const\r
+ {\r
+ }\r
+ __device__ __forceinline__ void setMask(int) const\r
+ {\r
+ }\r
+\r
+ __device__ __forceinline__ bool operator()(int, int) const\r
+ {\r
+ return true;\r
+ }\r
+\r
+ __device__ __forceinline__ bool operator()(int, int, int) const\r
+ {\r
+ return true;\r
+ }\r
+\r
+ static __device__ __forceinline__ bool check(int, int)\r
+ {\r
+ return true;\r
+ }\r
+\r
+ static __device__ __forceinline__ bool check(int, int, int)\r
+ {\r
+ return true;\r
+ }\r
+ };\r
+\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Reduction\r
+\r
+ template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
{\r
- return true;\r
+ StaticAssert<n >= 8 && n <= 512>::check();\r
+ utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);\r
}\r
\r
- __device__ __forceinline__ bool operator()(int, int, int) const\r
+ template <int n, typename T, typename V, typename Pred> \r
+ __device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)\r
{\r
- return true;\r
+ StaticAssert<n >= 8 && n <= 512>::check();\r
+ utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);\r
}\r
\r
- static __device__ __forceinline__ bool check(int, int)\r
+ template <int n, typename T, typename V1, typename V2, typename Pred> \r
+ __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)\r
{\r
- return true;\r
+ StaticAssert<n >= 8 && n <= 512>::check();\r
+ utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);\r
}\r
-\r
- static __device__ __forceinline__ bool check(int, int, int)\r
- {\r
- return true;\r
- }\r
-};\r
-\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Reduction\r
-\r
-template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
-{\r
- StaticAssert<n >= 8 && n <= 512>::check();\r
- detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);\r
-}\r
-\r
-template <int n, typename T, typename V, typename Pred> \r
-__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)\r
-{\r
- StaticAssert<n >= 8 && n <= 512>::check();\r
- detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);\r
-}\r
-\r
-template <int n, typename T, typename V1, typename V2, typename Pred> \r
-__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)\r
-{\r
- StaticAssert<n >= 8 && n <= 512>::check();\r
- detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);\r
-}\r
\r
-///////////////////////////////////////////////////////////////////////////////\r
-// Solve linear system\r
-\r
-// solve 2x2 linear system Ax=b\r
-template <typename T> __device__ __forceinline__ bool solve2x2(const T A[2][2], const T b[2], T x[2])\r
-{\r
- T det = A[0][0] * A[1][1] - A[1][0] * A[0][1];\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Solve linear system\r
\r
- if (det != 0)\r
+ // solve 2x2 linear system Ax=b\r
+ template <typename T> __device__ __forceinline__ bool solve2x2(const T A[2][2], const T b[2], T x[2])\r
{\r
- double invdet = 1.0 / det;\r
+ T det = A[0][0] * A[1][1] - A[1][0] * A[0][1];\r
\r
- x[0] = saturate_cast<T>(invdet * (b[0] * A[1][1] - b[1] * A[0][1]));\r
+ if (det != 0)\r
+ {\r
+ double invdet = 1.0 / det;\r
\r
- x[1] = saturate_cast<T>(invdet * (A[0][0] * b[1] - A[1][0] * b[0]));\r
+ x[0] = saturate_cast<T>(invdet * (b[0] * A[1][1] - b[1] * A[0][1]));\r
\r
- return true;\r
- }\r
+ x[1] = saturate_cast<T>(invdet * (A[0][0] * b[1] - A[1][0] * b[0]));\r
\r
- return false;\r
-}\r
+ return true;\r
+ }\r
\r
-// solve 3x3 linear system Ax=b\r
-template <typename T> __device__ __forceinline__ bool solve3x3(const T A[3][3], const T b[3], T x[3])\r
-{\r
- T det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])\r
- - A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])\r
- + A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);\r
+ return false;\r
+ }\r
\r
- if (det != 0)\r
+ // solve 3x3 linear system Ax=b\r
+ template <typename T> __device__ __forceinline__ bool solve3x3(const T A[3][3], const T b[3], T x[3])\r
{\r
- double invdet = 1.0 / det;\r
+ T det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])\r
+ - A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])\r
+ + A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);\r
\r
- x[0] = saturate_cast<T>(invdet * \r
- (b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -\r
- A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +\r
- A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] )));\r
+ if (det != 0)\r
+ {\r
+ double invdet = 1.0 / det;\r
\r
- x[1] = saturate_cast<T>(invdet * \r
- (A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -\r
- b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +\r
- A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0])));\r
+ x[0] = saturate_cast<T>(invdet * \r
+ (b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -\r
+ A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +\r
+ A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] )));\r
\r
- x[2] = saturate_cast<T>(invdet * \r
- (A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -\r
- A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +\r
- b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));\r
+ x[1] = saturate_cast<T>(invdet * \r
+ (A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -\r
+ b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +\r
+ A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0])));\r
\r
- return true;\r
- }\r
+ x[2] = saturate_cast<T>(invdet * \r
+ (A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -\r
+ A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +\r
+ b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));\r
\r
- return false;\r
-}\r
+ return true;\r
+ }\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ return false;\r
+ }\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif // __OPENCV_GPU_UTILITY_HPP__\r
#include "functional.hpp"\r
#include "detail/vec_distance_detail.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template <typename T> struct L1Dist\r
+namespace cv { namespace gpu { namespace device \r
{\r
- typedef int value_type;\r
- typedef int result_type;\r
-\r
- __device__ __forceinline__ L1Dist() : mySum(0) {}\r
-\r
- __device__ __forceinline__ void reduceIter(int val1, int val2)\r
+ template <typename T> struct L1Dist\r
{\r
- mySum = __sad(val1, val2, mySum);\r
- }\r
+ typedef int value_type;\r
+ typedef int result_type;\r
\r
- template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)\r
- {\r
- reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());\r
- }\r
+ __device__ __forceinline__ L1Dist() : mySum(0) {}\r
\r
- __device__ __forceinline__ operator int() const\r
- {\r
- return mySum;\r
- }\r
+ __device__ __forceinline__ void reduceIter(int val1, int val2)\r
+ {\r
+ mySum = __sad(val1, val2, mySum);\r
+ }\r
\r
- int mySum;\r
-};\r
-template <> struct L1Dist<float>\r
-{\r
- typedef float value_type;\r
- typedef float result_type;\r
+ template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)\r
+ {\r
+ reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());\r
+ }\r
\r
- __device__ __forceinline__ L1Dist() : mySum(0.0f) {}\r
+ __device__ __forceinline__ operator int() const\r
+ {\r
+ return mySum;\r
+ }\r
\r
- __device__ __forceinline__ void reduceIter(float val1, float val2)\r
+ int mySum;\r
+ };\r
+ template <> struct L1Dist<float>\r
{\r
- mySum += ::fabs(val1 - val2);\r
- }\r
+ typedef float value_type;\r
+ typedef float result_type;\r
\r
- template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)\r
- {\r
- reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());\r
- }\r
+ __device__ __forceinline__ L1Dist() : mySum(0.0f) {}\r
\r
- __device__ __forceinline__ operator float() const\r
- {\r
- return mySum;\r
- }\r
+ __device__ __forceinline__ void reduceIter(float val1, float val2)\r
+ {\r
+ mySum += ::fabs(val1 - val2);\r
+ }\r
\r
- float mySum;\r
-};\r
+ template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)\r
+ {\r
+ reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());\r
+ }\r
\r
-struct L2Dist\r
-{\r
- typedef float value_type;\r
- typedef float result_type;\r
+ __device__ __forceinline__ operator float() const\r
+ {\r
+ return mySum;\r
+ }\r
\r
- __device__ __forceinline__ L2Dist() : mySum(0.0f) {}\r
+ float mySum;\r
+ };\r
\r
- __device__ __forceinline__ void reduceIter(float val1, float val2)\r
+ struct L2Dist\r
{\r
- float reg = val1 - val2;\r
- mySum += reg * reg;\r
- }\r
+ typedef float value_type;\r
+ typedef float result_type;\r
\r
- template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)\r
- {\r
- reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());\r
- }\r
+ __device__ __forceinline__ L2Dist() : mySum(0.0f) {}\r
\r
- __device__ __forceinline__ operator float() const\r
- {\r
- return sqrtf(mySum);\r
- }\r
+ __device__ __forceinline__ void reduceIter(float val1, float val2)\r
+ {\r
+ float reg = val1 - val2;\r
+ mySum += reg * reg;\r
+ }\r
\r
- float mySum;\r
-};\r
+ template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)\r
+ {\r
+ reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());\r
+ }\r
\r
-struct HammingDist\r
-{\r
- typedef int value_type;\r
- typedef int result_type;\r
+ __device__ __forceinline__ operator float() const\r
+ {\r
+ return sqrtf(mySum);\r
+ }\r
\r
- __device__ __forceinline__ HammingDist() : mySum(0) {}\r
+ float mySum;\r
+ };\r
\r
- __device__ __forceinline__ void reduceIter(int val1, int val2)\r
+ struct HammingDist\r
{\r
- mySum += __popc(val1 ^ val2);\r
- }\r
+ typedef int value_type;\r
+ typedef int result_type;\r
\r
- template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)\r
- {\r
- reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());\r
- }\r
+ __device__ __forceinline__ HammingDist() : mySum(0) {}\r
\r
- __device__ __forceinline__ operator int() const\r
- {\r
- return mySum;\r
- }\r
+ __device__ __forceinline__ void reduceIter(int val1, int val2)\r
+ {\r
+ mySum += __popc(val1 ^ val2);\r
+ }\r
\r
- int mySum;\r
-};\r
+ template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)\r
+ {\r
+ reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());\r
+ }\r
\r
-// calc distance between two vectors in global memory\r
-template <int THREAD_DIM, typename Dist, typename T1, typename T2> \r
-__device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)\r
-{\r
- for (int i = tid; i < len; i += THREAD_DIM)\r
- {\r
- T1 val1;\r
- ForceGlob<T1>::Load(vec1, i, val1);\r
+ __device__ __forceinline__ operator int() const\r
+ {\r
+ return mySum;\r
+ }\r
\r
- T2 val2;\r
- ForceGlob<T2>::Load(vec2, i, val2);\r
+ int mySum;\r
+ };\r
\r
- dist.reduceIter(val1, val2);\r
- }\r
+ // calc distance between two vectors in global memory\r
+ template <int THREAD_DIM, typename Dist, typename T1, typename T2> \r
+ __device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)\r
+ {\r
+ for (int i = tid; i < len; i += THREAD_DIM)\r
+ {\r
+ T1 val1;\r
+ ForceGlob<T1>::Load(vec1, i, val1);\r
\r
- dist.reduceAll<THREAD_DIM>(smem, tid);\r
-}\r
+ T2 val2;\r
+ ForceGlob<T2>::Load(vec2, i, val2);\r
\r
-// calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory\r
-template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>\r
-__device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)\r
-{ \r
- detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);\r
- \r
- dist.reduceAll<THREAD_DIM>(smem, tid);\r
-}\r
+ dist.reduceIter(val1, val2);\r
+ }\r
\r
-// calc distance between two vectors in global memory\r
-template <int THREAD_DIM, typename T1> struct VecDiffGlobal\r
-{\r
- explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0)\r
- {\r
- vec1 = vec1_;\r
+ dist.reduceAll<THREAD_DIM>(smem, tid);\r
}\r
\r
- template <typename T2, typename Dist>\r
- __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const\r
- {\r
- calcVecDiffGlobal<THREAD_DIM>(vec1, vec2, len, dist, smem, tid);\r
+ // calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory\r
+ template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>\r
+ __device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)\r
+ { \r
+ vec_distance_detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);\r
+ \r
+ dist.reduceAll<THREAD_DIM>(smem, tid);\r
}\r
\r
- const T1* vec1;\r
-};\r
-\r
-// calc distance between two vectors, first vector is cached in register memory, second vector is in global memory\r
-template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister\r
-{\r
- template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)\r
+ // calc distance between two vectors in global memory\r
+ template <int THREAD_DIM, typename T1> struct VecDiffGlobal\r
{\r
- if (glob_tid < len)\r
- smem[glob_tid] = vec1[glob_tid];\r
- __syncthreads();\r
-\r
- U* vec1ValsPtr = vec1Vals;\r
+ explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0)\r
+ {\r
+ vec1 = vec1_;\r
+ }\r
\r
- #pragma unroll\r
- for (int i = tid; i < MAX_LEN; i += THREAD_DIM)\r
- *vec1ValsPtr++ = smem[i];\r
+ template <typename T2, typename Dist>\r
+ __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const\r
+ {\r
+ calcVecDiffGlobal<THREAD_DIM>(vec1, vec2, len, dist, smem, tid);\r
+ }\r
\r
- __syncthreads();\r
- }\r
+ const T1* vec1;\r
+ };\r
\r
- template <typename T2, typename Dist>\r
- __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const\r
+ // calc distance between two vectors, first vector is cached in register memory, second vector is in global memory\r
+ template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister\r
{\r
- calcVecDiffCached<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>(vec1Vals, vec2, len, dist, smem, tid);\r
- }\r
+ template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)\r
+ {\r
+ if (glob_tid < len)\r
+ smem[glob_tid] = vec1[glob_tid];\r
+ __syncthreads();\r
+\r
+ U* vec1ValsPtr = vec1Vals;\r
+\r
+ #pragma unroll\r
+ for (int i = tid; i < MAX_LEN; i += THREAD_DIM)\r
+ *vec1ValsPtr++ = smem[i];\r
+\r
+ __syncthreads();\r
+ }\r
\r
- U vec1Vals[MAX_LEN / THREAD_DIM];\r
-};\r
+ template <typename T2, typename Dist>\r
+ __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const\r
+ {\r
+ calcVecDiffCached<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>(vec1Vals, vec2, len, dist, smem, tid);\r
+ }\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ U vec1Vals[MAX_LEN / THREAD_DIM];\r
+ };\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif // __OPENCV_GPU_VEC_DISTANCE_HPP__\r
#include "vec_traits.hpp"\r
#include "functional.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace detail\r
+namespace cv { namespace gpu { namespace device \r
{\r
- template <int cn, typename VecD> struct SatCastHelper;\r
- template <typename VecD> struct SatCastHelper<1, VecD>\r
+ namespace vec_math_detail\r
{\r
- template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)\r
+ template <int cn, typename VecD> struct SatCastHelper;\r
+ template <typename VecD> struct SatCastHelper<1, VecD>\r
{\r
- typedef typename VecTraits<VecD>::elem_type D;\r
- return VecTraits<VecD>::make(saturate_cast<D>(v.x));\r
- }\r
- };\r
- template <typename VecD> struct SatCastHelper<2, VecD>\r
- {\r
- template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)\r
+ template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)\r
+ {\r
+ typedef typename VecTraits<VecD>::elem_type D;\r
+ return VecTraits<VecD>::make(saturate_cast<D>(v.x));\r
+ }\r
+ };\r
+ template <typename VecD> struct SatCastHelper<2, VecD>\r
{\r
- typedef typename VecTraits<VecD>::elem_type D;\r
- return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));\r
- }\r
- };\r
- template <typename VecD> struct SatCastHelper<3, VecD>\r
- {\r
- template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)\r
+ template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)\r
+ {\r
+ typedef typename VecTraits<VecD>::elem_type D;\r
+ return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));\r
+ }\r
+ };\r
+ template <typename VecD> struct SatCastHelper<3, VecD>\r
{\r
- typedef typename VecTraits<VecD>::elem_type D;\r
- return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));\r
- }\r
- };\r
- template <typename VecD> struct SatCastHelper<4, VecD>\r
- {\r
- template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)\r
+ template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)\r
+ {\r
+ typedef typename VecTraits<VecD>::elem_type D;\r
+ return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));\r
+ }\r
+ };\r
+ template <typename VecD> struct SatCastHelper<4, VecD>\r
{\r
- typedef typename VecTraits<VecD>::elem_type D;\r
- return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));\r
- }\r
- };\r
+ template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)\r
+ {\r
+ typedef typename VecTraits<VecD>::elem_type D;\r
+ return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));\r
+ }\r
+ };\r
\r
- template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v)\r
- {\r
- return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);\r
+ template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v)\r
+ {\r
+ return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);\r
+ }\r
}\r
-}\r
\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
+ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}\r
\r
#define OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, op, func) \\r
__device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a) \\r
return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \\r
}\r
\r
-namespace detail\r
-{ \r
- template <typename T1, typename T2> struct BinOpTraits\r
- {\r
- typedef int argument_type;\r
- };\r
- template <typename T> struct BinOpTraits<T, T>\r
- {\r
- typedef T argument_type;\r
- };\r
- template <typename T> struct BinOpTraits<T, double>\r
- {\r
- typedef double argument_type;\r
- };\r
- template <typename T> struct BinOpTraits<double, T>\r
- {\r
- typedef double argument_type;\r
- };\r
- template <> struct BinOpTraits<double, double>\r
- {\r
- typedef double argument_type;\r
- };\r
- template <typename T> struct BinOpTraits<T, float>\r
- {\r
- typedef float argument_type;\r
- };\r
- template <typename T> struct BinOpTraits<float, T>\r
- {\r
- typedef float argument_type;\r
- };\r
- template <> struct BinOpTraits<float, float>\r
- {\r
- typedef float argument_type;\r
- };\r
- template <> struct BinOpTraits<double, float>\r
- {\r
- typedef double argument_type;\r
- };\r
- template <> struct BinOpTraits<float, double>\r
- {\r
- typedef double argument_type;\r
- };\r
-}\r
+ namespace vec_math_detail\r
+ { \r
+ template <typename T1, typename T2> struct BinOpTraits\r
+ {\r
+ typedef int argument_type;\r
+ };\r
+ template <typename T> struct BinOpTraits<T, T>\r
+ {\r
+ typedef T argument_type;\r
+ };\r
+ template <typename T> struct BinOpTraits<T, double>\r
+ {\r
+ typedef double argument_type;\r
+ };\r
+ template <typename T> struct BinOpTraits<double, T>\r
+ {\r
+ typedef double argument_type;\r
+ };\r
+ template <> struct BinOpTraits<double, double>\r
+ {\r
+ typedef double argument_type;\r
+ };\r
+ template <typename T> struct BinOpTraits<T, float>\r
+ {\r
+ typedef float argument_type;\r
+ };\r
+ template <typename T> struct BinOpTraits<float, T>\r
+ {\r
+ typedef float argument_type;\r
+ };\r
+ template <> struct BinOpTraits<float, float>\r
+ {\r
+ typedef float argument_type;\r
+ };\r
+ template <> struct BinOpTraits<double, float>\r
+ {\r
+ typedef double argument_type;\r
+ };\r
+ template <> struct BinOpTraits<float, double>\r
+ {\r
+ typedef double argument_type;\r
+ };\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \\r
__device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \\r
return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x, b.x)); \\r
} \\r
template <typename T> \\r
- __device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \\r
+ __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \\r
{ \\r
- func<typename detail::BinOpTraits<type, T>::argument_type> f; \\r
- return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \\r
+ func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \\r
+ return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \\r
} \\r
template <typename T> \\r
- __device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \\r
+ __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \\r
{ \\r
- func<typename detail::BinOpTraits<type, T>::argument_type> f; \\r
- return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \\r
+ func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \\r
+ return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \\r
} \\r
__device__ __forceinline__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \\r
{ \\r
return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x, b.x), f(a.y, b.y)); \\r
} \\r
template <typename T> \\r
- __device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \\r
+ __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \\r
{ \\r
- func<typename detail::BinOpTraits<type, T>::argument_type> f; \\r
- return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \\r
+ func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \\r
+ return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \\r
} \\r
template <typename T> \\r
- __device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \\r
+ __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \\r
{ \\r
- func<typename detail::BinOpTraits<type, T>::argument_type> f; \\r
- return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \\r
+ func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \\r
+ return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \\r
} \\r
__device__ __forceinline__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \\r
{ \\r
return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \\r
} \\r
template <typename T> \\r
- __device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \\r
+ __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \\r
{ \\r
- func<typename detail::BinOpTraits<type, T>::argument_type> f; \\r
- return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \\r
+ func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \\r
+ return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \\r
} \\r
template <typename T> \\r
- __device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \\r
+ __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \\r
{ \\r
- func<typename detail::BinOpTraits<type, T>::argument_type> f; \\r
- return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \\r
+ func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \\r
+ return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \\r
} \\r
__device__ __forceinline__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \\r
{ \\r
return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \\r
} \\r
template <typename T> \\r
- __device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \\r
+ __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \\r
{ \\r
- func<typename detail::BinOpTraits<type, T>::argument_type> f; \\r
- return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \\r
+ func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \\r
+ return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \\r
} \\r
template <typename T> \\r
- __device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \\r
+ __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \\r
{ \\r
- func<typename detail::BinOpTraits<T, type>::argument_type> f; \\r
- return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \\r
+ func<typename vec_math_detail::BinOpTraits<T, type>::argument_type> f; \\r
+ return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \\r
}\r
\r
#define OPENCV_GPU_IMPLEMENT_VEC_OP(type) \\r
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ^, bit_xor) \\r
OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ~, bit_not)\r
\r
-OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uchar)\r
-OPENCV_GPU_IMPLEMENT_VEC_INT_OP(char)\r
-OPENCV_GPU_IMPLEMENT_VEC_INT_OP(ushort)\r
-OPENCV_GPU_IMPLEMENT_VEC_INT_OP(short)\r
-OPENCV_GPU_IMPLEMENT_VEC_INT_OP(int)\r
-OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uint)\r
-OPENCV_GPU_IMPLEMENT_VEC_OP(float)\r
-OPENCV_GPU_IMPLEMENT_VEC_OP(double)\r
-\r
-#undef OPENCV_GPU_IMPLEMENT_VEC_UNOP\r
-#undef OPENCV_GPU_IMPLEMENT_VEC_BINOP\r
-#undef OPENCV_GPU_IMPLEMENT_VEC_OP\r
-#undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP\r
+ OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uchar)\r
+ OPENCV_GPU_IMPLEMENT_VEC_INT_OP(char)\r
+ OPENCV_GPU_IMPLEMENT_VEC_INT_OP(ushort)\r
+ OPENCV_GPU_IMPLEMENT_VEC_INT_OP(short)\r
+ OPENCV_GPU_IMPLEMENT_VEC_INT_OP(int)\r
+ OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uint)\r
+ OPENCV_GPU_IMPLEMENT_VEC_OP(float)\r
+ OPENCV_GPU_IMPLEMENT_VEC_OP(double)\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ #undef OPENCV_GPU_IMPLEMENT_VEC_UNOP\r
+ #undef OPENCV_GPU_IMPLEMENT_VEC_BINOP\r
+ #undef OPENCV_GPU_IMPLEMENT_VEC_OP\r
+ #undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif // __OPENCV_GPU_VECMATH_HPP__
\ No newline at end of file
\r
#include "internal_shared.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-template<typename T, int N> struct TypeVec;\r
-\r
-struct __align__(8) uchar8\r
-{\r
- uchar a0, a1, a2, a3, a4, a5, a6, a7;\r
-};\r
-static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)\r
-{\r
- uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
- return val;\r
-}\r
-struct __align__(8) char8\r
-{\r
- schar a0, a1, a2, a3, a4, a5, a6, a7;\r
-};\r
-static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)\r
-{\r
- char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
- return val;\r
-}\r
-struct __align__(16) ushort8\r
-{\r
- ushort a0, a1, a2, a3, a4, a5, a6, a7;\r
-};\r
-static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)\r
-{\r
- ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
- return val;\r
-}\r
-struct __align__(16) short8\r
-{\r
- short a0, a1, a2, a3, a4, a5, a6, a7;\r
-};\r
-static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)\r
-{\r
- short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
- return val;\r
-}\r
-struct __align__(32) uint8\r
+namespace cv { namespace gpu { namespace device \r
{\r
- uint a0, a1, a2, a3, a4, a5, a6, a7;\r
-};\r
-static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)\r
-{\r
- uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
- return val;\r
-}\r
-struct __align__(32) int8\r
-{\r
- int a0, a1, a2, a3, a4, a5, a6, a7;\r
-};\r
-static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)\r
-{\r
- int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
- return val;\r
-}\r
-struct __align__(32) float8\r
-{\r
- float a0, a1, a2, a3, a4, a5, a6, a7;\r
-};\r
-static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)\r
-{\r
- float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
- return val;\r
-}\r
-struct double8\r
-{\r
- double a0, a1, a2, a3, a4, a5, a6, a7;\r
-};\r
-static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)\r
-{\r
- double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
- return val;\r
-}\r
+ template<typename T, int N> struct TypeVec;\r
+\r
+ struct __align__(8) uchar8\r
+ {\r
+ uchar a0, a1, a2, a3, a4, a5, a6, a7;\r
+ };\r
+ static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)\r
+ {\r
+ uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
+ return val;\r
+ }\r
+ struct __align__(8) char8\r
+ {\r
+ schar a0, a1, a2, a3, a4, a5, a6, a7;\r
+ };\r
+ static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)\r
+ {\r
+ char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
+ return val;\r
+ }\r
+ struct __align__(16) ushort8\r
+ {\r
+ ushort a0, a1, a2, a3, a4, a5, a6, a7;\r
+ };\r
+ static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)\r
+ {\r
+ ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
+ return val;\r
+ }\r
+ struct __align__(16) short8\r
+ {\r
+ short a0, a1, a2, a3, a4, a5, a6, a7;\r
+ };\r
+ static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)\r
+ {\r
+ short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
+ return val;\r
+ }\r
+ struct __align__(32) uint8\r
+ {\r
+ uint a0, a1, a2, a3, a4, a5, a6, a7;\r
+ };\r
+ static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)\r
+ {\r
+ uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
+ return val;\r
+ }\r
+ struct __align__(32) int8\r
+ {\r
+ int a0, a1, a2, a3, a4, a5, a6, a7;\r
+ };\r
+ static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)\r
+ {\r
+ int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
+ return val;\r
+ }\r
+ struct __align__(32) float8\r
+ {\r
+ float a0, a1, a2, a3, a4, a5, a6, a7;\r
+ };\r
+ static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)\r
+ {\r
+ float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
+ return val;\r
+ }\r
+ struct double8\r
+ {\r
+ double a0, a1, a2, a3, a4, a5, a6, a7;\r
+ };\r
+ static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)\r
+ {\r
+ double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
+ return val;\r
+ }\r
\r
#define OPENCV_GPU_IMPLEMENT_TYPE_VEC(type) \\r
template<> struct TypeVec<type, 1> { typedef type vec_type; }; \\r
template<> struct TypeVec<type, 8> { typedef type ## 8 vec_type; }; \\r
template<> struct TypeVec<type ## 8, 8> { typedef type ## 8 vec_type; };\r
\r
-OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)\r
-OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)\r
-OPENCV_GPU_IMPLEMENT_TYPE_VEC(ushort)\r
-OPENCV_GPU_IMPLEMENT_TYPE_VEC(short)\r
-OPENCV_GPU_IMPLEMENT_TYPE_VEC(int)\r
-OPENCV_GPU_IMPLEMENT_TYPE_VEC(uint)\r
-OPENCV_GPU_IMPLEMENT_TYPE_VEC(float)\r
-OPENCV_GPU_IMPLEMENT_TYPE_VEC(double)\r
+ OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)\r
+ OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)\r
+ OPENCV_GPU_IMPLEMENT_TYPE_VEC(ushort)\r
+ OPENCV_GPU_IMPLEMENT_TYPE_VEC(short)\r
+ OPENCV_GPU_IMPLEMENT_TYPE_VEC(int)\r
+ OPENCV_GPU_IMPLEMENT_TYPE_VEC(uint)\r
+ OPENCV_GPU_IMPLEMENT_TYPE_VEC(float)\r
+ OPENCV_GPU_IMPLEMENT_TYPE_VEC(double)\r
\r
-#undef OPENCV_GPU_IMPLEMENT_TYPE_VEC\r
+ #undef OPENCV_GPU_IMPLEMENT_TYPE_VEC\r
\r
-template<> struct TypeVec<schar, 1> { typedef schar vec_type; };\r
-template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };\r
-template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };\r
-template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };\r
-template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };\r
+ template<> struct TypeVec<schar, 1> { typedef schar vec_type; };\r
+ template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };\r
+ template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };\r
+ template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };\r
+ template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };\r
\r
-template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };\r
-template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };\r
-template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };\r
-template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };\r
-template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };\r
+ template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };\r
+ template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };\r
+ template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };\r
+ template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };\r
+ template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };\r
\r
template<typename T> struct VecTraits;\r
\r
static __device__ __host__ __forceinline__ type ## 8 make(const type* v) {return make_ ## type ## 8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);} \\r
};\r
\r
-OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)\r
-OPENCV_GPU_IMPLEMENT_VEC_TRAITS(ushort)\r
-OPENCV_GPU_IMPLEMENT_VEC_TRAITS(short)\r
-OPENCV_GPU_IMPLEMENT_VEC_TRAITS(int)\r
-OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uint)\r
-OPENCV_GPU_IMPLEMENT_VEC_TRAITS(float)\r
-OPENCV_GPU_IMPLEMENT_VEC_TRAITS(double)\r
+ OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)\r
+ OPENCV_GPU_IMPLEMENT_VEC_TRAITS(ushort)\r
+ OPENCV_GPU_IMPLEMENT_VEC_TRAITS(short)\r
+ OPENCV_GPU_IMPLEMENT_VEC_TRAITS(int)\r
+ OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uint)\r
+ OPENCV_GPU_IMPLEMENT_VEC_TRAITS(float)\r
+ OPENCV_GPU_IMPLEMENT_VEC_TRAITS(double)\r
\r
-#undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS\r
+ #undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS\r
\r
-template<> struct VecTraits<char> \r
-{ \r
- typedef char elem_type; \r
- enum {cn=1}; \r
- static __device__ __host__ __forceinline__ char all(char v) {return v;}\r
- static __device__ __host__ __forceinline__ char make(char x) {return x;}\r
- static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}\r
-};\r
-template<> struct VecTraits<schar> \r
-{ \r
- typedef schar elem_type; \r
- enum {cn=1}; \r
- static __device__ __host__ __forceinline__ schar all(schar v) {return v;}\r
- static __device__ __host__ __forceinline__ schar make(schar x) {return x;}\r
- static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}\r
-};\r
-template<> struct VecTraits<char1>\r
-{\r
- typedef schar elem_type;\r
- enum {cn=1};\r
- static __device__ __host__ __forceinline__ char1 all(schar v) {return make_char1(v);}\r
- static __device__ __host__ __forceinline__ char1 make(schar x) {return make_char1(x);}\r
- static __device__ __host__ __forceinline__ char1 make(const schar* v) {return make_char1(v[0]);}\r
-};\r
-template<> struct VecTraits<char2>\r
-{\r
- typedef schar elem_type;\r
- enum {cn=2};\r
- static __device__ __host__ __forceinline__ char2 all(schar v) {return make_char2(v, v);}\r
- static __device__ __host__ __forceinline__ char2 make(schar x, schar y) {return make_char2(x, y);}\r
- static __device__ __host__ __forceinline__ char2 make(const schar* v) {return make_char2(v[0], v[1]);}\r
-};\r
-template<> struct VecTraits<char3>\r
-{\r
- typedef schar elem_type;\r
- enum {cn=3};\r
- static __device__ __host__ __forceinline__ char3 all(schar v) {return make_char3(v, v, v);}\r
- static __device__ __host__ __forceinline__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);}\r
- static __device__ __host__ __forceinline__ char3 make(const schar* v) {return make_char3(v[0], v[1], v[2]);}\r
-};\r
-template<> struct VecTraits<char4>\r
-{\r
- typedef schar elem_type;\r
- enum {cn=4};\r
- static __device__ __host__ __forceinline__ char4 all(schar v) {return make_char4(v, v, v, v);}\r
- static __device__ __host__ __forceinline__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);}\r
- static __device__ __host__ __forceinline__ char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);}\r
-};\r
-template<> struct VecTraits<char8>\r
-{\r
- typedef schar elem_type;\r
- enum {cn=8};\r
- static __device__ __host__ __forceinline__ char8 all(schar v) {return make_char8(v, v, v, v, v, v, v, v);}\r
- static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}\r
- static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);}\r
-};\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ template<> struct VecTraits<char> \r
+ { \r
+ typedef char elem_type; \r
+ enum {cn=1}; \r
+ static __device__ __host__ __forceinline__ char all(char v) {return v;}\r
+ static __device__ __host__ __forceinline__ char make(char x) {return x;}\r
+ static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}\r
+ };\r
+ template<> struct VecTraits<schar> \r
+ { \r
+ typedef schar elem_type; \r
+ enum {cn=1}; \r
+ static __device__ __host__ __forceinline__ schar all(schar v) {return v;}\r
+ static __device__ __host__ __forceinline__ schar make(schar x) {return x;}\r
+ static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}\r
+ };\r
+ template<> struct VecTraits<char1>\r
+ {\r
+ typedef schar elem_type;\r
+ enum {cn=1};\r
+ static __device__ __host__ __forceinline__ char1 all(schar v) {return make_char1(v);}\r
+ static __device__ __host__ __forceinline__ char1 make(schar x) {return make_char1(x);}\r
+ static __device__ __host__ __forceinline__ char1 make(const schar* v) {return make_char1(v[0]);}\r
+ };\r
+ template<> struct VecTraits<char2>\r
+ {\r
+ typedef schar elem_type;\r
+ enum {cn=2};\r
+ static __device__ __host__ __forceinline__ char2 all(schar v) {return make_char2(v, v);}\r
+ static __device__ __host__ __forceinline__ char2 make(schar x, schar y) {return make_char2(x, y);}\r
+ static __device__ __host__ __forceinline__ char2 make(const schar* v) {return make_char2(v[0], v[1]);}\r
+ };\r
+ template<> struct VecTraits<char3>\r
+ {\r
+ typedef schar elem_type;\r
+ enum {cn=3};\r
+ static __device__ __host__ __forceinline__ char3 all(schar v) {return make_char3(v, v, v);}\r
+ static __device__ __host__ __forceinline__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);}\r
+ static __device__ __host__ __forceinline__ char3 make(const schar* v) {return make_char3(v[0], v[1], v[2]);}\r
+ };\r
+ template<> struct VecTraits<char4>\r
+ {\r
+ typedef schar elem_type;\r
+ enum {cn=4};\r
+ static __device__ __host__ __forceinline__ char4 all(schar v) {return make_char4(v, v, v, v);}\r
+ static __device__ __host__ __forceinline__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);}\r
+ static __device__ __host__ __forceinline__ char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);}\r
+ };\r
+ template<> struct VecTraits<char8>\r
+ {\r
+ typedef schar elem_type;\r
+ enum {cn=8};\r
+ static __device__ __host__ __forceinline__ char8 all(schar v) {return make_char8(v, v, v, v, v, v, v, v);}\r
+ static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}\r
+ static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);}\r
+ };\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif // __OPENCV_GPU_VEC_TRAITS_HPP__\r
\r
#include "internal_shared.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-struct Warp\r
+namespace cv { namespace gpu { namespace device \r
{\r
- enum\r
- {\r
- LOG_WARP_SIZE = 5,\r
- WARP_SIZE = 1 << LOG_WARP_SIZE,\r
- STRIDE = WARP_SIZE\r
- };\r
-\r
- /** \brief Returns the warp lane ID of the calling thread. */\r
- static __device__ __forceinline__ unsigned int laneId()\r
+ struct Warp\r
{\r
- unsigned int ret;\r
- asm("mov.u32 %0, %laneid;" : "=r"(ret) );\r
- return ret;\r
- }\r
+ enum\r
+ {\r
+ LOG_WARP_SIZE = 5,\r
+ WARP_SIZE = 1 << LOG_WARP_SIZE,\r
+ STRIDE = WARP_SIZE\r
+ };\r
\r
- template<typename It, typename T>\r
- static __device__ __forceinline__ void fill(It beg, It end, const T& value)\r
- { \r
- for(It t = beg + laneId(); t < end; t += STRIDE)\r
- *t = value;\r
- } \r
+ /** \brief Returns the warp lane ID of the calling thread. */\r
+ static __device__ __forceinline__ unsigned int laneId()\r
+ {\r
+ unsigned int ret;\r
+ asm("mov.u32 %0, %laneid;" : "=r"(ret) );\r
+ return ret;\r
+ }\r
\r
- template<typename InIt, typename OutIt>\r
- static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)\r
- { \r
- for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)\r
- *out = *t;\r
- return out;\r
- } \r
+ template<typename It, typename T>\r
+ static __device__ __forceinline__ void fill(It beg, It end, const T& value)\r
+ { \r
+ for(It t = beg + laneId(); t < end; t += STRIDE)\r
+ *t = value;\r
+ } \r
\r
- template<typename InIt, typename OutIt, class UnOp>\r
- static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)\r
- {\r
- for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)\r
- *out = op(*t);\r
- return out;\r
- }\r
+ template<typename InIt, typename OutIt>\r
+ static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)\r
+ { \r
+ for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)\r
+ *out = *t;\r
+ return out;\r
+ } \r
\r
- template<typename InIt1, typename InIt2, typename OutIt, class BinOp>\r
- static __device__ __forceinline__ OutIt transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)\r
- {\r
- unsigned int lane = laneId();\r
+ template<typename InIt, typename OutIt, class UnOp>\r
+ static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)\r
+ {\r
+ for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)\r
+ *out = op(*t);\r
+ return out;\r
+ }\r
\r
- InIt1 t1 = beg1 + lane; \r
- InIt2 t2 = beg2 + lane;\r
- for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)\r
- *out = op(*t1, *t2);\r
- return out;\r
- }\r
+ template<typename InIt1, typename InIt2, typename OutIt, class BinOp>\r
+ static __device__ __forceinline__ OutIt transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)\r
+ {\r
+ unsigned int lane = laneId();\r
\r
- template<typename OutIt, typename T>\r
- static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)\r
- {\r
- unsigned int lane = laneId(); \r
- value += lane;\r
+ InIt1 t1 = beg1 + lane; \r
+ InIt2 t2 = beg2 + lane;\r
+ for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)\r
+ *out = op(*t1, *t2);\r
+ return out;\r
+ }\r
\r
- for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)\r
- *t = value;\r
- }\r
-};\r
+ template<typename OutIt, typename T>\r
+ static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)\r
+ {\r
+ unsigned int lane = laneId(); \r
+ value += lane;\r
\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)\r
+ *t = value;\r
+ }\r
+ };\r
+}}} // namespace cv { namespace gpu { namespace device\r
\r
#endif /* __OPENCV_GPU_DEVICE_WARP_HPP__ */
\ No newline at end of file
\r
#include "internal_shared.hpp"\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
- \r
-template <class T> \r
-__device__ __forceinline__ T warp_reduce ( volatile T *ptr , const unsigned int tid = threadIdx.x )\r
-{\r
- const unsigned int lane = tid & 31; // index of thread in warp (0..31)\r
- \r
- if (lane < 16)\r
- { \r
- T partial = ptr[tid];\r
+namespace cv { namespace gpu { namespace device \r
+{ \r
+ template <class T> \r
+ __device__ __forceinline__ T warp_reduce(volatile T *ptr , const unsigned int tid = threadIdx.x)\r
+ {\r
+ const unsigned int lane = tid & 31; // index of thread in warp (0..31)\r
+ \r
+ if (lane < 16)\r
+ { \r
+ T partial = ptr[tid];\r
\r
- ptr[tid] = partial = partial + ptr[tid + 16];\r
- ptr[tid] = partial = partial + ptr[tid + 8];\r
- ptr[tid] = partial = partial + ptr[tid + 4];\r
- ptr[tid] = partial = partial + ptr[tid + 2];\r
- ptr[tid] = partial = partial + ptr[tid + 1]; \r
- }\r
+ ptr[tid] = partial = partial + ptr[tid + 16];\r
+ ptr[tid] = partial = partial + ptr[tid + 8];\r
+ ptr[tid] = partial = partial + ptr[tid + 4];\r
+ ptr[tid] = partial = partial + ptr[tid + 2];\r
+ ptr[tid] = partial = partial + ptr[tid + 1]; \r
+ }\r
\r
- return ptr[tid - lane];\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ return ptr[tid - lane];\r
+ }\r
+}}} // namespace cv { namespace gpu { namespace device {\r
\r
#endif /* OPENCV_GPU_WARP_REDUCE_HPP__ */
\ No newline at end of file
\r
#else /* !defined (HAVE_CUDA) */\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace split_merge \r
-{ \r
- void merge_caller(const DevMem2Db* src, DevMem2Db& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);\r
- void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+namespace cv { namespace gpu { namespace device \r
+{\r
+ namespace split_merge \r
+ { \r
+ void merge_caller(const DevMem2Db* src, DevMem2Db& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);\r
+ void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);\r
+ }\r
+}}}\r
\r
namespace\r
{\r
void merge(const GpuMat* src, size_t n, GpuMat& dst, const cudaStream_t& stream) \r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ split_merge;\r
+ using namespace ::cv::gpu::device::split_merge;\r
\r
CV_Assert(src);\r
CV_Assert(n > 0);\r
\r
void split(const GpuMat& src, GpuMat* dst, const cudaStream_t& stream) \r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ split_merge;\r
+ using namespace ::cv::gpu::device::split_merge;\r
\r
CV_Assert(dst);\r
\r
\r
#else /* !defined (HAVE_CUDA) */\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace stereobm\r
+namespace cv { namespace gpu { namespace device \r
{\r
- void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int ndisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t & stream);\r
- void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap /*= 31*/, cudaStream_t & stream);\r
- void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace stereobm\r
+ {\r
+ void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int ndisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t & stream);\r
+ void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap /*= 31*/, cudaStream_t & stream);\r
+ void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream);\r
+ }\r
+}}}\r
\r
const float defaultAvgTexThreshold = 3;\r
\r
{\r
void stereo_bm_gpu_operator( GpuMat& minSSD, GpuMat& leBuf, GpuMat& riBuf, int preset, int ndisp, int winSize, float avergeTexThreshold, const GpuMat& left, const GpuMat& right, GpuMat& disparity, cudaStream_t stream)\r
{\r
- using namespace OPENCV_DEVICE_NAMESPACE_ stereobm;\r
+ using namespace ::cv::gpu::device::stereobm;\r
\r
CV_DbgAssert(left.rows == right.rows && left.cols == right.cols);\r
CV_DbgAssert(left.type() == CV_8UC1);\r
\r
#else /* !defined (HAVE_CUDA) */\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace stereobp\r
+namespace cv { namespace gpu { namespace device \r
{\r
- void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump);\r
- template<typename T, typename D>\r
- void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);\r
- template<typename T>\r
- void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);\r
- template <typename T>\r
- void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);\r
- template <typename T>\r
- void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, \r
- const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);\r
- template <typename T>\r
- void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, \r
- const DevMem2D_<short>& disp, cudaStream_t stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
-\r
-using namespace OPENCV_DEVICE_NAMESPACE_ stereobp;\r
+ namespace stereobp\r
+ {\r
+ void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump);\r
+ template<typename T, typename D>\r
+ void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);\r
+ template<typename T>\r
+ void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);\r
+ template <typename T>\r
+ void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);\r
+ template <typename T>\r
+ void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, \r
+ const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);\r
+ template <typename T>\r
+ void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, \r
+ const DevMem2D_<short>& disp, cudaStream_t stream);\r
+ }\r
+}}}\r
+\r
+using namespace ::cv::gpu::device::stereobp;\r
\r
namespace\r
{\r
\r
#else /* !defined (HAVE_CUDA) */\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace stereocsbp\r
+namespace cv { namespace gpu { namespace device \r
{\r
- void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,\r
- const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& temp);\r
-\r
- template<class T>\r
- void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,\r
- int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);\r
-\r
- template<class T>\r
- void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2,\r
- int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);\r
-\r
- template<class T>\r
- void init_message(T* u_new, T* d_new, T* l_new, T* r_new,\r
- const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,\r
- T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,\r
- T* data_cost_selected, const T* data_cost, size_t msg_step1, size_t msg_step2,\r
- int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);\r
-\r
- template<class T>\r
- void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,\r
- const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream);\r
-\r
- template<class T> \r
- void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,\r
- const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ namespace stereocsbp\r
+ {\r
+ void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,\r
+ const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& temp);\r
+\r
+ template<class T>\r
+ void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,\r
+ int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);\r
+\r
+ template<class T>\r
+ void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2,\r
+ int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);\r
+\r
+ template<class T>\r
+ void init_message(T* u_new, T* d_new, T* l_new, T* r_new,\r
+ const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,\r
+ T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,\r
+ T* data_cost_selected, const T* data_cost, size_t msg_step1, size_t msg_step2,\r
+ int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);\r
+\r
+ template<class T>\r
+ void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,\r
+ const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream);\r
+\r
+ template<class T> \r
+ void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,\r
+ const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);\r
+ }\r
+}}}\r
\r
-using namespace OPENCV_DEVICE_NAMESPACE_ stereocsbp;\r
+using namespace ::cv::gpu::device::stereocsbp;\r
\r
namespace\r
{\r
\r
#else /* !defined (HAVE_CUDA) */\r
\r
-BEGIN_OPENCV_DEVICE_NAMESPACE\r
-\r
-namespace surf\r
+namespace cv { namespace gpu { namespace device \r
{\r
- void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);\r
- void loadOctaveConstants(int octave, int layer_rows, int layer_cols);\r
+ namespace surf\r
+ {\r
+ void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);\r
+ void loadOctaveConstants(int octave, int layer_rows, int layer_cols);\r
\r
- void bindImgTex(DevMem2Db img);\r
- void bindSumTex(DevMem2D_<unsigned int> sum);\r
- void bindMaskSumTex(DevMem2D_<unsigned int> maskSum);\r
+ void bindImgTex(DevMem2Db img);\r
+ void bindSumTex(DevMem2D_<unsigned int> sum);\r
+ void bindMaskSumTex(DevMem2D_<unsigned int> maskSum);\r
\r
- void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers);\r
+ void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers);\r
\r
- void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,\r
- int img_rows, int img_cols, int octave, bool use_mask, int nLayers);\r
+ void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,\r
+ int img_rows, int img_cols, int octave, bool use_mask, int nLayers);\r
\r
- void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, \r
- float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian, \r
- unsigned int* featureCounter);\r
+ void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, \r
+ float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian, \r
+ unsigned int* featureCounter);\r
\r
- void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);\r
+ void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);\r
\r
- void compute_descriptors_gpu(const DevMem2Df& descriptors, \r
- const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);\r
-}\r
-\r
-END_OPENCV_DEVICE_NAMESPACE\r
+ void compute_descriptors_gpu(const DevMem2Df& descriptors, \r
+ const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);\r
+ }\r
+}}}\r
\r
-using namespace OPENCV_DEVICE_NAMESPACE_ surf;\r
+using namespace ::cv::gpu::device::surf;\r
\r
namespace\r
{\r
\r
#ifndef DUMP\r
\r
- EXPECT_MAT_NEAR(newFrame_gold, newFrame, 1e-4);\r
+ EXPECT_MAT_NEAR(newFrame_gold, newFrame, 1e-3);\r
\r
#else\r
\r