From 810829f32ee604433cf99cbc679139e20407f938 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Fri, 9 Nov 2012 13:14:59 +0400 Subject: [PATCH] speedup compilation of row_filter.cu and column_filter.cu split them into several small files --- modules/gpu/src/cuda/column_filter.0.cu | 53 +++++ modules/gpu/src/cuda/column_filter.1.cu | 53 +++++ modules/gpu/src/cuda/column_filter.2.cu | 53 +++++ modules/gpu/src/cuda/column_filter.3.cu | 53 +++++ modules/gpu/src/cuda/column_filter.4.cu | 53 +++++ modules/gpu/src/cuda/column_filter.5.cu | 53 +++++ modules/gpu/src/cuda/column_filter.6.cu | 53 +++++ modules/gpu/src/cuda/column_filter.7.cu | 53 +++++ modules/gpu/src/cuda/column_filter.cu | 391 -------------------------------- modules/gpu/src/cuda/column_filter.h | 378 ++++++++++++++++++++++++++++++ modules/gpu/src/cuda/row_filter.0.cu | 53 +++++ modules/gpu/src/cuda/row_filter.1.cu | 53 +++++ modules/gpu/src/cuda/row_filter.2.cu | 53 +++++ modules/gpu/src/cuda/row_filter.3.cu | 53 +++++ modules/gpu/src/cuda/row_filter.4.cu | 53 +++++ modules/gpu/src/cuda/row_filter.5.cu | 53 +++++ modules/gpu/src/cuda/row_filter.6.cu | 53 +++++ modules/gpu/src/cuda/row_filter.7.cu | 53 +++++ modules/gpu/src/cuda/row_filter.cu | 390 ------------------------------- modules/gpu/src/cuda/row_filter.h | 377 ++++++++++++++++++++++++++++++ modules/gpu/src/filtering.cpp | 54 ++--- 21 files changed, 1625 insertions(+), 813 deletions(-) create mode 100644 modules/gpu/src/cuda/column_filter.0.cu create mode 100644 modules/gpu/src/cuda/column_filter.1.cu create mode 100644 modules/gpu/src/cuda/column_filter.2.cu create mode 100644 modules/gpu/src/cuda/column_filter.3.cu create mode 100644 modules/gpu/src/cuda/column_filter.4.cu create mode 100644 modules/gpu/src/cuda/column_filter.5.cu create mode 100644 modules/gpu/src/cuda/column_filter.6.cu create mode 100644 modules/gpu/src/cuda/column_filter.7.cu delete mode 100644 modules/gpu/src/cuda/column_filter.cu create mode 100644 modules/gpu/src/cuda/column_filter.h create mode 100644 modules/gpu/src/cuda/row_filter.0.cu create mode 100644 modules/gpu/src/cuda/row_filter.1.cu create mode 100644 modules/gpu/src/cuda/row_filter.2.cu create mode 100644 modules/gpu/src/cuda/row_filter.3.cu create mode 100644 modules/gpu/src/cuda/row_filter.4.cu create mode 100644 modules/gpu/src/cuda/row_filter.5.cu create mode 100644 modules/gpu/src/cuda/row_filter.6.cu create mode 100644 modules/gpu/src/cuda/row_filter.7.cu delete mode 100644 modules/gpu/src/cuda/row_filter.cu create mode 100644 modules/gpu/src/cuda/row_filter.h diff --git a/modules/gpu/src/cuda/column_filter.0.cu b/modules/gpu/src/cuda/column_filter.0.cu new file mode 100644 index 0000000..c35c6ee --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.0.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.1.cu b/modules/gpu/src/cuda/column_filter.1.cu new file mode 100644 index 0000000..9a2d6a0 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.1.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.2.cu b/modules/gpu/src/cuda/column_filter.2.cu new file mode 100644 index 0000000..05ee01c --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.2.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.3.cu b/modules/gpu/src/cuda/column_filter.3.cu new file mode 100644 index 0000000..1bf4921 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.3.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.4.cu b/modules/gpu/src/cuda/column_filter.4.cu new file mode 100644 index 0000000..bec7a08 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.4.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.5.cu b/modules/gpu/src/cuda/column_filter.5.cu new file mode 100644 index 0000000..8194ee3 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.5.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.6.cu b/modules/gpu/src/cuda/column_filter.6.cu new file mode 100644 index 0000000..d8fc49b --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.6.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.7.cu b/modules/gpu/src/cuda/column_filter.7.cu new file mode 100644 index 0000000..534bd82 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.7.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu deleted file mode 100644 index af7369a..0000000 --- a/modules/gpu/src/cuda/column_filter.cu +++ /dev/null @@ -1,391 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if !defined CUDA_DISABLER - -#include "internal_shared.hpp" -#include "opencv2/gpu/device/saturate_cast.hpp" -#include "opencv2/gpu/device/vec_math.hpp" -#include "opencv2/gpu/device/limits.hpp" -#include "opencv2/gpu/device/border_interpolate.hpp" -#include "opencv2/gpu/device/static_check.hpp" - -namespace cv { namespace gpu { namespace device -{ - namespace column_filter - { - #define MAX_KERNEL_SIZE 32 - - __constant__ float c_kernel[MAX_KERNEL_SIZE]; - - void loadKernel(const float* kernel, int ksize, cudaStream_t stream) - { - if (stream == 0) - cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); - else - cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); - } - - template - __global__ void linearColumnFilter(const PtrStepSz src, PtrStep dst, const int anchor, const B brd) - { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) - const int BLOCK_DIM_X = 16; - const int BLOCK_DIM_Y = 16; - const int PATCH_PER_BLOCK = 4; - const int HALO_SIZE = KSIZE <= 16 ? 1 : 2; - #else - const int BLOCK_DIM_X = 16; - const int BLOCK_DIM_Y = 8; - const int PATCH_PER_BLOCK = 2; - const int HALO_SIZE = 2; - #endif - - typedef typename TypeVec::cn>::vec_type sum_t; - - __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X]; - - const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; - - if (x >= src.cols) - return; - - const T* src_col = src.ptr() + x; - - const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; - - if (blockIdx.y > 0) - { - //Upper halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x)); - } - else - { - //Upper halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); - } - - if (blockIdx.y + 2 < gridDim.y) - { - //Main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + j * BLOCK_DIM_Y, x)); - - //Lower halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x)); - } - else - { - //Main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); - - //Lower halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); - } - - __syncthreads(); - - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - { - const int y = yStart + j * BLOCK_DIM_Y; - - if (y < src.rows) - { - sum_t sum = VecTraits::all(0); - - #pragma unroll - for (int k = 0; k < KSIZE; ++k) - sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; - - dst(y, x) = saturate_cast(sum); - } - } - } - - template class B> - void linearColumnFilter_caller(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream) - { - int BLOCK_DIM_X; - int BLOCK_DIM_Y; - int PATCH_PER_BLOCK; - - if (cc >= 20) - { - BLOCK_DIM_X = 16; - BLOCK_DIM_Y = 16; - PATCH_PER_BLOCK = 4; - } - else - { - BLOCK_DIM_X = 16; - BLOCK_DIM_Y = 8; - PATCH_PER_BLOCK = 2; - } - - const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); - const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); - - B brd(src.rows); - - linearColumnFilter<<>>(src, dst, anchor, brd); - - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template - void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) - { - typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream); - - static const caller_t callers[5][33] = - { - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColReflect101>, - linearColumnFilter_caller< 2, T, D, BrdColReflect101>, - linearColumnFilter_caller< 3, T, D, BrdColReflect101>, - linearColumnFilter_caller< 4, T, D, BrdColReflect101>, - linearColumnFilter_caller< 5, T, D, BrdColReflect101>, - linearColumnFilter_caller< 6, T, D, BrdColReflect101>, - linearColumnFilter_caller< 7, T, D, BrdColReflect101>, - linearColumnFilter_caller< 8, T, D, BrdColReflect101>, - linearColumnFilter_caller< 9, T, D, BrdColReflect101>, - linearColumnFilter_caller<10, T, D, BrdColReflect101>, - linearColumnFilter_caller<11, T, D, BrdColReflect101>, - linearColumnFilter_caller<12, T, D, BrdColReflect101>, - linearColumnFilter_caller<13, T, D, BrdColReflect101>, - linearColumnFilter_caller<14, T, D, BrdColReflect101>, - linearColumnFilter_caller<15, T, D, BrdColReflect101>, - linearColumnFilter_caller<16, T, D, BrdColReflect101>, - linearColumnFilter_caller<17, T, D, BrdColReflect101>, - linearColumnFilter_caller<18, T, D, BrdColReflect101>, - linearColumnFilter_caller<19, T, D, BrdColReflect101>, - linearColumnFilter_caller<20, T, D, BrdColReflect101>, - linearColumnFilter_caller<21, T, D, BrdColReflect101>, - linearColumnFilter_caller<22, T, D, BrdColReflect101>, - linearColumnFilter_caller<23, T, D, BrdColReflect101>, - linearColumnFilter_caller<24, T, D, BrdColReflect101>, - linearColumnFilter_caller<25, T, D, BrdColReflect101>, - linearColumnFilter_caller<26, T, D, BrdColReflect101>, - linearColumnFilter_caller<27, T, D, BrdColReflect101>, - linearColumnFilter_caller<28, T, D, BrdColReflect101>, - linearColumnFilter_caller<29, T, D, BrdColReflect101>, - linearColumnFilter_caller<30, T, D, BrdColReflect101>, - linearColumnFilter_caller<31, T, D, BrdColReflect101>, - linearColumnFilter_caller<32, T, D, BrdColReflect101> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColReplicate>, - linearColumnFilter_caller< 2, T, D, BrdColReplicate>, - linearColumnFilter_caller< 3, T, D, BrdColReplicate>, - linearColumnFilter_caller< 4, T, D, BrdColReplicate>, - linearColumnFilter_caller< 5, T, D, BrdColReplicate>, - linearColumnFilter_caller< 6, T, D, BrdColReplicate>, - linearColumnFilter_caller< 7, T, D, BrdColReplicate>, - linearColumnFilter_caller< 8, T, D, BrdColReplicate>, - linearColumnFilter_caller< 9, T, D, BrdColReplicate>, - linearColumnFilter_caller<10, T, D, BrdColReplicate>, - linearColumnFilter_caller<11, T, D, BrdColReplicate>, - linearColumnFilter_caller<12, T, D, BrdColReplicate>, - linearColumnFilter_caller<13, T, D, BrdColReplicate>, - linearColumnFilter_caller<14, T, D, BrdColReplicate>, - linearColumnFilter_caller<15, T, D, BrdColReplicate>, - linearColumnFilter_caller<16, T, D, BrdColReplicate>, - linearColumnFilter_caller<17, T, D, BrdColReplicate>, - linearColumnFilter_caller<18, T, D, BrdColReplicate>, - linearColumnFilter_caller<19, T, D, BrdColReplicate>, - linearColumnFilter_caller<20, T, D, BrdColReplicate>, - linearColumnFilter_caller<21, T, D, BrdColReplicate>, - linearColumnFilter_caller<22, T, D, BrdColReplicate>, - linearColumnFilter_caller<23, T, D, BrdColReplicate>, - linearColumnFilter_caller<24, T, D, BrdColReplicate>, - linearColumnFilter_caller<25, T, D, BrdColReplicate>, - linearColumnFilter_caller<26, T, D, BrdColReplicate>, - linearColumnFilter_caller<27, T, D, BrdColReplicate>, - linearColumnFilter_caller<28, T, D, BrdColReplicate>, - linearColumnFilter_caller<29, T, D, BrdColReplicate>, - linearColumnFilter_caller<30, T, D, BrdColReplicate>, - linearColumnFilter_caller<31, T, D, BrdColReplicate>, - linearColumnFilter_caller<32, T, D, BrdColReplicate> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColConstant>, - linearColumnFilter_caller< 2, T, D, BrdColConstant>, - linearColumnFilter_caller< 3, T, D, BrdColConstant>, - linearColumnFilter_caller< 4, T, D, BrdColConstant>, - linearColumnFilter_caller< 5, T, D, BrdColConstant>, - linearColumnFilter_caller< 6, T, D, BrdColConstant>, - linearColumnFilter_caller< 7, T, D, BrdColConstant>, - linearColumnFilter_caller< 8, T, D, BrdColConstant>, - linearColumnFilter_caller< 9, T, D, BrdColConstant>, - linearColumnFilter_caller<10, T, D, BrdColConstant>, - linearColumnFilter_caller<11, T, D, BrdColConstant>, - linearColumnFilter_caller<12, T, D, BrdColConstant>, - linearColumnFilter_caller<13, T, D, BrdColConstant>, - linearColumnFilter_caller<14, T, D, BrdColConstant>, - linearColumnFilter_caller<15, T, D, BrdColConstant>, - linearColumnFilter_caller<16, T, D, BrdColConstant>, - linearColumnFilter_caller<17, T, D, BrdColConstant>, - linearColumnFilter_caller<18, T, D, BrdColConstant>, - linearColumnFilter_caller<19, T, D, BrdColConstant>, - linearColumnFilter_caller<20, T, D, BrdColConstant>, - linearColumnFilter_caller<21, T, D, BrdColConstant>, - linearColumnFilter_caller<22, T, D, BrdColConstant>, - linearColumnFilter_caller<23, T, D, BrdColConstant>, - linearColumnFilter_caller<24, T, D, BrdColConstant>, - linearColumnFilter_caller<25, T, D, BrdColConstant>, - linearColumnFilter_caller<26, T, D, BrdColConstant>, - linearColumnFilter_caller<27, T, D, BrdColConstant>, - linearColumnFilter_caller<28, T, D, BrdColConstant>, - linearColumnFilter_caller<29, T, D, BrdColConstant>, - linearColumnFilter_caller<30, T, D, BrdColConstant>, - linearColumnFilter_caller<31, T, D, BrdColConstant>, - linearColumnFilter_caller<32, T, D, BrdColConstant> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColReflect>, - linearColumnFilter_caller< 2, T, D, BrdColReflect>, - linearColumnFilter_caller< 3, T, D, BrdColReflect>, - linearColumnFilter_caller< 4, T, D, BrdColReflect>, - linearColumnFilter_caller< 5, T, D, BrdColReflect>, - linearColumnFilter_caller< 6, T, D, BrdColReflect>, - linearColumnFilter_caller< 7, T, D, BrdColReflect>, - linearColumnFilter_caller< 8, T, D, BrdColReflect>, - linearColumnFilter_caller< 9, T, D, BrdColReflect>, - linearColumnFilter_caller<10, T, D, BrdColReflect>, - linearColumnFilter_caller<11, T, D, BrdColReflect>, - linearColumnFilter_caller<12, T, D, BrdColReflect>, - linearColumnFilter_caller<13, T, D, BrdColReflect>, - linearColumnFilter_caller<14, T, D, BrdColReflect>, - linearColumnFilter_caller<15, T, D, BrdColReflect>, - linearColumnFilter_caller<16, T, D, BrdColReflect>, - linearColumnFilter_caller<17, T, D, BrdColReflect>, - linearColumnFilter_caller<18, T, D, BrdColReflect>, - linearColumnFilter_caller<19, T, D, BrdColReflect>, - linearColumnFilter_caller<20, T, D, BrdColReflect>, - linearColumnFilter_caller<21, T, D, BrdColReflect>, - linearColumnFilter_caller<22, T, D, BrdColReflect>, - linearColumnFilter_caller<23, T, D, BrdColReflect>, - linearColumnFilter_caller<24, T, D, BrdColReflect>, - linearColumnFilter_caller<25, T, D, BrdColReflect>, - linearColumnFilter_caller<26, T, D, BrdColReflect>, - linearColumnFilter_caller<27, T, D, BrdColReflect>, - linearColumnFilter_caller<28, T, D, BrdColReflect>, - linearColumnFilter_caller<29, T, D, BrdColReflect>, - linearColumnFilter_caller<30, T, D, BrdColReflect>, - linearColumnFilter_caller<31, T, D, BrdColReflect>, - linearColumnFilter_caller<32, T, D, BrdColReflect> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColWrap>, - linearColumnFilter_caller< 2, T, D, BrdColWrap>, - linearColumnFilter_caller< 3, T, D, BrdColWrap>, - linearColumnFilter_caller< 4, T, D, BrdColWrap>, - linearColumnFilter_caller< 5, T, D, BrdColWrap>, - linearColumnFilter_caller< 6, T, D, BrdColWrap>, - linearColumnFilter_caller< 7, T, D, BrdColWrap>, - linearColumnFilter_caller< 8, T, D, BrdColWrap>, - linearColumnFilter_caller< 9, T, D, BrdColWrap>, - linearColumnFilter_caller<10, T, D, BrdColWrap>, - linearColumnFilter_caller<11, T, D, BrdColWrap>, - linearColumnFilter_caller<12, T, D, BrdColWrap>, - linearColumnFilter_caller<13, T, D, BrdColWrap>, - linearColumnFilter_caller<14, T, D, BrdColWrap>, - linearColumnFilter_caller<15, T, D, BrdColWrap>, - linearColumnFilter_caller<16, T, D, BrdColWrap>, - linearColumnFilter_caller<17, T, D, BrdColWrap>, - linearColumnFilter_caller<18, T, D, BrdColWrap>, - linearColumnFilter_caller<19, T, D, BrdColWrap>, - linearColumnFilter_caller<20, T, D, BrdColWrap>, - linearColumnFilter_caller<21, T, D, BrdColWrap>, - linearColumnFilter_caller<22, T, D, BrdColWrap>, - linearColumnFilter_caller<23, T, D, BrdColWrap>, - linearColumnFilter_caller<24, T, D, BrdColWrap>, - linearColumnFilter_caller<25, T, D, BrdColWrap>, - linearColumnFilter_caller<26, T, D, BrdColWrap>, - linearColumnFilter_caller<27, T, D, BrdColWrap>, - linearColumnFilter_caller<28, T, D, BrdColWrap>, - linearColumnFilter_caller<29, T, D, BrdColWrap>, - linearColumnFilter_caller<30, T, D, BrdColWrap>, - linearColumnFilter_caller<31, T, D, BrdColWrap>, - linearColumnFilter_caller<32, T, D, BrdColWrap> - } - }; - - loadKernel(kernel, ksize, stream); - - callers[brd_type][ksize]((PtrStepSz)src, (PtrStepSz)dst, anchor, cc, stream); - } - - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - } // namespace column_filter -}}} // namespace cv { namespace gpu { namespace device - - -#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.h b/modules/gpu/src/cuda/column_filter.h new file mode 100644 index 0000000..dbcd09f --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.h @@ -0,0 +1,378 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/saturate_cast.hpp" +#include "opencv2/gpu/device/vec_math.hpp" +#include "opencv2/gpu/device/border_interpolate.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace +{ + #define MAX_KERNEL_SIZE 32 + + __constant__ float c_kernel[MAX_KERNEL_SIZE]; + + void loadKernel(const float* kernel, int ksize, cudaStream_t stream) + { + if (stream == 0) + cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); + else + cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); + } + + template + __global__ void linearColumnFilter(const PtrStepSz src, PtrStep dst, const int anchor, const B brd) + { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) + const int BLOCK_DIM_X = 16; + const int BLOCK_DIM_Y = 16; + const int PATCH_PER_BLOCK = 4; + const int HALO_SIZE = KSIZE <= 16 ? 1 : 2; + #else + const int BLOCK_DIM_X = 16; + const int BLOCK_DIM_Y = 8; + const int PATCH_PER_BLOCK = 2; + const int HALO_SIZE = 2; + #endif + + typedef typename TypeVec::cn>::vec_type sum_t; + + __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X]; + + const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; + + if (x >= src.cols) + return; + + const T* src_col = src.ptr() + x; + + const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; + + if (blockIdx.y > 0) + { + //Upper halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x)); + } + else + { + //Upper halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); + } + + if (blockIdx.y + 2 < gridDim.y) + { + //Main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + j * BLOCK_DIM_Y, x)); + + //Lower halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x)); + } + else + { + //Main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); + + //Lower halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + { + const int y = yStart + j * BLOCK_DIM_Y; + + if (y < src.rows) + { + sum_t sum = VecTraits::all(0); + + #pragma unroll + for (int k = 0; k < KSIZE; ++k) + sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; + + dst(y, x) = saturate_cast(sum); + } + } + } + + template class B> + void caller(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream) + { + int BLOCK_DIM_X; + int BLOCK_DIM_Y; + int PATCH_PER_BLOCK; + + if (cc >= 20) + { + BLOCK_DIM_X = 16; + BLOCK_DIM_Y = 16; + PATCH_PER_BLOCK = 4; + } + else + { + BLOCK_DIM_X = 16; + BLOCK_DIM_Y = 8; + PATCH_PER_BLOCK = 2; + } + + const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); + const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); + + B brd(src.rows); + + linearColumnFilter<<>>(src, dst, anchor, brd); + + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +namespace filter +{ + template + void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) + { + typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream); + + static const caller_t callers[5][33] = + { + { + 0, + ::caller< 1, T, D, BrdColReflect101>, + ::caller< 2, T, D, BrdColReflect101>, + ::caller< 3, T, D, BrdColReflect101>, + ::caller< 4, T, D, BrdColReflect101>, + ::caller< 5, T, D, BrdColReflect101>, + ::caller< 6, T, D, BrdColReflect101>, + ::caller< 7, T, D, BrdColReflect101>, + ::caller< 8, T, D, BrdColReflect101>, + ::caller< 9, T, D, BrdColReflect101>, + ::caller<10, T, D, BrdColReflect101>, + ::caller<11, T, D, BrdColReflect101>, + ::caller<12, T, D, BrdColReflect101>, + ::caller<13, T, D, BrdColReflect101>, + ::caller<14, T, D, BrdColReflect101>, + ::caller<15, T, D, BrdColReflect101>, + ::caller<16, T, D, BrdColReflect101>, + ::caller<17, T, D, BrdColReflect101>, + ::caller<18, T, D, BrdColReflect101>, + ::caller<19, T, D, BrdColReflect101>, + ::caller<20, T, D, BrdColReflect101>, + ::caller<21, T, D, BrdColReflect101>, + ::caller<22, T, D, BrdColReflect101>, + ::caller<23, T, D, BrdColReflect101>, + ::caller<24, T, D, BrdColReflect101>, + ::caller<25, T, D, BrdColReflect101>, + ::caller<26, T, D, BrdColReflect101>, + ::caller<27, T, D, BrdColReflect101>, + ::caller<28, T, D, BrdColReflect101>, + ::caller<29, T, D, BrdColReflect101>, + ::caller<30, T, D, BrdColReflect101>, + ::caller<31, T, D, BrdColReflect101>, + ::caller<32, T, D, BrdColReflect101> + }, + { + 0, + ::caller< 1, T, D, BrdColReplicate>, + ::caller< 2, T, D, BrdColReplicate>, + ::caller< 3, T, D, BrdColReplicate>, + ::caller< 4, T, D, BrdColReplicate>, + ::caller< 5, T, D, BrdColReplicate>, + ::caller< 6, T, D, BrdColReplicate>, + ::caller< 7, T, D, BrdColReplicate>, + ::caller< 8, T, D, BrdColReplicate>, + ::caller< 9, T, D, BrdColReplicate>, + ::caller<10, T, D, BrdColReplicate>, + ::caller<11, T, D, BrdColReplicate>, + ::caller<12, T, D, BrdColReplicate>, + ::caller<13, T, D, BrdColReplicate>, + ::caller<14, T, D, BrdColReplicate>, + ::caller<15, T, D, BrdColReplicate>, + ::caller<16, T, D, BrdColReplicate>, + ::caller<17, T, D, BrdColReplicate>, + ::caller<18, T, D, BrdColReplicate>, + ::caller<19, T, D, BrdColReplicate>, + ::caller<20, T, D, BrdColReplicate>, + ::caller<21, T, D, BrdColReplicate>, + ::caller<22, T, D, BrdColReplicate>, + ::caller<23, T, D, BrdColReplicate>, + ::caller<24, T, D, BrdColReplicate>, + ::caller<25, T, D, BrdColReplicate>, + ::caller<26, T, D, BrdColReplicate>, + ::caller<27, T, D, BrdColReplicate>, + ::caller<28, T, D, BrdColReplicate>, + ::caller<29, T, D, BrdColReplicate>, + ::caller<30, T, D, BrdColReplicate>, + ::caller<31, T, D, BrdColReplicate>, + ::caller<32, T, D, BrdColReplicate> + }, + { + 0, + ::caller< 1, T, D, BrdColConstant>, + ::caller< 2, T, D, BrdColConstant>, + ::caller< 3, T, D, BrdColConstant>, + ::caller< 4, T, D, BrdColConstant>, + ::caller< 5, T, D, BrdColConstant>, + ::caller< 6, T, D, BrdColConstant>, + ::caller< 7, T, D, BrdColConstant>, + ::caller< 8, T, D, BrdColConstant>, + ::caller< 9, T, D, BrdColConstant>, + ::caller<10, T, D, BrdColConstant>, + ::caller<11, T, D, BrdColConstant>, + ::caller<12, T, D, BrdColConstant>, + ::caller<13, T, D, BrdColConstant>, + ::caller<14, T, D, BrdColConstant>, + ::caller<15, T, D, BrdColConstant>, + ::caller<16, T, D, BrdColConstant>, + ::caller<17, T, D, BrdColConstant>, + ::caller<18, T, D, BrdColConstant>, + ::caller<19, T, D, BrdColConstant>, + ::caller<20, T, D, BrdColConstant>, + ::caller<21, T, D, BrdColConstant>, + ::caller<22, T, D, BrdColConstant>, + ::caller<23, T, D, BrdColConstant>, + ::caller<24, T, D, BrdColConstant>, + ::caller<25, T, D, BrdColConstant>, + ::caller<26, T, D, BrdColConstant>, + ::caller<27, T, D, BrdColConstant>, + ::caller<28, T, D, BrdColConstant>, + ::caller<29, T, D, BrdColConstant>, + ::caller<30, T, D, BrdColConstant>, + ::caller<31, T, D, BrdColConstant>, + ::caller<32, T, D, BrdColConstant> + }, + { + 0, + ::caller< 1, T, D, BrdColReflect>, + ::caller< 2, T, D, BrdColReflect>, + ::caller< 3, T, D, BrdColReflect>, + ::caller< 4, T, D, BrdColReflect>, + ::caller< 5, T, D, BrdColReflect>, + ::caller< 6, T, D, BrdColReflect>, + ::caller< 7, T, D, BrdColReflect>, + ::caller< 8, T, D, BrdColReflect>, + ::caller< 9, T, D, BrdColReflect>, + ::caller<10, T, D, BrdColReflect>, + ::caller<11, T, D, BrdColReflect>, + ::caller<12, T, D, BrdColReflect>, + ::caller<13, T, D, BrdColReflect>, + ::caller<14, T, D, BrdColReflect>, + ::caller<15, T, D, BrdColReflect>, + ::caller<16, T, D, BrdColReflect>, + ::caller<17, T, D, BrdColReflect>, + ::caller<18, T, D, BrdColReflect>, + ::caller<19, T, D, BrdColReflect>, + ::caller<20, T, D, BrdColReflect>, + ::caller<21, T, D, BrdColReflect>, + ::caller<22, T, D, BrdColReflect>, + ::caller<23, T, D, BrdColReflect>, + ::caller<24, T, D, BrdColReflect>, + ::caller<25, T, D, BrdColReflect>, + ::caller<26, T, D, BrdColReflect>, + ::caller<27, T, D, BrdColReflect>, + ::caller<28, T, D, BrdColReflect>, + ::caller<29, T, D, BrdColReflect>, + ::caller<30, T, D, BrdColReflect>, + ::caller<31, T, D, BrdColReflect>, + ::caller<32, T, D, BrdColReflect> + }, + { + 0, + ::caller< 1, T, D, BrdColWrap>, + ::caller< 2, T, D, BrdColWrap>, + ::caller< 3, T, D, BrdColWrap>, + ::caller< 4, T, D, BrdColWrap>, + ::caller< 5, T, D, BrdColWrap>, + ::caller< 6, T, D, BrdColWrap>, + ::caller< 7, T, D, BrdColWrap>, + ::caller< 8, T, D, BrdColWrap>, + ::caller< 9, T, D, BrdColWrap>, + ::caller<10, T, D, BrdColWrap>, + ::caller<11, T, D, BrdColWrap>, + ::caller<12, T, D, BrdColWrap>, + ::caller<13, T, D, BrdColWrap>, + ::caller<14, T, D, BrdColWrap>, + ::caller<15, T, D, BrdColWrap>, + ::caller<16, T, D, BrdColWrap>, + ::caller<17, T, D, BrdColWrap>, + ::caller<18, T, D, BrdColWrap>, + ::caller<19, T, D, BrdColWrap>, + ::caller<20, T, D, BrdColWrap>, + ::caller<21, T, D, BrdColWrap>, + ::caller<22, T, D, BrdColWrap>, + ::caller<23, T, D, BrdColWrap>, + ::caller<24, T, D, BrdColWrap>, + ::caller<25, T, D, BrdColWrap>, + ::caller<26, T, D, BrdColWrap>, + ::caller<27, T, D, BrdColWrap>, + ::caller<28, T, D, BrdColWrap>, + ::caller<29, T, D, BrdColWrap>, + ::caller<30, T, D, BrdColWrap>, + ::caller<31, T, D, BrdColWrap>, + ::caller<32, T, D, BrdColWrap> + } + }; + + ::loadKernel(kernel, ksize, stream); + + callers[brd_type][ksize]((PtrStepSz)src, (PtrStepSz)dst, anchor, cc, stream); + } +} diff --git a/modules/gpu/src/cuda/row_filter.0.cu b/modules/gpu/src/cuda/row_filter.0.cu new file mode 100644 index 0000000..a1a8f36 --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.0.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.1.cu b/modules/gpu/src/cuda/row_filter.1.cu new file mode 100644 index 0000000..ab2248e --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.1.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.2.cu b/modules/gpu/src/cuda/row_filter.2.cu new file mode 100644 index 0000000..5aa2e2b --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.2.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.3.cu b/modules/gpu/src/cuda/row_filter.3.cu new file mode 100644 index 0000000..9d131a9 --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.3.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.4.cu b/modules/gpu/src/cuda/row_filter.4.cu new file mode 100644 index 0000000..0aae534 --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.4.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.5.cu b/modules/gpu/src/cuda/row_filter.5.cu new file mode 100644 index 0000000..dd1f2be --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.5.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.6.cu b/modules/gpu/src/cuda/row_filter.6.cu new file mode 100644 index 0000000..548069d --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.6.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.7.cu b/modules/gpu/src/cuda/row_filter.7.cu new file mode 100644 index 0000000..8c5c09e --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.7.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.cu b/modules/gpu/src/cuda/row_filter.cu deleted file mode 100644 index 39fc53f..0000000 --- a/modules/gpu/src/cuda/row_filter.cu +++ /dev/null @@ -1,390 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if !defined CUDA_DISABLER - -#include "internal_shared.hpp" -#include "opencv2/gpu/device/saturate_cast.hpp" -#include "opencv2/gpu/device/vec_math.hpp" -#include "opencv2/gpu/device/limits.hpp" -#include "opencv2/gpu/device/border_interpolate.hpp" -#include "opencv2/gpu/device/static_check.hpp" - -namespace cv { namespace gpu { namespace device -{ - namespace row_filter - { - #define MAX_KERNEL_SIZE 32 - - __constant__ float c_kernel[MAX_KERNEL_SIZE]; - - void loadKernel(const float* kernel, int ksize, cudaStream_t stream) - { - if (stream == 0) - cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); - else - cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); - } - - template - __global__ void linearRowFilter(const PtrStepSz src, PtrStep dst, const int anchor, const B brd) - { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) - const int BLOCK_DIM_X = 32; - const int BLOCK_DIM_Y = 8; - const int PATCH_PER_BLOCK = 4; - const int HALO_SIZE = 1; - #else - const int BLOCK_DIM_X = 32; - const int BLOCK_DIM_Y = 4; - const int PATCH_PER_BLOCK = 4; - const int HALO_SIZE = 1; - #endif - - typedef typename TypeVec::cn>::vec_type sum_t; - - __shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X]; - - const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; - - if (y >= src.rows) - return; - - const T* src_row = src.ptr(y); - - const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x; - - if (blockIdx.x > 0) - { - //Load left halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]); - } - else - { - //Load left halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row)); - } - - if (blockIdx.x + 2 < gridDim.x) - { - //Load main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(src_row[xStart + j * BLOCK_DIM_X]); - - //Load right halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]); - } - else - { - //Load main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(brd.at_high(xStart + j * BLOCK_DIM_X, src_row)); - - //Load right halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row)); - } - - __syncthreads(); - - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - { - const int x = xStart + j * BLOCK_DIM_X; - - if (x < src.cols) - { - sum_t sum = VecTraits::all(0); - - #pragma unroll - for (int k = 0; k < KSIZE; ++k) - sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k]; - - dst(y, x) = saturate_cast(sum); - } - } - } - - template class B> - void linearRowFilter_caller(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream) - { - int BLOCK_DIM_X; - int BLOCK_DIM_Y; - int PATCH_PER_BLOCK; - - if (cc >= 20) - { - BLOCK_DIM_X = 32; - BLOCK_DIM_Y = 8; - PATCH_PER_BLOCK = 4; - } - else - { - BLOCK_DIM_X = 32; - BLOCK_DIM_Y = 4; - PATCH_PER_BLOCK = 4; - } - - const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); - const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y)); - - B brd(src.cols); - - linearRowFilter<<>>(src, dst, anchor, brd); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template - void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) - { - typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream); - - static const caller_t callers[5][33] = - { - { - 0, - linearRowFilter_caller< 1, T, D, BrdRowReflect101>, - linearRowFilter_caller< 2, T, D, BrdRowReflect101>, - linearRowFilter_caller< 3, T, D, BrdRowReflect101>, - linearRowFilter_caller< 4, T, D, BrdRowReflect101>, - linearRowFilter_caller< 5, T, D, BrdRowReflect101>, - linearRowFilter_caller< 6, T, D, BrdRowReflect101>, - linearRowFilter_caller< 7, T, D, BrdRowReflect101>, - linearRowFilter_caller< 8, T, D, BrdRowReflect101>, - linearRowFilter_caller< 9, T, D, BrdRowReflect101>, - linearRowFilter_caller<10, T, D, BrdRowReflect101>, - linearRowFilter_caller<11, T, D, BrdRowReflect101>, - linearRowFilter_caller<12, T, D, BrdRowReflect101>, - linearRowFilter_caller<13, T, D, BrdRowReflect101>, - linearRowFilter_caller<14, T, D, BrdRowReflect101>, - linearRowFilter_caller<15, T, D, BrdRowReflect101>, - linearRowFilter_caller<16, T, D, BrdRowReflect101>, - linearRowFilter_caller<17, T, D, BrdRowReflect101>, - linearRowFilter_caller<18, T, D, BrdRowReflect101>, - linearRowFilter_caller<19, T, D, BrdRowReflect101>, - linearRowFilter_caller<20, T, D, BrdRowReflect101>, - linearRowFilter_caller<21, T, D, BrdRowReflect101>, - linearRowFilter_caller<22, T, D, BrdRowReflect101>, - linearRowFilter_caller<23, T, D, BrdRowReflect101>, - linearRowFilter_caller<24, T, D, BrdRowReflect101>, - linearRowFilter_caller<25, T, D, BrdRowReflect101>, - linearRowFilter_caller<26, T, D, BrdRowReflect101>, - linearRowFilter_caller<27, T, D, BrdRowReflect101>, - linearRowFilter_caller<28, T, D, BrdRowReflect101>, - linearRowFilter_caller<29, T, D, BrdRowReflect101>, - linearRowFilter_caller<30, T, D, BrdRowReflect101>, - linearRowFilter_caller<31, T, D, BrdRowReflect101>, - linearRowFilter_caller<32, T, D, BrdRowReflect101> - }, - { - 0, - linearRowFilter_caller< 1, T, D, BrdRowReplicate>, - linearRowFilter_caller< 2, T, D, BrdRowReplicate>, - linearRowFilter_caller< 3, T, D, BrdRowReplicate>, - linearRowFilter_caller< 4, T, D, BrdRowReplicate>, - linearRowFilter_caller< 5, T, D, BrdRowReplicate>, - linearRowFilter_caller< 6, T, D, BrdRowReplicate>, - linearRowFilter_caller< 7, T, D, BrdRowReplicate>, - linearRowFilter_caller< 8, T, D, BrdRowReplicate>, - linearRowFilter_caller< 9, T, D, BrdRowReplicate>, - linearRowFilter_caller<10, T, D, BrdRowReplicate>, - linearRowFilter_caller<11, T, D, BrdRowReplicate>, - linearRowFilter_caller<12, T, D, BrdRowReplicate>, - linearRowFilter_caller<13, T, D, BrdRowReplicate>, - linearRowFilter_caller<14, T, D, BrdRowReplicate>, - linearRowFilter_caller<15, T, D, BrdRowReplicate>, - linearRowFilter_caller<16, T, D, BrdRowReplicate>, - linearRowFilter_caller<17, T, D, BrdRowReplicate>, - linearRowFilter_caller<18, T, D, BrdRowReplicate>, - linearRowFilter_caller<19, T, D, BrdRowReplicate>, - linearRowFilter_caller<20, T, D, BrdRowReplicate>, - linearRowFilter_caller<21, T, D, BrdRowReplicate>, - linearRowFilter_caller<22, T, D, BrdRowReplicate>, - linearRowFilter_caller<23, T, D, BrdRowReplicate>, - linearRowFilter_caller<24, T, D, BrdRowReplicate>, - linearRowFilter_caller<25, T, D, BrdRowReplicate>, - linearRowFilter_caller<26, T, D, BrdRowReplicate>, - linearRowFilter_caller<27, T, D, BrdRowReplicate>, - linearRowFilter_caller<28, T, D, BrdRowReplicate>, - linearRowFilter_caller<29, T, D, BrdRowReplicate>, - linearRowFilter_caller<30, T, D, BrdRowReplicate>, - linearRowFilter_caller<31, T, D, BrdRowReplicate>, - linearRowFilter_caller<32, T, D, BrdRowReplicate> - }, - { - 0, - linearRowFilter_caller< 1, T, D, BrdRowConstant>, - linearRowFilter_caller< 2, T, D, BrdRowConstant>, - linearRowFilter_caller< 3, T, D, BrdRowConstant>, - linearRowFilter_caller< 4, T, D, BrdRowConstant>, - linearRowFilter_caller< 5, T, D, BrdRowConstant>, - linearRowFilter_caller< 6, T, D, BrdRowConstant>, - linearRowFilter_caller< 7, T, D, BrdRowConstant>, - linearRowFilter_caller< 8, T, D, BrdRowConstant>, - linearRowFilter_caller< 9, T, D, BrdRowConstant>, - linearRowFilter_caller<10, T, D, BrdRowConstant>, - linearRowFilter_caller<11, T, D, BrdRowConstant>, - linearRowFilter_caller<12, T, D, BrdRowConstant>, - linearRowFilter_caller<13, T, D, BrdRowConstant>, - linearRowFilter_caller<14, T, D, BrdRowConstant>, - linearRowFilter_caller<15, T, D, BrdRowConstant>, - linearRowFilter_caller<16, T, D, BrdRowConstant>, - linearRowFilter_caller<17, T, D, BrdRowConstant>, - linearRowFilter_caller<18, T, D, BrdRowConstant>, - linearRowFilter_caller<19, T, D, BrdRowConstant>, - linearRowFilter_caller<20, T, D, BrdRowConstant>, - linearRowFilter_caller<21, T, D, BrdRowConstant>, - linearRowFilter_caller<22, T, D, BrdRowConstant>, - linearRowFilter_caller<23, T, D, BrdRowConstant>, - linearRowFilter_caller<24, T, D, BrdRowConstant>, - linearRowFilter_caller<25, T, D, BrdRowConstant>, - linearRowFilter_caller<26, T, D, BrdRowConstant>, - linearRowFilter_caller<27, T, D, BrdRowConstant>, - linearRowFilter_caller<28, T, D, BrdRowConstant>, - linearRowFilter_caller<29, T, D, BrdRowConstant>, - linearRowFilter_caller<30, T, D, BrdRowConstant>, - linearRowFilter_caller<31, T, D, BrdRowConstant>, - linearRowFilter_caller<32, T, D, BrdRowConstant> - }, - { - 0, - linearRowFilter_caller< 1, T, D, BrdRowReflect>, - linearRowFilter_caller< 2, T, D, BrdRowReflect>, - linearRowFilter_caller< 3, T, D, BrdRowReflect>, - linearRowFilter_caller< 4, T, D, BrdRowReflect>, - linearRowFilter_caller< 5, T, D, BrdRowReflect>, - linearRowFilter_caller< 6, T, D, BrdRowReflect>, - linearRowFilter_caller< 7, T, D, BrdRowReflect>, - linearRowFilter_caller< 8, T, D, BrdRowReflect>, - linearRowFilter_caller< 9, T, D, BrdRowReflect>, - linearRowFilter_caller<10, T, D, BrdRowReflect>, - linearRowFilter_caller<11, T, D, BrdRowReflect>, - linearRowFilter_caller<12, T, D, BrdRowReflect>, - linearRowFilter_caller<13, T, D, BrdRowReflect>, - linearRowFilter_caller<14, T, D, BrdRowReflect>, - linearRowFilter_caller<15, T, D, BrdRowReflect>, - linearRowFilter_caller<16, T, D, BrdRowReflect>, - linearRowFilter_caller<17, T, D, BrdRowReflect>, - linearRowFilter_caller<18, T, D, BrdRowReflect>, - linearRowFilter_caller<19, T, D, BrdRowReflect>, - linearRowFilter_caller<20, T, D, BrdRowReflect>, - linearRowFilter_caller<21, T, D, BrdRowReflect>, - linearRowFilter_caller<22, T, D, BrdRowReflect>, - linearRowFilter_caller<23, T, D, BrdRowReflect>, - linearRowFilter_caller<24, T, D, BrdRowReflect>, - linearRowFilter_caller<25, T, D, BrdRowReflect>, - linearRowFilter_caller<26, T, D, BrdRowReflect>, - linearRowFilter_caller<27, T, D, BrdRowReflect>, - linearRowFilter_caller<28, T, D, BrdRowReflect>, - linearRowFilter_caller<29, T, D, BrdRowReflect>, - linearRowFilter_caller<30, T, D, BrdRowReflect>, - linearRowFilter_caller<31, T, D, BrdRowReflect>, - linearRowFilter_caller<32, T, D, BrdRowReflect> - }, - { - 0, - linearRowFilter_caller< 1, T, D, BrdRowWrap>, - linearRowFilter_caller< 2, T, D, BrdRowWrap>, - linearRowFilter_caller< 3, T, D, BrdRowWrap>, - linearRowFilter_caller< 4, T, D, BrdRowWrap>, - linearRowFilter_caller< 5, T, D, BrdRowWrap>, - linearRowFilter_caller< 6, T, D, BrdRowWrap>, - linearRowFilter_caller< 7, T, D, BrdRowWrap>, - linearRowFilter_caller< 8, T, D, BrdRowWrap>, - linearRowFilter_caller< 9, T, D, BrdRowWrap>, - linearRowFilter_caller<10, T, D, BrdRowWrap>, - linearRowFilter_caller<11, T, D, BrdRowWrap>, - linearRowFilter_caller<12, T, D, BrdRowWrap>, - linearRowFilter_caller<13, T, D, BrdRowWrap>, - linearRowFilter_caller<14, T, D, BrdRowWrap>, - linearRowFilter_caller<15, T, D, BrdRowWrap>, - linearRowFilter_caller<16, T, D, BrdRowWrap>, - linearRowFilter_caller<17, T, D, BrdRowWrap>, - linearRowFilter_caller<18, T, D, BrdRowWrap>, - linearRowFilter_caller<19, T, D, BrdRowWrap>, - linearRowFilter_caller<20, T, D, BrdRowWrap>, - linearRowFilter_caller<21, T, D, BrdRowWrap>, - linearRowFilter_caller<22, T, D, BrdRowWrap>, - linearRowFilter_caller<23, T, D, BrdRowWrap>, - linearRowFilter_caller<24, T, D, BrdRowWrap>, - linearRowFilter_caller<25, T, D, BrdRowWrap>, - linearRowFilter_caller<26, T, D, BrdRowWrap>, - linearRowFilter_caller<27, T, D, BrdRowWrap>, - linearRowFilter_caller<28, T, D, BrdRowWrap>, - linearRowFilter_caller<29, T, D, BrdRowWrap>, - linearRowFilter_caller<30, T, D, BrdRowWrap>, - linearRowFilter_caller<31, T, D, BrdRowWrap>, - linearRowFilter_caller<32, T, D, BrdRowWrap> - } - }; - - loadKernel(kernel, ksize, stream); - - callers[brd_type][ksize]((PtrStepSz)src, (PtrStepSz)dst, anchor, cc, stream); - } - - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - } // namespace row_filter -}}} // namespace cv { namespace gpu { namespace device - - -#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.h b/modules/gpu/src/cuda/row_filter.h new file mode 100644 index 0000000..0da2dfe --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.h @@ -0,0 +1,377 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/saturate_cast.hpp" +#include "opencv2/gpu/device/vec_math.hpp" +#include "opencv2/gpu/device/border_interpolate.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace +{ + #define MAX_KERNEL_SIZE 32 + + __constant__ float c_kernel[MAX_KERNEL_SIZE]; + + void loadKernel(const float* kernel, int ksize, cudaStream_t stream) + { + if (stream == 0) + cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); + else + cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); + } + + template + __global__ void linearRowFilter(const PtrStepSz src, PtrStep dst, const int anchor, const B brd) + { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) + const int BLOCK_DIM_X = 32; + const int BLOCK_DIM_Y = 8; + const int PATCH_PER_BLOCK = 4; + const int HALO_SIZE = 1; + #else + const int BLOCK_DIM_X = 32; + const int BLOCK_DIM_Y = 4; + const int PATCH_PER_BLOCK = 4; + const int HALO_SIZE = 1; + #endif + + typedef typename TypeVec::cn>::vec_type sum_t; + + __shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X]; + + const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; + + if (y >= src.rows) + return; + + const T* src_row = src.ptr(y); + + const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x; + + if (blockIdx.x > 0) + { + //Load left halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]); + } + else + { + //Load left halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row)); + } + + if (blockIdx.x + 2 < gridDim.x) + { + //Load main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(src_row[xStart + j * BLOCK_DIM_X]); + + //Load right halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]); + } + else + { + //Load main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(brd.at_high(xStart + j * BLOCK_DIM_X, src_row)); + + //Load right halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row)); + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + { + const int x = xStart + j * BLOCK_DIM_X; + + if (x < src.cols) + { + sum_t sum = VecTraits::all(0); + + #pragma unroll + for (int k = 0; k < KSIZE; ++k) + sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k]; + + dst(y, x) = saturate_cast(sum); + } + } + } + + template class B> + void caller(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream) + { + int BLOCK_DIM_X; + int BLOCK_DIM_Y; + int PATCH_PER_BLOCK; + + if (cc >= 20) + { + BLOCK_DIM_X = 32; + BLOCK_DIM_Y = 8; + PATCH_PER_BLOCK = 4; + } + else + { + BLOCK_DIM_X = 32; + BLOCK_DIM_Y = 4; + PATCH_PER_BLOCK = 4; + } + + const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); + const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y)); + + B brd(src.cols); + + linearRowFilter<<>>(src, dst, anchor, brd); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +namespace filter +{ + template + void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) + { + typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream); + + static const caller_t callers[5][33] = + { + { + 0, + ::caller< 1, T, D, BrdRowReflect101>, + ::caller< 2, T, D, BrdRowReflect101>, + ::caller< 3, T, D, BrdRowReflect101>, + ::caller< 4, T, D, BrdRowReflect101>, + ::caller< 5, T, D, BrdRowReflect101>, + ::caller< 6, T, D, BrdRowReflect101>, + ::caller< 7, T, D, BrdRowReflect101>, + ::caller< 8, T, D, BrdRowReflect101>, + ::caller< 9, T, D, BrdRowReflect101>, + ::caller<10, T, D, BrdRowReflect101>, + ::caller<11, T, D, BrdRowReflect101>, + ::caller<12, T, D, BrdRowReflect101>, + ::caller<13, T, D, BrdRowReflect101>, + ::caller<14, T, D, BrdRowReflect101>, + ::caller<15, T, D, BrdRowReflect101>, + ::caller<16, T, D, BrdRowReflect101>, + ::caller<17, T, D, BrdRowReflect101>, + ::caller<18, T, D, BrdRowReflect101>, + ::caller<19, T, D, BrdRowReflect101>, + ::caller<20, T, D, BrdRowReflect101>, + ::caller<21, T, D, BrdRowReflect101>, + ::caller<22, T, D, BrdRowReflect101>, + ::caller<23, T, D, BrdRowReflect101>, + ::caller<24, T, D, BrdRowReflect101>, + ::caller<25, T, D, BrdRowReflect101>, + ::caller<26, T, D, BrdRowReflect101>, + ::caller<27, T, D, BrdRowReflect101>, + ::caller<28, T, D, BrdRowReflect101>, + ::caller<29, T, D, BrdRowReflect101>, + ::caller<30, T, D, BrdRowReflect101>, + ::caller<31, T, D, BrdRowReflect101>, + ::caller<32, T, D, BrdRowReflect101> + }, + { + 0, + ::caller< 1, T, D, BrdRowReplicate>, + ::caller< 2, T, D, BrdRowReplicate>, + ::caller< 3, T, D, BrdRowReplicate>, + ::caller< 4, T, D, BrdRowReplicate>, + ::caller< 5, T, D, BrdRowReplicate>, + ::caller< 6, T, D, BrdRowReplicate>, + ::caller< 7, T, D, BrdRowReplicate>, + ::caller< 8, T, D, BrdRowReplicate>, + ::caller< 9, T, D, BrdRowReplicate>, + ::caller<10, T, D, BrdRowReplicate>, + ::caller<11, T, D, BrdRowReplicate>, + ::caller<12, T, D, BrdRowReplicate>, + ::caller<13, T, D, BrdRowReplicate>, + ::caller<14, T, D, BrdRowReplicate>, + ::caller<15, T, D, BrdRowReplicate>, + ::caller<16, T, D, BrdRowReplicate>, + ::caller<17, T, D, BrdRowReplicate>, + ::caller<18, T, D, BrdRowReplicate>, + ::caller<19, T, D, BrdRowReplicate>, + ::caller<20, T, D, BrdRowReplicate>, + ::caller<21, T, D, BrdRowReplicate>, + ::caller<22, T, D, BrdRowReplicate>, + ::caller<23, T, D, BrdRowReplicate>, + ::caller<24, T, D, BrdRowReplicate>, + ::caller<25, T, D, BrdRowReplicate>, + ::caller<26, T, D, BrdRowReplicate>, + ::caller<27, T, D, BrdRowReplicate>, + ::caller<28, T, D, BrdRowReplicate>, + ::caller<29, T, D, BrdRowReplicate>, + ::caller<30, T, D, BrdRowReplicate>, + ::caller<31, T, D, BrdRowReplicate>, + ::caller<32, T, D, BrdRowReplicate> + }, + { + 0, + ::caller< 1, T, D, BrdRowConstant>, + ::caller< 2, T, D, BrdRowConstant>, + ::caller< 3, T, D, BrdRowConstant>, + ::caller< 4, T, D, BrdRowConstant>, + ::caller< 5, T, D, BrdRowConstant>, + ::caller< 6, T, D, BrdRowConstant>, + ::caller< 7, T, D, BrdRowConstant>, + ::caller< 8, T, D, BrdRowConstant>, + ::caller< 9, T, D, BrdRowConstant>, + ::caller<10, T, D, BrdRowConstant>, + ::caller<11, T, D, BrdRowConstant>, + ::caller<12, T, D, BrdRowConstant>, + ::caller<13, T, D, BrdRowConstant>, + ::caller<14, T, D, BrdRowConstant>, + ::caller<15, T, D, BrdRowConstant>, + ::caller<16, T, D, BrdRowConstant>, + ::caller<17, T, D, BrdRowConstant>, + ::caller<18, T, D, BrdRowConstant>, + ::caller<19, T, D, BrdRowConstant>, + ::caller<20, T, D, BrdRowConstant>, + ::caller<21, T, D, BrdRowConstant>, + ::caller<22, T, D, BrdRowConstant>, + ::caller<23, T, D, BrdRowConstant>, + ::caller<24, T, D, BrdRowConstant>, + ::caller<25, T, D, BrdRowConstant>, + ::caller<26, T, D, BrdRowConstant>, + ::caller<27, T, D, BrdRowConstant>, + ::caller<28, T, D, BrdRowConstant>, + ::caller<29, T, D, BrdRowConstant>, + ::caller<30, T, D, BrdRowConstant>, + ::caller<31, T, D, BrdRowConstant>, + ::caller<32, T, D, BrdRowConstant> + }, + { + 0, + ::caller< 1, T, D, BrdRowReflect>, + ::caller< 2, T, D, BrdRowReflect>, + ::caller< 3, T, D, BrdRowReflect>, + ::caller< 4, T, D, BrdRowReflect>, + ::caller< 5, T, D, BrdRowReflect>, + ::caller< 6, T, D, BrdRowReflect>, + ::caller< 7, T, D, BrdRowReflect>, + ::caller< 8, T, D, BrdRowReflect>, + ::caller< 9, T, D, BrdRowReflect>, + ::caller<10, T, D, BrdRowReflect>, + ::caller<11, T, D, BrdRowReflect>, + ::caller<12, T, D, BrdRowReflect>, + ::caller<13, T, D, BrdRowReflect>, + ::caller<14, T, D, BrdRowReflect>, + ::caller<15, T, D, BrdRowReflect>, + ::caller<16, T, D, BrdRowReflect>, + ::caller<17, T, D, BrdRowReflect>, + ::caller<18, T, D, BrdRowReflect>, + ::caller<19, T, D, BrdRowReflect>, + ::caller<20, T, D, BrdRowReflect>, + ::caller<21, T, D, BrdRowReflect>, + ::caller<22, T, D, BrdRowReflect>, + ::caller<23, T, D, BrdRowReflect>, + ::caller<24, T, D, BrdRowReflect>, + ::caller<25, T, D, BrdRowReflect>, + ::caller<26, T, D, BrdRowReflect>, + ::caller<27, T, D, BrdRowReflect>, + ::caller<28, T, D, BrdRowReflect>, + ::caller<29, T, D, BrdRowReflect>, + ::caller<30, T, D, BrdRowReflect>, + ::caller<31, T, D, BrdRowReflect>, + ::caller<32, T, D, BrdRowReflect> + }, + { + 0, + ::caller< 1, T, D, BrdRowWrap>, + ::caller< 2, T, D, BrdRowWrap>, + ::caller< 3, T, D, BrdRowWrap>, + ::caller< 4, T, D, BrdRowWrap>, + ::caller< 5, T, D, BrdRowWrap>, + ::caller< 6, T, D, BrdRowWrap>, + ::caller< 7, T, D, BrdRowWrap>, + ::caller< 8, T, D, BrdRowWrap>, + ::caller< 9, T, D, BrdRowWrap>, + ::caller<10, T, D, BrdRowWrap>, + ::caller<11, T, D, BrdRowWrap>, + ::caller<12, T, D, BrdRowWrap>, + ::caller<13, T, D, BrdRowWrap>, + ::caller<14, T, D, BrdRowWrap>, + ::caller<15, T, D, BrdRowWrap>, + ::caller<16, T, D, BrdRowWrap>, + ::caller<17, T, D, BrdRowWrap>, + ::caller<18, T, D, BrdRowWrap>, + ::caller<19, T, D, BrdRowWrap>, + ::caller<20, T, D, BrdRowWrap>, + ::caller<21, T, D, BrdRowWrap>, + ::caller<22, T, D, BrdRowWrap>, + ::caller<23, T, D, BrdRowWrap>, + ::caller<24, T, D, BrdRowWrap>, + ::caller<25, T, D, BrdRowWrap>, + ::caller<26, T, D, BrdRowWrap>, + ::caller<27, T, D, BrdRowWrap>, + ::caller<28, T, D, BrdRowWrap>, + ::caller<29, T, D, BrdRowWrap>, + ::caller<30, T, D, BrdRowWrap>, + ::caller<31, T, D, BrdRowWrap>, + ::caller<32, T, D, BrdRowWrap> + } + }; + + loadKernel(kernel, ksize, stream); + + callers[brd_type][ksize]((PtrStepSz)src, (PtrStepSz)dst, anchor, cc, stream); + } +} diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp index 77ed46e..6b7135a 100644 --- a/modules/gpu/src/filtering.cpp +++ b/modules/gpu/src/filtering.cpp @@ -830,20 +830,14 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke //////////////////////////////////////////////////////////////////////////////////////////////////// // Separable Linear Filter -namespace cv { namespace gpu { namespace device +namespace filter { - namespace row_filter - { - template - void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - } + template + void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - namespace column_filter - { - template - void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - } -}}} + template + void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} namespace { @@ -899,8 +893,6 @@ namespace Ptr cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType) { - using namespace ::cv::gpu::device::row_filter; - static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R}; if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4)) @@ -940,28 +932,28 @@ Ptr cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, switch (srcType) { case CV_8UC1: - func = linearRowFilter_gpu; + func = filter::linearRow; break; case CV_8UC3: - func = linearRowFilter_gpu; + func = filter::linearRow; break; case CV_8UC4: - func = linearRowFilter_gpu; + func = filter::linearRow; break; case CV_16SC3: - func = linearRowFilter_gpu; + func = filter::linearRow; break; case CV_32SC1: - func = linearRowFilter_gpu; + func = filter::linearRow; break; case CV_32FC1: - func = linearRowFilter_gpu; + func = filter::linearRow; break; case CV_32FC3: - func = linearRowFilter_gpu; + func = filter::linearRow; break; case CV_32FC4: - func = linearRowFilter_gpu; + func = filter::linearRow; break; } @@ -1020,8 +1012,6 @@ namespace Ptr cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType) { - using namespace ::cv::gpu::device::column_filter; - static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R}; if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4)) @@ -1061,28 +1051,28 @@ Ptr cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds switch (dstType) { case CV_8UC1: - func = linearColumnFilter_gpu; + func = filter::linearColumn; break; case CV_8UC3: - func = linearColumnFilter_gpu; + func = filter::linearColumn; break; case CV_8UC4: - func = linearColumnFilter_gpu; + func = filter::linearColumn; break; case CV_16SC3: - func = linearColumnFilter_gpu; + func = filter::linearColumn; break; case CV_32SC1: - func = linearColumnFilter_gpu; + func = filter::linearColumn; break; case CV_32FC1: - func = linearColumnFilter_gpu; + func = filter::linearColumn; break; case CV_32FC3: - func = linearColumnFilter_gpu; + func = filter::linearColumn; break; case CV_32FC4: - func = linearColumnFilter_gpu; + func = filter::linearColumn; break; } -- 2.7.4