cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );\r
}\r
\r
// Column-convolution kernel (diff hunk; '-' = removed, '+' = added lines).
// The change moves the launch geometry (BLOCK_DIM_X/Y, PATCH_PER_BLOCK,
// HALO_SIZE) from template parameters into per-architecture compile-time
// constants selected with __CUDA_ARCH__, so the kernel self-selects its
// tuning for Fermi+ (sm_20) vs. older devices. The host-side caller must
// launch with a matching block shape (see linearColumnFilter_caller).
- template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int PATCH_PER_BLOCK, int HALO_SIZE, int KSIZE, typename T, typename D, typename B>
+ template <int KSIZE, typename T, typename D, typename B>
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd)
{
// The Static<> compile-time checks are dropped along with the template
// parameters they validated. With the new constants, the old invariant
// HALO_SIZE * BLOCK_DIM_Y >= KSIZE still holds on sm_20+ (16 or 32 >= KSIZE
// for KSIZE up to 32), but on pre-Fermi (2 * 8 = 16) it only holds for
// KSIZE <= 16 — enforced at runtime on the host side instead.
- Static<KSIZE <= MAX_KERNEL_SIZE>::check();
- Static<HALO_SIZE * BLOCK_DIM_Y >= KSIZE>::check();
- Static<VecTraits<T>::cn == VecTraits<D>::cn>::check();
+ #if __CUDA_ARCH__ >= 200
+ const int BLOCK_DIM_X = 16;
+ const int BLOCK_DIM_Y = 16;
+ const int PATCH_PER_BLOCK = 4;
+ const int HALO_SIZE = KSIZE <= 16 ? 1 : 2;
+ #else
+ const int BLOCK_DIM_X = 16;
+ const int BLOCK_DIM_Y = 8;
+ const int PATCH_PER_BLOCK = 2;
+ const int HALO_SIZE = 2;
+ #endif

// Accumulate in float with the same channel count as the input type.
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;

// NOTE(review): the shared-memory tile declaration, x/yStart computation,
// halo loading and any __syncthreads() between them were elided from this
// fragment; the loop below presumably iterates j over PATCH_PER_BLOCK
// output rows per thread — confirm against the full kernel body.
{
const int y = yStart + j * BLOCK_DIM_Y;

// Early 'return' replaced by a guard so an out-of-range thread stays alive
// for the remainder of the per-patch loop (safe if a barrier or further
// iterations follow in the elided code — TODO confirm).
- if (y >= src.rows)
- return;
-
- sum_t sum = VecTraits<sum_t>::all(0);
+ if (y < src.rows)
+ {
+ sum_t sum = VecTraits<sum_t>::all(0);

// KSIZE-tap convolution down the column: taps come from the shared tile
// (offset by the halo rows), weights from constant memory c_kernel.
- #pragma unroll
- for (int k = 0; k < KSIZE; ++k)
- sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
+ #pragma unroll
+ for (int k = 0; k < KSIZE; ++k)
+ sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];

// Saturating cast from the float accumulator to the destination type.
- dst(y, x) = saturate_cast<D>(sum);
+ dst(y, x) = saturate_cast<D>(sum);
+ }
}
}
\r
// Host-side launcher for linearColumnFilter (diff hunk). Gains an 'int cc'
// parameter (compute capability, major*10 + minor) and picks the block
// geometry at runtime. These values must stay in sync with the
// __CUDA_ARCH__-selected constants inside the kernel, since the kernel no
// longer receives them as template arguments.
template <int KSIZE, typename T, typename D, template<typename> class B>
- void linearColumnFilter_caller(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, cudaStream_t stream)
+ void linearColumnFilter_caller(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream)
{
- const int BLOCK_DIM_X = 16;
- const int BLOCK_DIM_Y = 16;
- const int PATCH_PER_BLOCK = 4;
+ int BLOCK_DIM_X;
+ int BLOCK_DIM_Y;
+ int PATCH_PER_BLOCK;
+
+ if (cc >= 20)
+ {
+ // Fermi and newer: 16x16 block, 4 output rows per thread.
+ BLOCK_DIM_X = 16;
+ BLOCK_DIM_Y = 16;
+ PATCH_PER_BLOCK = 4;
+ }
+ else
+ {
+ // Pre-Fermi: smaller block / patch to fit resource limits.
+ BLOCK_DIM_X = 16;
+ BLOCK_DIM_Y = 8;
+ PATCH_PER_BLOCK = 2;
+ }

const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
// Each block covers BLOCK_DIM_Y * PATCH_PER_BLOCK rows of output.
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK));

// Border handler sized by row count (column filter reads along y).
B<T> brd(src.rows);

// Kernel now takes only KSIZE/type template arguments; geometry is chosen
// inside the kernel via __CUDA_ARCH__.
- linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, PATCH_PER_BLOCK, KSIZE <= 16 ? 1 : 2, KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
+ linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);

cudaSafeCall( cudaGetLastError() );

// NOTE(review): a synchronous-stream cudaDeviceSynchronize() call appears
// to have been elided from this fragment just above the closing brace.
}
\r
// Type-erased entry point (diff hunk): dispatches on border type and kernel
// size via a [5][33] table of caller function pointers, now threading the
// compute-capability 'cc' through to the caller.
template <typename T, typename D>
- void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
+ void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
{
- typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, cudaStream_t stream);
+ typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream);

// NOTE(review): the 5 (border types) x 33 (kernel sizes) initializer list
// was elided from this fragment.
static const caller_t callers[5][33] = 
{

// Copy the filter taps into constant memory before launching.
loadKernel(kernel, ksize);

- callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
+ callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, cc, stream);
}
\r
// Explicit instantiations regenerated for the new signature (added 'int cc'
// before the stream argument); the instantiated type pairs are unchanged.
- template void linearColumnFilter_gpu<float , uchar >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
- template void linearColumnFilter_gpu<float4, uchar4>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
- template void linearColumnFilter_gpu<float3, short3>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
- template void linearColumnFilter_gpu<float , int   >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
- template void linearColumnFilter_gpu<float , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+ template void linearColumnFilter_gpu<float , uchar >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+ template void linearColumnFilter_gpu<float4, uchar4>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+ template void linearColumnFilter_gpu<float3, short3>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+ template void linearColumnFilter_gpu<float , int   >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+ template void linearColumnFilter_gpu<float , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
} // namespace column_filter
}}} // namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );\r
}\r
\r
// Row-convolution kernel (diff hunk), mirroring the column-filter change:
// geometry template parameters become __CUDA_ARCH__-selected constants.
- template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int PATCH_PER_BLOCK, int HALO_SIZE, int KSIZE, typename T, typename D, typename B>
+ template <int KSIZE, typename T, typename D, typename B>
__global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd)
{
// Compile-time Static<> checks removed with the template parameters.
// HALO_SIZE = 1 with BLOCK_DIM_X = 32 gives a 32-column halo on both
// architectures, so HALO_SIZE * BLOCK_DIM_X >= KSIZE holds for KSIZE <= 32.
- Static<KSIZE <= MAX_KERNEL_SIZE>::check();
- Static<HALO_SIZE * BLOCK_DIM_X >= KSIZE>::check();
- Static<VecTraits<T>::cn == VecTraits<D>::cn>::check();
+ #if __CUDA_ARCH__ >= 200
+ const int BLOCK_DIM_X = 32;
+ const int BLOCK_DIM_Y = 8;
+ const int PATCH_PER_BLOCK = 4;
+ const int HALO_SIZE = 1;
+ #else
+ const int BLOCK_DIM_X = 32;
+ const int BLOCK_DIM_Y = 4;
+ const int PATCH_PER_BLOCK = 4;
+ const int HALO_SIZE = 1;
+ #endif

// Accumulate in float with the input's channel count.
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;

// NOTE(review): shared tile declaration, x/xStart/y setup and halo loads
// were elided from this fragment; the loop below presumably iterates j over
// PATCH_PER_BLOCK output columns per thread — confirm against full source.
{
const int x = xStart + j * BLOCK_DIM_X;

// Guard instead of early return so out-of-range threads remain for the
// rest of the per-patch loop (see column-filter counterpart).
- if (x >= src.cols)
- return;
-
- sum_t sum = VecTraits<sum_t>::all(0);
+ if (x < src.cols)
+ {
+ sum_t sum = VecTraits<sum_t>::all(0);

// KSIZE-tap convolution along the row from the shared tile, with the
// filter weights read from constant memory.
- #pragma unroll
- for (int k = 0; k < KSIZE; ++k)
- sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
+ #pragma unroll
+ for (int k = 0; k < KSIZE; ++k)
+ sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];

- dst(y, x) = saturate_cast<D>(sum);
+ dst(y, x) = saturate_cast<D>(sum);
+ }
}
}
\r
// Host-side launcher for linearRowFilter (diff hunk). Adds 'int cc'
// (compute capability, major*10 + minor) and selects the block geometry at
// runtime; values must match the kernel's __CUDA_ARCH__ constants.
template <int KSIZE, typename T, typename D, template<typename> class B>
- void linearRowFilter_caller(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, cudaStream_t stream)
+ void linearRowFilter_caller(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream)
{
- const int BLOCK_DIM_X = 32;
- const int BLOCK_DIM_Y = 8;
- const int PATCH_PER_BLOCK = 4;
+ int BLOCK_DIM_X;
+ int BLOCK_DIM_Y;
+ int PATCH_PER_BLOCK;
+
+ if (cc >= 20)
+ {
+ // Fermi and newer: 32x8 block.
+ BLOCK_DIM_X = 32;
+ BLOCK_DIM_Y = 8;
+ PATCH_PER_BLOCK = 4;
+ }
+ else
+ {
+ // Pre-Fermi: halve BLOCK_DIM_Y.
+ BLOCK_DIM_X = 32;
+ BLOCK_DIM_Y = 4;
+ PATCH_PER_BLOCK = 4;
+ }

const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
// Each block covers BLOCK_DIM_X * PATCH_PER_BLOCK columns of output.
const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y));

// Border handler sized by column count (row filter reads along x).
B<T> brd(src.cols);

- linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, PATCH_PER_BLOCK, 1, KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
+ linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
cudaSafeCall( cudaGetLastError() );

// NOTE(review): this line is truncated mid-statement in the fragment —
// presumably followed by cudaSafeCall( cudaDeviceSynchronize() ).
if (stream == 0)
}
\r
// Type-erased row-filter entry point (diff hunk): dispatch table indexed by
// border type and kernel size, now forwarding 'cc' to the caller.
template <typename T, typename D>
- void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
+ void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
{
- typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, cudaStream_t stream);
+ typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream);

// NOTE(review): the [5][33] initializer list was elided from this fragment.
static const caller_t callers[5][33] = 
{

// Copy the filter taps into constant memory before launching.
loadKernel(kernel, ksize);

- callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
+ callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, cc, stream);
}
\r
// Explicit instantiations regenerated for the new signature (added 'int cc'
// before the stream argument); the instantiated type pairs are unchanged.
- template void linearRowFilter_gpu<uchar , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
- template void linearRowFilter_gpu<uchar4, float4>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
- template void linearRowFilter_gpu<short3, float3>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
- template void linearRowFilter_gpu<int   , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
- template void linearRowFilter_gpu<float , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+ template void linearRowFilter_gpu<uchar , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+ template void linearRowFilter_gpu<uchar4, float4>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+ template void linearRowFilter_gpu<short3, float3>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+ template void linearRowFilter_gpu<int   , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+ template void linearRowFilter_gpu<float , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
} // namespace row_filter
}}} // namespace cv { namespace gpu { namespace device
// Host-side forward declarations (diff hunk, presumably in filters.cpp):
// both entry points now take the device compute capability 'cc'.
namespace row_filter
{
template <typename T, typename D>
- void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+ void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

namespace column_filter
{
template <typename T, typename D>
- void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+ void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
}}}
\r
// Function-pointer types for the two 1-D filter backends: the NPP library
// path (integer kernel + divisor) and the custom CUDA path, which now also
// receives the compute capability 'cc'.
typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI, 
    const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);

- typedef void (*gpuFilter1D_t)(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+ typedef void (*gpuFilter1D_t)(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
\r
// Filter functor fragment (diff hunk; struct members between the header and
// this operator() were elided). NOTE(review): the header names
// NppLinearRowFilter, but this operator() calls the gpuFilter1D-style 'func'
// with kernel.ptr<float>() — it may actually belong to a different
// (custom-CUDA) struct whose header was elided; confirm against full source.
struct NppLinearRowFilter : public BaseRowFilter_GPU
{

virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
{
- func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, StreamAccessor::getStream(s));
+ // Query the current device's compute capability on every call so the
+ // launcher can pick per-architecture kernel geometry.
+ DeviceInfo devInfo;
+ int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion();
+ func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
}

// Filter taps kept on the host; passed to 'func' as a float pointer.
Mat kernel;
\r
// Second filter functor fragment (enclosing struct header elided). Besides
// threading 'cc' through, it adds a runtime guard: on pre-Fermi devices
// (cc < 20) only kernel sizes up to 16 are supported — consistent with the
// pre-Fermi halo capacity (HALO_SIZE * BLOCK_DIM_Y = 2 * 8 = 16) in the
// column-filter kernel, which replaced the old compile-time Static<> check.
virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
{
- func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, StreamAccessor::getStream(s));
+ DeviceInfo devInfo;
+ int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion();
+ CV_Assert(cc >= 20 || ksize <= 16);
+ func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
}

// Filter taps kept on the host; passed to 'func' as a float pointer.
Mat kernel;