From: Vladislav Vinogradov <no@email>
Date: Wed, 15 Feb 2012 19:25:29 +0000 (+0000)
Subject: fixed compilation for old compute capabilities
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~5525
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ada6ab3778b1391192d84688caad6c388becf46c;p=platform%2Fupstream%2Fopencv.git

fixed compilation for old compute capabilities
---

diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu
index 36dd7bb..d00bec8 100644
--- a/modules/gpu/src/cuda/column_filter.cu
+++ b/modules/gpu/src/cuda/column_filter.cu
@@ -61,12 +61,20 @@ namespace cv { namespace gpu { namespace device
             cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
         }
 
-        template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int PATCH_PER_BLOCK, int HALO_SIZE, int KSIZE, typename T, typename D, typename B>
+        template <int KSIZE, typename T, typename D, typename B>
         __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd)
         {
-            Static<KSIZE <= MAX_KERNEL_SIZE>::check();
-            Static<HALO_SIZE * BLOCK_DIM_Y >= KSIZE>::check();
-            Static<VecTraits<T>::cn == VecTraits<D>::cn>::check();
+            #if __CUDA_ARCH__ >= 200
+                const int BLOCK_DIM_X = 16;
+                const int BLOCK_DIM_Y = 16;
+                const int PATCH_PER_BLOCK = 4;
+                const int HALO_SIZE = KSIZE <= 16 ? 1 : 2;
+            #else
+                const int BLOCK_DIM_X = 16;
+                const int BLOCK_DIM_Y = 8;
+                const int PATCH_PER_BLOCK = 2;
+                const int HALO_SIZE = 2;
+            #endif
 
             typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
 
@@ -103,32 +111,45 @@ namespace cv { namespace gpu { namespace device
             {
                 const int y = yStart + j * BLOCK_DIM_Y;
 
-                if (y >= src.rows)
-                    return;
-
-                sum_t sum = VecTraits<sum_t>::all(0);
+                if (y < src.rows)
+                {
+                    sum_t sum = VecTraits<sum_t>::all(0);
 
-                #pragma unroll
-                for (int k = 0; k < KSIZE; ++k)
-                    sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
+                    #pragma unroll
+                    for (int k = 0; k < KSIZE; ++k)
+                        sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
 
-                dst(y, x) = saturate_cast<D>(sum);
+                    dst(y, x) = saturate_cast<D>(sum);
+                }
             }
         }
 
         template <int KSIZE, typename T, typename D, template<typename> class B>
-        void linearColumnFilter_caller(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, cudaStream_t stream)
+        void linearColumnFilter_caller(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream)
         {
-            const int BLOCK_DIM_X = 16;
-            const int BLOCK_DIM_Y = 16;
-            const int PATCH_PER_BLOCK = 4;
+            int BLOCK_DIM_X;
+            int BLOCK_DIM_Y;
+            int PATCH_PER_BLOCK;
+
+            if (cc >= 20)
+            {
+                BLOCK_DIM_X = 16;
+                BLOCK_DIM_Y = 16;
+                PATCH_PER_BLOCK = 4;
+            }
+            else
+            {
+                BLOCK_DIM_X = 16;
+                BLOCK_DIM_Y = 8;
+                PATCH_PER_BLOCK = 2;
+            }
 
             const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
             const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK));
             
             B<T> brd(src.rows);
 
-            linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, PATCH_PER_BLOCK, KSIZE <= 16 ? 1 : 2, KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
+            linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
 
             cudaSafeCall( cudaGetLastError() );
 
@@ -137,9 +158,9 @@ namespace cv { namespace gpu { namespace device
         }
 
         template <typename T, typename D>
-        void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
+        void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
         {
-            typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, cudaStream_t stream);
+            typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream);
 
             static const caller_t callers[5][33] = 
             {
@@ -322,13 +343,13 @@ namespace cv { namespace gpu { namespace device
             
             loadKernel(kernel, ksize);
 
-            callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
+            callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, cc, stream);
         }
 
-        template void linearColumnFilter_gpu<float , uchar >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-        template void linearColumnFilter_gpu<float4, uchar4>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-        template void linearColumnFilter_gpu<float3, short3>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-        template void linearColumnFilter_gpu<float , int   >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-        template void linearColumnFilter_gpu<float , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float , uchar >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float4, uchar4>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float3, short3>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float , int   >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
     } // namespace column_filter
 }}} // namespace cv { namespace gpu { namespace device
diff --git a/modules/gpu/src/cuda/row_filter.cu b/modules/gpu/src/cuda/row_filter.cu
index b252b21..a5ec886 100644
--- a/modules/gpu/src/cuda/row_filter.cu
+++ b/modules/gpu/src/cuda/row_filter.cu
@@ -61,12 +61,20 @@ namespace cv { namespace gpu { namespace device
             cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
         }
 
-        template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int PATCH_PER_BLOCK, int HALO_SIZE, int KSIZE, typename T, typename D, typename B>
+        template <int KSIZE, typename T, typename D, typename B>
         __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd)
         {
-            Static<KSIZE <= MAX_KERNEL_SIZE>::check();
-            Static<HALO_SIZE * BLOCK_DIM_X >= KSIZE>::check();
-            Static<VecTraits<T>::cn == VecTraits<D>::cn>::check();
+            #if __CUDA_ARCH__ >= 200
+                const int BLOCK_DIM_X = 32;
+                const int BLOCK_DIM_Y = 8;
+                const int PATCH_PER_BLOCK = 4;
+                const int HALO_SIZE = 1;
+            #else
+                const int BLOCK_DIM_X = 32;
+                const int BLOCK_DIM_Y = 4;
+                const int PATCH_PER_BLOCK = 4;
+                const int HALO_SIZE = 1;
+            #endif
 
             typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
 
@@ -103,32 +111,45 @@ namespace cv { namespace gpu { namespace device
             {
                 const int x = xStart + j * BLOCK_DIM_X;
 
-                if (x >= src.cols)
-                    return;
-
-                sum_t sum = VecTraits<sum_t>::all(0);
+                if (x < src.cols)
+                {
+                    sum_t sum = VecTraits<sum_t>::all(0);
 
-                #pragma unroll
-                for (int k = 0; k < KSIZE; ++k)
-                    sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
+                    #pragma unroll
+                    for (int k = 0; k < KSIZE; ++k)
+                        sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
 
-                dst(y, x) = saturate_cast<D>(sum);
+                    dst(y, x) = saturate_cast<D>(sum);
+                }
             }
         }
 
         template <int KSIZE, typename T, typename D, template<typename> class B>
-        void linearRowFilter_caller(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, cudaStream_t stream)
+        void linearRowFilter_caller(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream)
         {
-            const int BLOCK_DIM_X = 32;
-            const int BLOCK_DIM_Y = 8;
-            const int PATCH_PER_BLOCK = 4;
+            int BLOCK_DIM_X;
+            int BLOCK_DIM_Y;
+            int PATCH_PER_BLOCK;
+
+            if (cc >= 20)
+            {
+                BLOCK_DIM_X = 32;
+                BLOCK_DIM_Y = 8;
+                PATCH_PER_BLOCK = 4;
+            }
+            else
+            {
+                BLOCK_DIM_X = 32;
+                BLOCK_DIM_Y = 4;
+                PATCH_PER_BLOCK = 4;
+            }
 
             const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
             const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y));
 
             B<T> brd(src.cols);
 
-            linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, PATCH_PER_BLOCK, 1, KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
+            linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
             cudaSafeCall( cudaGetLastError() );
 
             if (stream == 0)
@@ -136,9 +157,9 @@ namespace cv { namespace gpu { namespace device
         }
 
         template <typename T, typename D>
-        void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
+        void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
         {
-            typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, cudaStream_t stream);
+            typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream);
 
             static const caller_t callers[5][33] = 
             {
@@ -321,13 +342,13 @@ namespace cv { namespace gpu { namespace device
             
             loadKernel(kernel, ksize);
 
-            callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
+            callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, cc, stream);
         }
 
-        template void linearRowFilter_gpu<uchar , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-        template void linearRowFilter_gpu<uchar4, float4>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-        template void linearRowFilter_gpu<short3, float3>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-        template void linearRowFilter_gpu<int   , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-        template void linearRowFilter_gpu<float , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearRowFilter_gpu<uchar , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearRowFilter_gpu<uchar4, float4>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearRowFilter_gpu<short3, float3>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearRowFilter_gpu<int   , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+        template void linearRowFilter_gpu<float , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
     } // namespace row_filter
 }}} // namespace cv { namespace gpu { namespace device
diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp
index 45e2cd0..42a0a39 100644
--- a/modules/gpu/src/filtering.cpp
+++ b/modules/gpu/src/filtering.cpp
@@ -740,13 +740,13 @@ namespace cv { namespace gpu { namespace device
     namespace row_filter
     {
         template <typename T, typename D>
-        void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
     }
 
     namespace column_filter
     {
         template <typename T, typename D>
-        void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
     }
 }}}
 
@@ -755,7 +755,7 @@ namespace
     typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI, 
         const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);
 
-    typedef void (*gpuFilter1D_t)(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    typedef void (*gpuFilter1D_t)(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 
     struct NppLinearRowFilter : public BaseRowFilter_GPU
     {
@@ -791,7 +791,9 @@ namespace
 
         virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
         {
-            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, StreamAccessor::getStream(s));
+            DeviceInfo devInfo;
+            int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion();
+            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
         }
 
         Mat kernel;
@@ -899,7 +901,10 @@ namespace
 
         virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
         {
-            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, StreamAccessor::getStream(s));
+            DeviceInfo devInfo;
+            int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion();
+            CV_Assert(cc >= 20 || ksize <= 16);
+            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
         }
 
         Mat kernel;