cuda optflow tvl1 : async safety

author Namgoo Lee <namgoo.lee@cognex.com>

Tue, 16 Jun 2020 16:04:22 +0000 (01:04 +0900)

committer Namgoo Lee <namgoo.lee@cognex.com>

Tue, 16 Jun 2020 16:04:22 +0000 (01:04 +0900)
author Namgoo Lee <namgoo.lee@cognex.com>
Tue, 16 Jun 2020 16:04:22 +0000 (01:04 +0900)
committer Namgoo Lee <namgoo.lee@cognex.com>
Tue, 16 Jun 2020 16:04:22 +0000 (01:04 +0900)
diff --git a/modules/core/include/opencv2/core/cuda/common.hpp b/modules/core/include/opencv2/core/cuda/common.hpp

index 14b1f3f..80b2ff0 100644 (file)
--- a/modules/core/include/opencv2/core/cuda/common.hpp
+++ b/modules/core/include/opencv2/core/cuda/common.hpp
@@ -101,6 +101,20 @@ namespace cv { namespace cuda
              cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
              cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
          }
+
+        template<class T> inline void createTextureObjectPitch2D(cudaTextureObject_t* tex, PtrStepSz<T>& img, const cudaTextureDesc& texDesc)
+        {
+            cudaResourceDesc resDesc;
+            memset(&resDesc, 0, sizeof(resDesc));
+            resDesc.resType = cudaResourceTypePitch2D;
+            resDesc.res.pitch2D.devPtr = static_cast<void*>(img.ptr());
+            resDesc.res.pitch2D.height = img.rows;
+            resDesc.res.pitch2D.width = img.cols;
+            resDesc.res.pitch2D.pitchInBytes = img.step;
+            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
+
+            cudaSafeCall( cudaCreateTextureObject(tex, &resDesc, &texDesc, NULL) );
+        }
      }
  }}
  
diff --git a/modules/cudaimgproc/src/cuda/canny.cu b/modules/cudaimgproc/src/cuda/canny.cu

index 4418b8e..253287c 100644 (file)
--- a/modules/cudaimgproc/src/cuda/canny.cu
+++ b/modules/cudaimgproc/src/cuda/canny.cu
@@ -90,53 +90,47 @@ namespace cv { namespace cuda { namespace device
  
  namespace canny
  {
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
      struct SrcTex
      {
+        virtual ~SrcTex() {}
+
+        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
+
+        __device__ __forceinline__ virtual int operator ()(int y, int x) const = 0;
+
          int xoff;
          int yoff;
-        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
+    };
+
+    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
+    struct SrcTexRef : SrcTex
+    {
+        __host__ SrcTexRef(int _xoff, int _yoff) : SrcTex(_xoff, _yoff) {}
  
-        __device__ __forceinline__ int operator ()(int y, int x) const
+        __device__ __forceinline__ int operator ()(int y, int x) const override
          {
              return tex2D(tex_src, x + xoff, y + yoff);
          }
      };
  
-    struct SrcTexObject
+    struct SrcTexObj : SrcTex
      {
-        int xoff;
-        int yoff;
-        cudaTextureObject_t tex_src_object;
-        __host__ SrcTexObject(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : xoff(_xoff), yoff(_yoff), tex_src_object(_tex_src_object) { }
+        __host__ SrcTexObj(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : SrcTex(_xoff, _yoff), tex_src_object(_tex_src_object) { }
  
-        __device__ __forceinline__ int operator ()(int y, int x) const
+        __device__ __forceinline__ int operator ()(int y, int x) const override
          {
              return tex2D<uchar>(tex_src_object, x + xoff, y + yoff);
          }
  
+        cudaTextureObject_t tex_src_object;
      };
  
-    template <class Norm> __global__
-    void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (y >= mag.rows || x >= mag.cols)
-            return;
-
-        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
-        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
-
-        dx(y, x) = dxVal;
-        dy(y, x) = dyVal;
-
-        mag(y, x) = norm(dxVal, dyVal);
-    }
-
-    template <class Norm> __global__
-    void calcMagnitudeKernel(const SrcTexObject src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
+    template <
+        class T,
+        class Norm,
+        typename = std::enable_if_t<std::is_base_of<SrcTex, T>::value>
+    >
+    __global__ void calcMagnitudeKernel(const T src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
      {
          const int x = blockIdx.x * blockDim.x + threadIdx.x;
          const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -162,15 +156,6 @@ namespace canny
  
          if (cc30)
          {
-            cudaResourceDesc resDesc;
-            memset(&resDesc, 0, sizeof(resDesc));
-            resDesc.resType = cudaResourceTypePitch2D;
-            resDesc.res.pitch2D.devPtr = srcWhole.ptr();
-            resDesc.res.pitch2D.height = srcWhole.rows;
-            resDesc.res.pitch2D.width = srcWhole.cols;
-            resDesc.res.pitch2D.pitchInBytes = srcWhole.step;
-            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<uchar>();
-
              cudaTextureDesc texDesc;
              memset(&texDesc, 0, sizeof(texDesc));
              texDesc.addressMode[0] = cudaAddressModeClamp;
@@ -178,9 +163,9 @@ namespace canny
              texDesc.addressMode[2] = cudaAddressModeClamp;
  
              cudaTextureObject_t tex = 0;
-            cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
+            createTextureObjectPitch2D(&tex, srcWhole, texDesc);
  
-            SrcTexObject src(xoff, yoff, tex);
+            SrcTexObj src(xoff, yoff, tex);
  
              if (L2Grad)
              {
@@ -205,7 +190,7 @@ namespace canny
          else
          {
              bindTexture(&tex_src, srcWhole);
-            SrcTex src(xoff, yoff);
+            SrcTexRef src(xoff, yoff);
  
              if (L2Grad)
              {
diff --git a/modules/cudaimgproc/test/test_canny.cpp b/modules/cudaimgproc/test/test_canny.cpp

index a782a87..1b48e7d 100644 (file)
--- a/modules/cudaimgproc/test/test_canny.cpp
+++ b/modules/cudaimgproc/test/test_canny.cpp
@@ -116,7 +116,7 @@ protected:
      bool useL2gradient;
  };
  
-#define NUM_STREAMS 64
+#define NUM_STREAMS 128
  
  CUDA_TEST_P(Canny, Async)
  {
diff --git a/modules/cudaoptflow/src/cuda/tvl1flow.cu b/modules/cudaoptflow/src/cuda/tvl1flow.cu

index 66f0d66..2688e05 100644 (file)
--- a/modules/cudaoptflow/src/cuda/tvl1flow.cu
+++ b/modules/cudaoptflow/src/cuda/tvl1flow.cu
@@ -45,6 +45,7 @@
  #include "opencv2/core/cuda/common.hpp"
  #include "opencv2/core/cuda/border_interpolate.hpp"
  #include "opencv2/core/cuda/limits.hpp"
+#include "opencv2/core/cuda.hpp"
  
  using namespace cv::cuda;
  using namespace cv::cuda::device;
@@ -101,11 +102,64 @@ namespace tvl1flow
          }
      }
  
+    struct SrcTex
+    {
+        virtual ~SrcTex() {}
+
+        __device__ __forceinline__ virtual float I1(float x, float y) const = 0;
+        __device__ __forceinline__ virtual float I1x(float x, float y) const = 0;
+        __device__ __forceinline__ virtual float I1y(float x, float y) const = 0;
+    };
+
      texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
      texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
      texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
+    struct SrcTexRef : SrcTex
+    {
+        __device__ __forceinline__ float I1(float x, float y) const override
+        {
+            return tex2D(tex_I1, x, y);
+        }
+        __device__ __forceinline__ float I1x(float x, float y) const override
+        {
+            return tex2D(tex_I1x, x, y);
+        }
+        __device__ __forceinline__ float I1y(float x, float y) const override
+        {
+            return tex2D(tex_I1y, x, y);
+        }
+    };
+
+    struct SrcTexObj : SrcTex
+    {
+        __host__ SrcTexObj(cudaTextureObject_t tex_obj_I1_, cudaTextureObject_t tex_obj_I1x_, cudaTextureObject_t tex_obj_I1y_)
+            : tex_obj_I1(tex_obj_I1_), tex_obj_I1x(tex_obj_I1x_), tex_obj_I1y(tex_obj_I1y_) {}
+
+        __device__ __forceinline__ float I1(float x, float y) const override
+        {
+            return tex2D<float>(tex_obj_I1, x, y);
+        }
+        __device__ __forceinline__ float I1x(float x, float y) const override
+        {
+            return tex2D<float>(tex_obj_I1x, x, y);
+        }
+        __device__ __forceinline__ float I1y(float x, float y) const override
+        {
+            return tex2D<float>(tex_obj_I1y, x, y);
+        }
  
-    __global__ void warpBackwardKernel(const PtrStepSzf I0, const PtrStepf u1, const PtrStepf u2, PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
+        cudaTextureObject_t tex_obj_I1;
+        cudaTextureObject_t tex_obj_I1x;
+        cudaTextureObject_t tex_obj_I1y;
+    };
+
+    template <
+        typename T,
+        typename = std::enable_if_t<std::is_base_of<SrcTex, T>::value>
+    >
+    __global__ void warpBackwardKernel(
+        const PtrStepSzf I0, const T src, const PtrStepf u1, const PtrStepf u2,
+        PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
      {
          const int x = blockIdx.x * blockDim.x + threadIdx.x;
          const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -136,9 +190,9 @@ namespace tvl1flow
              {
                  const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
  
-                sum  += w * tex2D(tex_I1 , cx, cy);
-                sumx += w * tex2D(tex_I1x, cx, cy);
-                sumy += w * tex2D(tex_I1y, cx, cy);
+                sum  += w * src.I1(cx, cy);
+                sumx += w * src.I1x(cx, cy);
+                sumy += w * src.I1y(cx, cy);
  
                  wsum += w;
              }
@@ -173,15 +227,46 @@ namespace tvl1flow
          const dim3 block(32, 8);
          const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
  
-        bindTexture(&tex_I1 , I1);
-        bindTexture(&tex_I1x, I1x);
-        bindTexture(&tex_I1y, I1y);
+        bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
  
-        warpBackwardKernel<<<grid, block, 0, stream>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
-        cudaSafeCall( cudaGetLastError() );
+        if (cc30)
+        {
+            cudaTextureDesc texDesc;
+            memset(&texDesc, 0, sizeof(texDesc));
+            texDesc.addressMode[0] = cudaAddressModeClamp;
+            texDesc.addressMode[1] = cudaAddressModeClamp;
+            texDesc.addressMode[2] = cudaAddressModeClamp;
  
-        if (!stream)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            cudaTextureObject_t texObj_I1 = 0, texObj_I1x = 0, texObj_I1y = 0;
+
+            createTextureObjectPitch2D(&texObj_I1, I1, texDesc);
+            createTextureObjectPitch2D(&texObj_I1x, I1x, texDesc);
+            createTextureObjectPitch2D(&texObj_I1y, I1y, texDesc);
+
+            warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexObj(texObj_I1, texObj_I1x, texObj_I1y), u1, u2, I1w, I1wx, I1wy, grad, rho);
+            cudaSafeCall(cudaGetLastError());
+
+            if (!stream)
+                cudaSafeCall(cudaDeviceSynchronize());
+            else
+                cudaSafeCall(cudaStreamSynchronize(stream));
+
+            cudaSafeCall(cudaDestroyTextureObject(texObj_I1));
+            cudaSafeCall(cudaDestroyTextureObject(texObj_I1x));
+            cudaSafeCall(cudaDestroyTextureObject(texObj_I1y));
+        }
+        else
+        {
+            bindTexture(&tex_I1, I1);
+            bindTexture(&tex_I1x, I1x);
+            bindTexture(&tex_I1y, I1y);
+
+            warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexRef(), u1, u2, I1w, I1wx, I1wy, grad, rho);
+            cudaSafeCall(cudaGetLastError());
+
+            if (!stream)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
      }
  }
author	Namgoo Lee <namgoo.lee@cognex.com>
	Tue, 16 Jun 2020 16:04:22 +0000 (01:04 +0900)
committer	Namgoo Lee <namgoo.lee@cognex.com>
	Tue, 16 Jun 2020 16:04:22 +0000 (01:04 +0900)
modules/core/include/opencv2/core/cuda/common.hpp		patch \| blob \| history
modules/cudaimgproc/src/cuda/canny.cu		patch \| blob \| history
modules/cudaimgproc/test/test_canny.cpp		patch \| blob \| history
modules/cudaoptflow/src/cuda/tvl1flow.cu		patch \| blob \| history