fixed bug #2425 : Concurrent convolutions with streams

author Vladislav Vinogradov <vlad.vinogradov@itseez.com>

Wed, 10 Oct 2012 12:55:16 +0000 (16:55 +0400)

committer Vladislav Vinogradov <vlad.vinogradov@itseez.com>

Wed, 10 Oct 2012 12:55:16 +0000 (16:55 +0400)
author Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Wed, 10 Oct 2012 12:55:16 +0000 (16:55 +0400)
committer Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Wed, 10 Oct 2012 12:55:16 +0000 (16:55 +0400)
diff --git a/modules/gpu/perf/perf_video.cpp b/modules/gpu/perf/perf_video.cpp

index 1499841..f7a9deb 100644 (file)
--- a/modules/gpu/perf/perf_video.cpp
+++ b/modules/gpu/perf/perf_video.cpp
@@ -146,7 +146,7 @@ PERF_TEST_P(ImagePair, Video_CreateOpticalFlowNeedleMap,
          }\r
  \r
          GPU_SANITY_CHECK(d_vertex);\r
-        GPU_SANITY_CHECK(d_colors)\r
+        GPU_SANITY_CHECK(d_colors);\r
      }\r
      else\r
      {\r
diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu

index 7f5d247..21e28a8 100644 (file)
--- a/modules/gpu/src/cuda/column_filter.cu
+++ b/modules/gpu/src/cuda/column_filter.cu
@@ -58,9 +58,12 @@ namespace cv { namespace gpu { namespace device
  \r
          __constant__ float c_kernel[MAX_KERNEL_SIZE];\r
  \r
-        void loadKernel(const float kernel[], int ksize)\r
+        void loadKernel(const float* kernel, int ksize, cudaStream_t stream)\r
          {\r
-            cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );\r
+            if (stream == 0)\r
+                cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );\r
+            else\r
+                cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );\r
          }\r
  \r
          template <int KSIZE, typename T, typename D, typename B>\r
@@ -185,7 +188,7 @@ namespace cv { namespace gpu { namespace device
          }\r
  \r
          template <typename T, typename D>\r
-        void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)\r
+        void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)\r
          {\r
              typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);\r
  \r
@@ -368,18 +371,18 @@ namespace cv { namespace gpu { namespace device
                  }\r
              };\r
  \r
-            loadKernel(kernel, ksize);\r
+            loadKernel(kernel, ksize, stream);\r
  \r
              callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);\r
          }\r
  \r
-        template void linearColumnFilter_gpu<float , uchar >(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
-        template void linearColumnFilter_gpu<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
-        template void linearColumnFilter_gpu<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
-        template void linearColumnFilter_gpu<float , int   >(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
-        template void linearColumnFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
+        template void linearColumnFilter_gpu<float , uchar >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
+        template void linearColumnFilter_gpu<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
+        template void linearColumnFilter_gpu<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
+        template void linearColumnFilter_gpu<float , int   >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
+        template void linearColumnFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
      } // namespace column_filter\r
  }}} // namespace cv { namespace gpu { namespace device\r
  \r
  \r
-#endif /* CUDA_DISABLER */
-\ No newline at end of file
+#endif /* CUDA_DISABLER */\r
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu

index 9a75c52..7fff187 100644 (file)
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -986,7 +986,10 @@ namespace cv { namespace gpu { namespace device
                  Filter2DCaller<T, D, BrdWrap>::call\r
              };\r
  \r
-            cudaSafeCall(cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );\r
+            if (stream == 0)\r
+                cudaSafeCall( cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );\r
+            else\r
+                cudaSafeCall( cudaMemcpyToSymbolAsync(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );\r
  \r
              funcs[borderMode](static_cast< PtrStepSz<T> >(srcWhole), ofsX, ofsY, static_cast< PtrStepSz<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);\r
          }\r
@@ -1001,4 +1004,4 @@ namespace cv { namespace gpu { namespace device
  }}} // namespace cv { namespace gpu { namespace device {\r
  \r
  \r
-#endif /* CUDA_DISABLER */
-\ No newline at end of file
+#endif /* CUDA_DISABLER */\r
diff --git a/modules/gpu/src/cuda/row_filter.cu b/modules/gpu/src/cuda/row_filter.cu

index 7076f38..7e73959 100644 (file)
--- a/modules/gpu/src/cuda/row_filter.cu
+++ b/modules/gpu/src/cuda/row_filter.cu
@@ -58,9 +58,12 @@ namespace cv { namespace gpu { namespace device
  \r
          __constant__ float c_kernel[MAX_KERNEL_SIZE];\r
  \r
-        void loadKernel(const float kernel[], int ksize)\r
+        void loadKernel(const float* kernel, int ksize, cudaStream_t stream)\r
          {\r
-            cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );\r
+            if (stream == 0)\r
+                cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );\r
+            else\r
+                cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );\r
          }\r
  \r
          template <int KSIZE, typename T, typename D, typename B>\r
@@ -184,7 +187,7 @@ namespace cv { namespace gpu { namespace device
          }\r
  \r
          template <typename T, typename D>\r
-        void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)\r
+        void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)\r
          {\r
              typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);\r
  \r
@@ -367,18 +370,18 @@ namespace cv { namespace gpu { namespace device
                  }\r
              };\r
  \r
-            loadKernel(kernel, ksize);\r
+            loadKernel(kernel, ksize, stream);\r
  \r
              callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);\r
          }\r
  \r
-        template void linearRowFilter_gpu<uchar , float >(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
-        template void linearRowFilter_gpu<uchar4, float4>(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
-        template void linearRowFilter_gpu<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
-        template void linearRowFilter_gpu<int   , float >(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
-        template void linearRowFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
+        template void linearRowFilter_gpu<uchar , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
+        template void linearRowFilter_gpu<uchar4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
+        template void linearRowFilter_gpu<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
+        template void linearRowFilter_gpu<int   , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
+        template void linearRowFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
      } // namespace row_filter\r
  }}} // namespace cv { namespace gpu { namespace device\r
  \r
  \r
-#endif /* CUDA_DISABLER */
-\ No newline at end of file
+#endif /* CUDA_DISABLER */\r
diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp

index 82ccd85..fe4a68c 100644 (file)
--- a/modules/gpu/src/filtering.cpp
+++ b/modules/gpu/src/filtering.cpp
@@ -835,13 +835,13 @@ namespace cv { namespace gpu { namespace device
      namespace row_filter\r
      {\r
          template <typename T, typename D>\r
-        void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
+        void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
      }\r
  \r
      namespace column_filter\r
      {\r
          template <typename T, typename D>\r
-        void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
+        void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
      }\r
  }}}\r
  \r
@@ -881,7 +881,7 @@ namespace
  \r
      struct GpuLinearRowFilter : public BaseRowFilter_GPU\r
      {\r
-        GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) :\r
+        GpuLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, gpuFilter1D_t func_, int brd_type_) :\r
              BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}\r
  \r
          virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())\r
@@ -891,7 +891,7 @@ namespace
              func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));\r
          }\r
  \r
-        Mat kernel;\r
+        GpuMat kernel;\r
          gpuFilter1D_t func;\r
          int brd_type;\r
      };\r
@@ -926,11 +926,10 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
  \r
      CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(srcType) == CV_MAT_CN(bufType));\r
  \r
-    Mat temp(rowKernel.size(), CV_32FC1);\r
-    rowKernel.convertTo(temp, CV_32FC1);\r
-    Mat cont_krnl = temp.reshape(1, 1);\r
+    GpuMat gpu_row_krnl;\r
+    normalizeKernel(rowKernel, gpu_row_krnl, CV_32F);\r
  \r
-    int ksize = cont_krnl.cols;\r
+    int ksize = gpu_row_krnl.cols;\r
  \r
      CV_Assert(ksize > 0 && ksize <= 32);\r
  \r
@@ -957,7 +956,7 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
          break;\r
      }\r
  \r
-    return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, cont_krnl, func, gpuBorderType));\r
+    return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, gpu_row_krnl, func, gpuBorderType));\r
  }\r
  \r
  namespace\r
@@ -991,7 +990,7 @@ namespace
  \r
      struct GpuLinearColumnFilter : public BaseColumnFilter_GPU\r
      {\r
-        GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) :\r
+        GpuLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, gpuFilter1D_t func_, int brd_type_) :\r
              BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}\r
  \r
          virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())\r
@@ -1004,7 +1003,7 @@ namespace
              func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));\r
          }\r
  \r
-        Mat kernel;\r
+        GpuMat kernel;\r
          gpuFilter1D_t func;\r
          int brd_type;\r
      };\r
@@ -1039,11 +1038,10 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
  \r
      CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(dstType) == CV_MAT_CN(bufType));\r
  \r
-    Mat temp(columnKernel.size(), CV_32FC1);\r
-    columnKernel.convertTo(temp, CV_32FC1);\r
-    Mat cont_krnl = temp.reshape(1, 1);\r
+    GpuMat gpu_col_krnl;\r
+    normalizeKernel(columnKernel, gpu_col_krnl, CV_32F);\r
  \r
-    int ksize = cont_krnl.cols;\r
+    int ksize = gpu_col_krnl.cols;\r
  \r
      CV_Assert(ksize > 0 && ksize <= 32);\r
  \r
@@ -1070,7 +1068,7 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
          break;\r
      }\r
  \r
-    return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, cont_krnl, func, gpuBorderType));\r
+    return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, gpu_col_krnl, func, gpuBorderType));\r
  }\r
  \r
  Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel,\r
author	Vladislav Vinogradov <vlad.vinogradov@itseez.com>
	Wed, 10 Oct 2012 12:55:16 +0000 (16:55 +0400)
committer	Vladislav Vinogradov <vlad.vinogradov@itseez.com>
	Wed, 10 Oct 2012 12:55:16 +0000 (16:55 +0400)
modules/gpu/perf/perf_video.cpp		patch \| blob \| history
modules/gpu/src/cuda/column_filter.cu		patch \| blob \| history
modules/gpu/src/cuda/imgproc.cu		patch \| blob \| history
modules/gpu/src/cuda/row_filter.cu		patch \| blob \| history
modules/gpu/src/filtering.cpp		patch \| blob \| history