implemented asynchronous call for gpumat::setTo(), gpumat::copyTo(), gpumat::converTo()

author Andrey Morozov <no@email>

Mon, 26 Jul 2010 11:22:16 +0000 (11:22 +0000)

committer Andrey Morozov <no@email>

Mon, 26 Jul 2010 11:22:16 +0000 (11:22 +0000)
author Andrey Morozov <no@email>
Mon, 26 Jul 2010 11:22:16 +0000 (11:22 +0000)
committer Andrey Morozov <no@email>
Mon, 26 Jul 2010 11:22:16 +0000 (11:22 +0000)
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp

index c50351e..28544bc 100644 (file)
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -49,24 +49,24 @@
  namespace cv\r
  {\r
      namespace gpu\r
-    {   \r
+    {\r
          //////////////////////////////// Initialization ////////////////////////\r
-                 \r
+\r
          //! This is the only function that do not throw exceptions if the library is compiled without Cuda.\r
          CV_EXPORTS int getCudaEnabledDeviceCount();\r
  \r
          //! Functions below throw cv::Expception if the library is compiled without Cuda.\r
          CV_EXPORTS string getDeviceName(int device);\r
-        CV_EXPORTS void setDevice(int device);        \r
-        CV_EXPORTS int getDevice();    \r
+        CV_EXPORTS void setDevice(int device);\r
+        CV_EXPORTS int getDevice();\r
  \r
          CV_EXPORTS void getComputeCapability(int device, int* major, int* minor);\r
          CV_EXPORTS int getNumberOfSMs(int device);\r
- \r
+\r
          //////////////////////////////// GpuMat ////////////////////////////////\r
-        class CudaStrem;\r
+        class CudaStream;\r
  \r
-        //! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.        \r
+        //! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.\r
          class CV_EXPORTS GpuMat\r
          {\r
          public:\r
@@ -81,7 +81,7 @@ namespace cv
              GpuMat(Size _size, int _type, const Scalar& _s);\r
              //! copy constructor\r
              GpuMat(const GpuMat& m);\r
-            \r
+\r
              //! constructor for GpuMatrix headers pointing to user-allocated data\r
              GpuMat(int _rows, int _cols, int _type, void* _data, size_t _step = Mat::AUTO_STEP);\r
              GpuMat(Size _size, int _type, void* _data, size_t _step = Mat::AUTO_STEP);\r
@@ -89,7 +89,7 @@ namespace cv
              //! creates a matrix header for a part of the bigger matrix\r
              GpuMat(const GpuMat& m, const Range& rowRange, const Range& colRange);\r
              GpuMat(const GpuMat& m, const Rect& roi);\r
-                                    \r
+\r
              //! builds GpuMat from Mat. Perfom blocking upload to device.\r
              explicit GpuMat (const Mat& m);\r
  \r
@@ -99,7 +99,7 @@ namespace cv
              //! assignment operators\r
              GpuMat& operator = (const GpuMat& m);\r
              //! assignment operator. Perfom blocking upload to device.\r
-            GpuMat& operator = (const Mat& m);   \r
+            GpuMat& operator = (const Mat& m);\r
  \r
              //! returns lightweight DevMem2D_ structure for passing to nvcc-compiled code.\r
              // Contains just image size, data ptr and step.\r
@@ -110,7 +110,7 @@ namespace cv
  \r
              //! Downloads data from device to host memory. Blocking calls.\r
              operator Mat() const;\r
-            void download(cv::Mat& m) const;       \r
+            void download(cv::Mat& m) const;\r
  \r
              //! returns a new GpuMatrix header for the specified row\r
              GpuMat row(int y) const;\r
@@ -161,7 +161,7 @@ namespace cv
              //! extracts a rectangular sub-GpuMatrix\r
              // (this is a generalized form of row, rowRange etc.)\r
              GpuMat operator()( Range rowRange, Range colRange ) const;\r
-            GpuMat operator()( const Rect& roi ) const;  \r
+            GpuMat operator()( const Rect& roi ) const;\r
  \r
              //! returns true iff the GpuMatrix data is continuous\r
              // (i.e. when there are no gaps between successive rows).\r
@@ -222,33 +222,33 @@ namespace cv
          // Page locked memory is only needed for async and faster coping to GPU.\r
          // It is convertable to cv::Mat header without reference counting\r
          // so you can use it with other opencv functions.\r
-                \r
+\r
          class CV_EXPORTS MatPL\r
          {\r
-        public:      \r
+        public:\r
  \r
              //Not supported.  Now behaviour is like ALLOC_DEFAULT.\r
              //enum { ALLOC_DEFAULT = 0, ALLOC_PORTABLE = 1, ALLOC_WRITE_COMBINED = 4 }\r
  \r
-            MatPL();        \r
-            MatPL(const MatPL& m);       \r
+            MatPL();\r
+            MatPL(const MatPL& m);\r
  \r
              MatPL(int _rows, int _cols, int _type);\r
-            MatPL(Size _size, int _type);                                                                \r
+            MatPL(Size _size, int _type);\r
  \r
              //! creates from cv::Mat with coping data\r
              explicit MatPL(const Mat& m);\r
-                                                            \r
-            ~MatPL();            \r
+\r
+            ~MatPL();\r
  \r
              MatPL& operator = (const MatPL& m);\r
-                                    \r
+\r
              //! returns deep copy of the matrix, i.e. the data is copied\r
              MatPL clone() const;\r
-                                                                       \r
-            //! allocates new matrix data unless the matrix already has specified size and type.            \r
+\r
+            //! allocates new matrix data unless the matrix already has specified size and type.\r
              void create(int _rows, int _cols, int _type);\r
-            void create(Size _size, int _type);                        \r
+            void create(Size _size, int _type);\r
  \r
              //! decrements reference counter and released memory if needed.\r
              void release();\r
@@ -256,25 +256,25 @@ namespace cv
              //! returns matrix header with disabled reference counting for MatPL data.\r
              Mat createMatHeader() const;\r
              operator Mat() const;\r
-                                                            \r
+\r
              // Please see cv::Mat for descriptions\r
-            bool isContinuous() const;            \r
-            size_t elemSize() const;            \r
-            size_t elemSize1() const;            \r
-            int type() const;            \r
-            int depth() const;            \r
-            int channels() const;            \r
-            size_t step1() const;            \r
-            Size size() const;            \r
+            bool isContinuous() const;\r
+            size_t elemSize() const;\r
+            size_t elemSize1() const;\r
+            int type() const;\r
+            int depth() const;\r
+            int channels() const;\r
+            size_t step1() const;\r
+            Size size() const;\r
              bool empty() const;\r
-                        \r
+\r
              // Please see cv::Mat for descriptions\r
-            int flags;            \r
-            int rows, cols;            \r
+            int flags;\r
+            int rows, cols;\r
              size_t step;\r
  \r
-            uchar* data;            \r
-            int* refcount; \r
+            uchar* data;\r
+            int* refcount;\r
  \r
              uchar* datastart;\r
              uchar* dataend;\r
@@ -288,37 +288,37 @@ namespace cv
          class CV_EXPORTS CudaStream\r
          {\r
          public:\r
-            CudaStream(); \r
+            CudaStream();\r
              ~CudaStream();\r
  \r
-            CudaStream(const CudaStream&); \r
+            CudaStream(const CudaStream&);\r
              CudaStream& operator=(const CudaStream&);\r
  \r
              bool queryIfComplete();\r
-            void waitForCompletion();             \r
+            void waitForCompletion();\r
  \r
-            //! downloads asynchronously. \r
+            //! downloads asynchronously.\r
              // Warning! cv::Mat must point to page locked memory (i.e. to MatPL data or to its subMat)\r
              void enqueueDownload(const GpuMat& src, MatPL& dst);\r
              void enqueueDownload(const GpuMat& src, Mat& dst);\r
  \r
-            //! uploads asynchronously. \r
+            //! uploads asynchronously.\r
              // Warning! cv::Mat must point to page locked memory (i.e. to MatPL data or to its ROI)\r
-            void enqueueUpload(const MatPL& src, GpuMat& dst);            \r
+            void enqueueUpload(const MatPL& src, GpuMat& dst);\r
              void enqueueUpload(const Mat& src, GpuMat& dst);\r
  \r
              void enqueueCopy(const GpuMat& src, GpuMat& dst);\r
-            \r
-            void enqueueMemSet(const GpuMat& src, Scalar val);            \r
+\r
+            void enqueueMemSet(const GpuMat& src, Scalar val);\r
              void enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask);\r
  \r
              // converts matrix type, ex from float to uchar depending on type\r
-            void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0); \r
+            void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0);\r
          private:\r
              void create();\r
              void release();\r
              struct Impl;\r
-            Impl *impl;                                              \r
+            Impl *impl;\r
              friend struct StreamAccessor;\r
          };\r
  \r
@@ -348,7 +348,7 @@ namespace cv
              //! Acync version\r
              void operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, const CudaStream& stream);\r
  \r
-            //! Some heuristics that tries to estmate \r
+            //! Some heuristics that tries to estmate\r
              // if current GPU will be faster then CPU in this algorithm.\r
              // It queries current active device.\r
              static bool checkIfGpuCallReasonable();\r
@@ -356,11 +356,11 @@ namespace cv
              int ndisp;\r
              int winSize;\r
              int preset;\r
-            \r
+\r
              // If avergeTexThreshold  == 0 => post procesing is disabled\r
              // If avergeTexThreshold != 0 then disparity is set 0 in each point (x,y) where for left image\r
              // SumOfHorizontalGradiensInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold\r
-            // i.e. input left image is low textured.                       \r
+            // i.e. input left image is low textured.\r
              float avergeTexThreshold;\r
          private:\r
              GpuMat minSSD, leBuf, riBuf;\r
@@ -369,4 +369,4 @@ namespace cv
  }\r
  #include "opencv2/gpu/matrix_operations.hpp"\r
  \r
-#endif /* __OPENCV_GPU_HPP__ */
-\ No newline at end of file
+#endif /* __OPENCV_GPU_HPP__ */\r
diff --git a/modules/gpu/src/cuda/cuda_shared.hpp b/modules/gpu/src/cuda/cuda_shared.hpp

index 0b6a63b..fbec7cf 100644 (file)
--- a/modules/gpu/src/cuda/cuda_shared.hpp
+++ b/modules/gpu/src/cuda/cuda_shared.hpp
@@ -61,12 +61,12 @@ namespace cv
          {\r
              static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; }\r
  \r
-            extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels);\r
+            extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);\r
  \r
-            extern "C" void set_to_without_mask (const DevMem2D& mat, int depth, const double * scalar, int channels);\r
-            extern "C" void set_to_with_mask    (const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels);\r
+            extern "C" void set_to_without_mask (const DevMem2D& mat, int depth, const double * scalar, int channels, const cudaStream_t & stream = 0);\r
+            extern "C" void set_to_with_mask    (const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);\r
  \r
-            extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta);\r
+            extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream = 0);\r
          }\r
      }\r
  }\r
diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu

index 988cf7e..b3cb37d 100644 (file)
--- a/modules/gpu/src/cuda/matrix_operations.cu
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -42,7 +42,6 @@
  
  #include <stddef.h>
  #include <stdio.h>
-//#include <iostream>
  #include "cuda_shared.hpp"
  #include "cuda_runtime.h"
  
@@ -239,19 +238,27 @@ namespace cv
  ////////////////////////////////// CopyTo /////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////
  
-                        typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels);
+                        typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);
  
                          template<typename T>
-                        void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels)
+                        void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream)
                          {
                              dim3 threadsPerBlock(16,16, 1);
                              dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
-                            ::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock>>>
-                            ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
-                            cudaSafeCall ( cudaThreadSynchronize() );
+                            if (stream == 0)
+                            {
+                                ::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock>>>
+                                ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
+                                cudaSafeCall ( cudaThreadSynchronize() );
+                            }
+                            else
+                            {
+                                ::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
+                                ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
+                            }
                          }
  
-                        extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels)
+                        extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
                          {
                              static CopyToFunc tab[8] =
                              {
@@ -269,7 +276,7 @@ namespace cv
  
                              if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
  
-                            func(mat_src, mat_dst, mask, channels);
+                            func(mat_src, mat_dst, mask, channels, stream);
                          }
  
  
@@ -277,28 +284,43 @@ namespace cv
  ////////////////////////////////// SetTo //////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////
  
-                        typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels);
-                        typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels);
+                        typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream);
+                        typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream);
  
                          template <typename T>
-                        void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels)
+                        void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream)
                          {
                              dim3 threadsPerBlock(32, 8, 1);
                              dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
-                            ::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
-                            cudaSafeCall ( cudaThreadSynchronize() );
+                            if (stream == 0)
+                            {
+                                ::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
+                                cudaSafeCall ( cudaThreadSynchronize() );
+                            }
+                            else
+                            {
+                                ::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
+                            }
+
                          }
  
                          template <typename T>
-                        void set_to_without_mask_run(const DevMem2D& mat, int channels)
+                        void set_to_without_mask_run(const DevMem2D& mat, int channels, const cudaStream_t & stream)
                          {
                              dim3 threadsPerBlock(32, 8, 1);
                              dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
-                            ::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
-                            cudaSafeCall ( cudaThreadSynchronize() );
+                            if (stream == 0)
+                            {
+                                ::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
+                                cudaSafeCall ( cudaThreadSynchronize() );
+                            }
+                            else
+                            {
+                                ::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
+                            }
                          }
  
-                        extern "C" void set_to_without_mask(const DevMem2D& mat, int depth, const double * scalar, int channels)
+                        extern "C" void set_to_without_mask(const DevMem2D& mat, int depth, const double * scalar, int channels, const cudaStream_t & stream)
                          {
                              double data[4];
                              data[0] = scalar[0];
@@ -323,11 +345,11 @@ namespace cv
  
                              if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
  
-                            func(mat, channels);
+                            func(mat, channels, stream);
                          }
  
  
-                        extern "C" void set_to_with_mask(const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels)
+                        extern "C" void set_to_with_mask(const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream)
                          {
                              double data[4];
                              data[0] = scalar[0];
@@ -352,7 +374,7 @@ namespace cv
  
                              if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
  
-                            func(mat, mask, channels);
+                            func(mat, mask, channels, stream);
                          }
  
  
@@ -360,22 +382,27 @@ namespace cv
  //////////////////////////////// ConvertTo ////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////
  
-                                   typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta);
+                                   typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream);
  
                                     template<typename T, typename DT>
-                                   void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta)
+                                   void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
                                     {
                                         const int shift = ::mat_operators::ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::shift;
  
                                          dim3 block(32, 8);
                                          dim3 grid(divUp(width, block.x * shift), divUp(height, block.y));
-
-                                        ::mat_operators::kernel_convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
-
-                                       cudaSafeCall( cudaThreadSynchronize() );
-                                   }
-
-                                   extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta)
+                                        if (stream == 0)
+                                        {
+                                            ::mat_operators::kernel_convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
+                                            cudaSafeCall( cudaThreadSynchronize() );
+                                        }
+                                        else
+                                        {
+                                            ::mat_operators::kernel_convert_to<T, DT><<<grid, block, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
+                                        }
+                                    }
+
+                                   extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
                                     {
                                             static CvtFunc tab[8][8] =
                                             {
@@ -406,7 +433,7 @@ namespace cv
                                             CvtFunc func = tab[sdepth][ddepth];
                                             if (func == 0)
                                                 cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
-                                           func(src, dst, width, height, alpha, beta);
+                                           func(src, dst, width, height, alpha, beta, stream);
                                         }
                                 } // namespace impl
             } // namespace gpu
diff --git a/modules/gpu/src/cudastream.cpp b/modules/gpu/src/cudastream.cpp

index d17fdb0..8c5b69a 100644 (file)
--- a/modules/gpu/src/cudastream.cpp
+++ b/modules/gpu/src/cudastream.cpp
@@ -74,6 +74,7 @@ struct CudaStream::Impl
      cudaStream_t stream;\r
      int ref_counter;\r
  };\r
+\r
  namespace\r
  {\r
      template<class S, class D> void devcopy(const S& src, D& dst, cudaStream_t s, cudaMemcpyKind k)\r
@@ -147,7 +148,7 @@ void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, Mat& dst)
  {\r
      // if not -> allocation will be done, but after that dst will not point to page locked memory\r
      CV_Assert(src.cols == dst.cols && src.rows == dst.rows && src.type() == dst.type() )\r
-     devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost);\r
+    devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost);\r
  }\r
  void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, MatPL& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost); }\r
  \r
author	Andrey Morozov <no@email>
	Mon, 26 Jul 2010 11:22:16 +0000 (11:22 +0000)
committer	Andrey Morozov <no@email>
	Mon, 26 Jul 2010 11:22:16 +0000 (11:22 +0000)
modules/gpu/include/opencv2/gpu/gpu.hpp		patch \| blob \| history
modules/gpu/src/cuda/cuda_shared.hpp		patch \| blob \| history
modules/gpu/src/cuda/matrix_operations.cu		patch \| blob \| history
modules/gpu/src/cudastream.cpp		patch \| blob \| history