added WITH_CUFFT and WITH_CUBLAS flags to cmake scripts
author Vladislav Vinogradov <no@email>
Wed, 19 Oct 2011 09:53:22 +0000 (09:53 +0000)
committer Vladislav Vinogradov <no@email>
Wed, 19 Oct 2011 09:53:22 +0000 (09:53 +0000)
fixed gpu module error reporting
added asynchronous versions of some functions

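Note: the new switches follow the same pattern as the existing WITH_CUDA option and can be turned on when configuring OpenCV. A minimal configure invocation (the source path is a placeholder):

    cmake -D WITH_CUDA=ON -D WITH_CUFFT=ON -D WITH_CUBLAS=ON <path-to-opencv-source>
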
16 files changed:
CMakeLists.txt
cvconfig.h.cmake
modules/core/include/opencv2/core/types_c.h
modules/core/src/system.cpp
modules/gpu/CMakeLists.txt
modules/gpu/include/opencv2/gpu/gpu.hpp
modules/gpu/src/arithm.cpp
modules/gpu/src/cascadeclassifier.cpp
modules/gpu/src/cuda/imgproc.cu
modules/gpu/src/cuda/match_template.cu
modules/gpu/src/cuda/safe_call.hpp
modules/gpu/src/error.cpp
modules/gpu/src/imgproc.cpp
modules/gpu/src/match_template.cpp
modules/gpu/src/optical_flow.cpp
modules/gpu/src/precomp.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 628c23f..ff8f20c 100644
@@ -452,8 +452,12 @@ set(WITH_EIGEN ON CACHE BOOL "Include Eigen2/Eigen3 support")
 
 if( CMAKE_VERSION VERSION_GREATER "2.8")
     set(WITH_CUDA ON CACHE BOOL "Include NVidia Cuda Runtime support")
+    set(WITH_CUFFT ON CACHE BOOL "Include NVidia Cuda Fast Fourier Transform (FFT) library support")
+    set(WITH_CUBLAS OFF CACHE BOOL "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support")
 else()
     set(WITH_CUDA OFF CACHE BOOL "Include NVidia Cuda Runtime support")
+    set(WITH_CUFFT OFF CACHE BOOL "Include NVidia Cuda Fast Fourier Transform (FFT) library support")
+    set(WITH_CUBLAS OFF CACHE BOOL "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support")
 endif()
 
 set(WITH_OPENNI OFF CACHE BOOL "Include OpenNI support")
@@ -995,6 +999,15 @@ if(WITH_CUDA)
     
     if(CUDA_FOUND)
         set(HAVE_CUDA 1)
+
+        if(WITH_CUFFT)
+            set(HAVE_CUFFT 1)
+        endif()
+
+        if(WITH_CUBLAS)
+            set(HAVE_CUBLAS 1)
+        endif()
+
         message(STATUS "CUDA detected: " ${CUDA_VERSION})
 
         set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
diff --git a/cvconfig.h.cmake b/cvconfig.h.cmake
index 602c12d..36cf6bb 100644
 /* NVidia Cuda Runtime API*/
 #cmakedefine HAVE_CUDA
 
+/* NVidia Cuda Fast Fourier Transform (FFT) API*/
+#cmakedefine HAVE_CUFFT
+
+/* NVidia Cuda Basic Linear Algebra Subprograms (BLAS) API*/
+#cmakedefine HAVE_CUBLAS
+
 /* Compile for 'real' NVIDIA GPU architectures */
 #define CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN}"
 
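Once cvconfig.h defines these macros, gpu-module sources can guard CUFFT/CUBLAS-dependent code paths at compile time. A hedged sketch of such a guard (not the commit's actual implementation; the error text is illustrative):

    #include "cvconfig.h"

    void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stream& stream)
    {
    #ifndef HAVE_CUFFT
        (void)src; (void)dst; (void)dft_size; (void)flags; (void)stream;
        CV_Error(CV_StsNotImplemented, "The library was compiled without CUFFT support");
    #else
        // create a CUFFT plan for dft_size here and run it on the caller's stream
    #endif
    }
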
diff --git a/modules/core/include/opencv2/core/types_c.h b/modules/core/include/opencv2/core/types_c.h
index 11dbabf..e2284a8 100644
@@ -250,9 +250,7 @@ enum {
  CV_StsBadMemBlock=            -214, /* an allocated block has been corrupted */
  CV_StsAssert=                 -215, /* assertion failed */    
  CV_GpuNotSupported=           -216,  
- CV_GpuApiCallError=           -217, 
- CV_GpuNppCallError=           -218,
- CV_GpuCufftCallError=         -219
+ CV_GpuApiCallError=           -217
 };
 
 /****************************************************************************************\
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index 2922cda..7c9b805 100644
@@ -629,9 +629,8 @@ CV_IMPL const char* cvErrorStr( int status )
     case CV_StsNotImplemented :      return "The function/feature is not implemented";
     case CV_StsBadMemBlock :         return "Memory block has been corrupted";
     case CV_StsAssert :              return "Assertion failed";
-    case CV_GpuNotSupported : return "No GPU support";
-    case CV_GpuApiCallError : return "Gpu Api call";
-    case CV_GpuNppCallError : return "Npp Api call";
+    case CV_GpuNotSupported :        return "No GPU support";
+    case CV_GpuApiCallError :        return "Gpu Api call";
     };
 
     sprintf(buf, "Unknown %s code %d", status >= 0 ? "status":"error", status);
diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt
index 6c1a222..74ccc32 100644
@@ -120,12 +120,19 @@ set_target_properties(${the_target} PROPERTIES
 target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} )
 
 if (HAVE_CUDA)
-    target_link_libraries(${the_target} ${CUDA_LIBRARIES})    
-    CUDA_ADD_CUFFT_TO_TARGET(${the_target})
+    target_link_libraries(${the_target} ${CUDA_LIBRARIES})
 
     unset(CUDA_npp_LIBRARY CACHE)
     find_cuda_helper_libs(npp)
     target_link_libraries(${the_target} ${CUDA_npp_LIBRARY})
+
+    if(HAVE_CUFFT)
+        CUDA_ADD_CUFFT_TO_TARGET(${the_target})
+    endif()
+
+    if(HAVE_CUBLAS)
+        CUDA_ADD_CUBLAS_TO_TARGET(${the_target})
+    endif()
 endif()
 
 if(MSVC)
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 1b7efb9..ee9b734 100644
@@ -141,8 +141,8 @@ namespace cv
 \r
         //////////////////////////////// Error handling ////////////////////////\r
 \r
-        CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);\r
-        CV_EXPORTS void nppError( int err, const char *file, const int line, const char *func);\r
+        //CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);\r
+        //CV_EXPORTS void nppError( int err, const char *file, const int line, const char *func);\r
 \r
         //////////////////////////////// CudaMem ////////////////////////////////\r
         // CudaMem is limited cv::Mat with page locked memory allocation.\r
@@ -628,11 +628,11 @@ namespace cv
 \r
         //! Does mean shift filtering on GPU.\r
         CV_EXPORTS void meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,\r
-            TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));\r
+            TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream = Stream::Null());\r
 \r
         //! Does mean shift procedure on GPU.\r
         CV_EXPORTS void meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr,\r
-            TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));\r
+            TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream = Stream::Null());\r
 \r
         //! Does mean shift segmentation with elimination of small regions.\r
         CV_EXPORTS void meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize,\r
@@ -683,10 +683,12 @@ namespace cv
         //! rotate 8bit single or four channel image\r
         //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC\r
         //! supports CV_8UC1, CV_8UC4 types\r
-        CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());\r
+        CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, \r
+            int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());\r
 \r
         //! copies 2D array to a larger destination array and pads borders with user-specifiable constant\r
-        CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value = Scalar(), Stream& stream = Stream::Null());\r
+        CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, \r
+            const Scalar& value = Scalar(), Stream& stream = Stream::Null());\r
 \r
         //! computes the integral image\r
         //! sum will have CV_32S type, but will contain unsigned int values\r
@@ -715,21 +717,26 @@ namespace cv
         CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null());\r
 \r
         //! computes Harris cornerness criteria at each image pixel\r
-        CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101);\r
-        CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101);\r
+        CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, \r
+            int borderType = BORDER_REFLECT101);\r
+        CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, \r
+            int borderType = BORDER_REFLECT101);\r
+        CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, \r
+            int borderType = BORDER_REFLECT101, Stream& stream = Stream::Null());\r
 \r
         //! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria\r
         CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101);\r
         CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType=BORDER_REFLECT101);\r
+        CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, \r
+            int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null());\r
 \r
         //! performs per-element multiplication of two full (not packed) Fourier spectrums\r
         //! supports 32FC2 matrixes only (interleaved format)\r
-        CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false);\r
+        CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream = Stream::Null());\r
 \r
         //! performs per-element multiplication of two full (not packed) Fourier spectrums\r
         //! supports 32FC2 matrixes only (interleaved format)\r
-        CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, \r
-                                             float scale, bool conjB=false);\r
+        CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null());\r
 \r
         //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.\r
         //! Param dft_size is the size of DFT transform.\r
@@ -742,19 +749,14 @@ namespace cv
         //! in CUFFT's format. Result as full complex matrix for such kind of transform cannot be retrieved.\r
         //!\r
         //! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format.\r
-        CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0);\r
+        CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());\r
 \r
         //! computes convolution (or cross-correlation) of two images using discrete Fourier transform\r
         //! supports source images of 32FC1 type only\r
         //! result matrix will have 32FC1 type\r
-        CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, \r
-                                 bool ccorr=false);\r
-\r
         struct CV_EXPORTS ConvolveBuf;\r
-\r
-        //! buffered version\r
-        CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, \r
-                                 bool ccorr, ConvolveBuf& buf);\r
+        CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false);\r
+        CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null());\r
 \r
         struct CV_EXPORTS ConvolveBuf\r
         {\r
@@ -766,7 +768,7 @@ namespace cv
 \r
         private:\r
             static Size estimateBlockSize(Size result_size, Size templ_size);\r
-            friend void convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&);\r
+            friend void convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream& stream);\r
 \r
             Size result_size;\r
             Size block_size;\r
@@ -778,7 +780,7 @@ namespace cv
         };\r
 \r
         //! computes the proximity map for the raster template and the image where the template is searched for\r
-        CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method);\r
+        CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream = Stream::Null());\r
 \r
         //! smoothes the source image and downsamples it\r
         CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());\r
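With the Stream parameters added above, these calls can now be enqueued asynchronously. A minimal usage sketch (variable names and the matching method are illustrative, not from this commit):

    cv::gpu::Stream stream;
    cv::gpu::GpuMat image, templ, result;        // assumed to be uploaded already

    // enqueue the work on the stream; the call returns without waiting for the GPU
    cv::gpu::matchTemplate(image, templ, result, CV_TM_CCORR, stream);

    // overlap CPU work here, then block until the GPU has finished
    stream.waitForCompletion();
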
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index 67da283..ec938fe 100644
@@ -93,7 +93,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
         sz.width  = src.cols;\r
         sz.height = src.rows;\r
 \r
-        nppSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), static_cast<int>(src.step), \r
+        ncvSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), static_cast<int>(src.step), \r
             dst.ptr<Ncv32u>(), static_cast<int>(dst.step), sz) );\r
     }\r
     else // if (src.elemSize() == 8)\r
@@ -104,7 +104,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
         sz.width  = src.cols;\r
         sz.height = src.rows;\r
 \r
-        nppSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), static_cast<int>(src.step), \r
+        ncvSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), static_cast<int>(src.step), \r
             dst.ptr<Ncv64u>(), static_cast<int>(dst.step), sz) );              \r
     }\r
 \r
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index 984b33f..6bef7fb 100644
@@ -66,10 +66,7 @@ struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
     CascadeClassifierImpl(const string& filename) : lastAllocatedFrameSize(-1, -1)\r
     {\r
         ncvSetDebugOutputHandler(NCVDebugOutputHandler);\r
-        if (ncvStat != load(filename))\r
-        {\r
-            CV_Error(CV_GpuApiCallError, "Error in GPU cacade load");\r
-        }\r
+        ncvSafeCall( load(filename) );\r
     }\r
 \r
 \r
@@ -287,11 +284,7 @@ int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMa
     }\r
 \r
     unsigned int numDetections;\r
-    NCVStatus ncvStat = impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections);\r
-    if (ncvStat != NCV_SUCCESS)\r
-    {\r
-        CV_Error(CV_GpuApiCallError, "Error in face detectioln");\r
-    }\r
+    ncvSafeCall( impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections) );\r
 \r
     return numDetections;\r
 }\r
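The explicit NCVStatus checks are replaced by ncvSafeCall, defined in modules/gpu/src/cuda/safe_call.hpp (also touched by this commit). A hedged sketch of the usual shape of such a wrapper; the actual macro may differ:

    // illustrative only -- the real definition lives in safe_call.hpp
    #define ncvSafeCall(expr)                                            \
        do {                                                             \
            NCVStatus err = (expr);                                      \
            if (err != NCV_SUCCESS)                                      \
                CV_Error(CV_GpuApiCallError, "NCV API call error");      \
        } while (0)
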
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
index 4d54895..8149945 100644
@@ -120,8 +120,7 @@ namespace cv { namespace gpu { namespace imgproc
         return make_short2((short)x0, (short)y0);\r
     }\r
 \r
-    extern "C" __global__ void meanshift_kernel( unsigned char* out, size_t out_step, int cols, int rows, \r
-                                                 int sp, int sr, int maxIter, float eps )\r
+    __global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )\r
     {\r
         int x0 = blockIdx.x * blockDim.x + threadIdx.x;\r
         int y0 = blockIdx.y * blockDim.y + threadIdx.y;\r
@@ -130,10 +129,10 @@ namespace cv { namespace gpu { namespace imgproc
             do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);\r
     }\r
 \r
-    extern "C" __global__ void meanshiftproc_kernel( unsigned char* outr, size_t outrstep, \r
-                                                 unsigned char* outsp, size_t outspstep, \r
-                                                 int cols, int rows, \r
-                                                 int sp, int sr, int maxIter, float eps )\r
+    __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep, \r
+                                         unsigned char* outsp, size_t outspstep, \r
+                                         int cols, int rows, \r
+                                         int sp, int sr, int maxIter, float eps)\r
     {\r
         int x0 = blockIdx.x * blockDim.x + threadIdx.x;\r
         int y0 = blockIdx.y * blockDim.y + threadIdx.y;\r
@@ -145,7 +144,7 @@ namespace cv { namespace gpu { namespace imgproc
         }\r
     }\r
 \r
-    extern "C" void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps)\r
+    void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)\r
     {\r
         dim3 grid(1, 1, 1);\r
         dim3 threads(32, 8, 1);\r
@@ -155,13 +154,16 @@ namespace cv { namespace gpu { namespace imgproc
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();\r
         cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );\r
 \r
-        meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );\r
+        meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );\r
         cudaSafeCall( cudaGetLastError() );\r
 \r
-        cudaSafeCall( cudaDeviceSynchronize() );\r
-        cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );        \r
+        if (stream == 0)\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
+\r
+        //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );        \r
     }\r
-    extern "C" void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps) \r
+\r
+    void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream) \r
     {\r
         dim3 grid(1, 1, 1);\r
         dim3 threads(32, 8, 1);\r
@@ -171,11 +173,13 @@ namespace cv { namespace gpu { namespace imgproc
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();\r
         cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );\r
 \r
-        meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );\r
+        meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );\r
         cudaSafeCall( cudaGetLastError() );\r
 \r
-        cudaSafeCall( cudaDeviceSynchronize() );\r
-        cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );        \r
+        if (stream == 0)\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
+\r
+        //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );        \r
     }\r
 \r
 /////////////////////////////////// drawColorDisp ///////////////////////////////////////////////\r
@@ -389,15 +393,16 @@ namespace cv { namespace gpu { namespace imgproc
         }\r
     }\r
 \r
-    void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst)\r
+    void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream)\r
     {\r
         dim3 threads(32, 8);\r
         dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y));\r
 \r
-        extractCovData_kernel<<<grid, threads>>>(Dx.cols, Dx.rows, Dx, Dy, dst);\r
+        extractCovData_kernel<<<grid, threads, 0, stream>>>(Dx.cols, Dx.rows, Dx, Dy, dst);\r
         cudaSafeCall( cudaGetLastError() );\r
 \r
-        cudaSafeCall( cudaDeviceSynchronize() );\r
+        if (stream == 0)\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
     }\r
 \r
 /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////\r
@@ -475,7 +480,7 @@ namespace cv { namespace gpu { namespace imgproc
     }\r
 \r
     void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, \r
-                             int border_type)\r
+                             int border_type, cudaStream_t stream)\r
     {\r
         const int rows = Dx.rows;\r
         const int cols = Dx.cols;\r
@@ -492,7 +497,7 @@ namespace cv { namespace gpu { namespace imgproc
         switch (border_type) \r
         {\r
         case BORDER_REFLECT101_GPU:\r
-            cornerHarris_kernel<<<grid, threads>>>(\r
+            cornerHarris_kernel<<<grid, threads, 0, stream>>>(\r
                     cols, rows, block_size, k, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));\r
             break;\r
         case BORDER_REPLICATE_GPU:\r
@@ -500,16 +505,18 @@ namespace cv { namespace gpu { namespace imgproc
             harrisDxTex.addressMode[1] = cudaAddressModeClamp;\r
             harrisDyTex.addressMode[0] = cudaAddressModeClamp;\r
             harrisDyTex.addressMode[1] = cudaAddressModeClamp;\r
-            cornerHarris_kernel<<<grid, threads>>>(cols, rows, block_size, k, dst);\r
+\r
+            cornerHarris_kernel<<<grid, threads, 0, stream>>>(cols, rows, block_size, k, dst);\r
             break;\r
         }\r
 \r
         cudaSafeCall( cudaGetLastError() );\r
 \r
-        cudaSafeCall( cudaDeviceSynchronize() );\r
+        if (stream == 0)\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
 \r
-        cudaSafeCall(cudaUnbindTexture(harrisDxTex));\r
-        cudaSafeCall(cudaUnbindTexture(harrisDyTex));\r
+        //cudaSafeCall(cudaUnbindTexture(harrisDxTex));\r
+        //cudaSafeCall(cudaUnbindTexture(harrisDyTex));\r
     }\r
 \r
 /////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////\r
@@ -592,7 +599,7 @@ namespace cv { namespace gpu { namespace imgproc
     }\r
 \r
     void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst,\r
-                                  int border_type)\r
+                                  int border_type, cudaStream_t stream)\r
     {\r
         const int rows = Dx.rows;\r
         const int cols = Dx.cols;\r
@@ -609,7 +616,7 @@ namespace cv { namespace gpu { namespace imgproc
         switch (border_type)\r
         {\r
         case BORDER_REFLECT101_GPU:\r
-            cornerMinEigenVal_kernel<<<grid, threads>>>(\r
+            cornerMinEigenVal_kernel<<<grid, threads, 0, stream>>>(\r
                     cols, rows, block_size, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));\r
             break;\r
         case BORDER_REPLICATE_GPU:\r
@@ -617,16 +624,18 @@ namespace cv { namespace gpu { namespace imgproc
             minEigenValDxTex.addressMode[1] = cudaAddressModeClamp;\r
             minEigenValDyTex.addressMode[0] = cudaAddressModeClamp;\r
             minEigenValDyTex.addressMode[1] = cudaAddressModeClamp;\r
-            cornerMinEigenVal_kernel<<<grid, threads>>>(cols, rows, block_size, dst);\r
+\r
+            cornerMinEigenVal_kernel<<<grid, threads, 0, stream>>>(cols, rows, block_size, dst);\r
             break;\r
         }\r
 \r
         cudaSafeCall( cudaGetLastError() );\r
 \r
-        cudaSafeCall(cudaDeviceSynchronize());\r
+        if (stream == 0)\r
+            cudaSafeCall(cudaDeviceSynchronize());\r
 \r
-        cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));\r
-        cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));\r
+        //cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));\r
+        //cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));\r
     }\r
 \r
 ////////////////////////////// Column Sum //////////////////////////////////////\r
@@ -667,8 +676,7 @@ namespace cv { namespace gpu { namespace imgproc
     //////////////////////////////////////////////////////////////////////////\r
     // mulSpectrums\r
 \r
-    __global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, \r
-                                       DevMem2D_<cufftComplex> c)\r
+    __global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)\r
     {\r
         const int x = blockIdx.x * blockDim.x + threadIdx.x;    \r
         const int y = blockIdx.y * blockDim.y + threadIdx.y;    \r
@@ -680,25 +688,23 @@ namespace cv { namespace gpu { namespace imgproc
     }\r
 \r
 \r
-    void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, \r
-                      DevMem2D_<cufftComplex> c)\r
+    void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
     {\r
         dim3 threads(256);\r
         dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
 \r
-        mulSpectrumsKernel<<<grid, threads>>>(a, b, c);\r
+        mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);\r
         cudaSafeCall( cudaGetLastError() );\r
 \r
-        cudaSafeCall( cudaDeviceSynchronize() );\r
+        if (stream == 0)\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
     }\r
 \r
 \r
     //////////////////////////////////////////////////////////////////////////\r
     // mulSpectrums_CONJ\r
 \r
-    __global__ void mulSpectrumsKernel_CONJ(\r
-            const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,\r
-            DevMem2D_<cufftComplex> c)\r
+    __global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)\r
     {\r
         const int x = blockIdx.x * blockDim.x + threadIdx.x;    \r
         const int y = blockIdx.y * blockDim.y + threadIdx.y;    \r
@@ -710,25 +716,23 @@ namespace cv { namespace gpu { namespace imgproc
     }\r
 \r
 \r
-    void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, \r
-                           DevMem2D_<cufftComplex> c)\r
+    void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
     {\r
         dim3 threads(256);\r
         dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
 \r
-        mulSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, c);\r
+        mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);\r
         cudaSafeCall( cudaGetLastError() );\r
 \r
-        cudaSafeCall( cudaDeviceSynchronize() );\r
+        if (stream == 0)\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
     }\r
 \r
 \r
     //////////////////////////////////////////////////////////////////////////\r
     // mulAndScaleSpectrums\r
 \r
-    __global__ void mulAndScaleSpectrumsKernel(\r
-            const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, \r
-            float scale, DevMem2D_<cufftComplex> c)\r
+    __global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)\r
     {\r
         const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
         const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
@@ -741,25 +745,23 @@ namespace cv { namespace gpu { namespace imgproc
     }\r
 \r
 \r
-    void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,\r
-                              float scale, DevMem2D_<cufftComplex> c)\r
+    void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
     {\r
         dim3 threads(256);\r
         dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
 \r
-        mulAndScaleSpectrumsKernel<<<grid, threads>>>(a, b, scale, c);\r
+        mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);\r
         cudaSafeCall( cudaGetLastError() );\r
 \r
-        cudaSafeCall( cudaDeviceSynchronize() );\r
+        if (stream == 0)\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
     }\r
 \r
 \r
     //////////////////////////////////////////////////////////////////////////\r
     // mulAndScaleSpectrums_CONJ\r
 \r
-    __global__ void mulAndScaleSpectrumsKernel_CONJ(\r
-            const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,\r
-            float scale, DevMem2D_<cufftComplex> c)\r
+    __global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)\r
     {\r
         const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
         const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
@@ -772,16 +774,16 @@ namespace cv { namespace gpu { namespace imgproc
     }\r
 \r
 \r
-    void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,\r
-                                  float scale, DevMem2D_<cufftComplex> c)\r
+    void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
     {\r
         dim3 threads(256);\r
         dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
 \r
-        mulAndScaleSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, scale, c);\r
+        mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);\r
         cudaSafeCall( cudaGetLastError() );\r
 \r
-        cudaSafeCall( cudaDeviceSynchronize() );\r
+        if (stream == 0)\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
     }    \r
 \r
     //////////////////////////////////////////////////////////////////////////\r
@@ -1015,17 +1017,18 @@ namespace cv { namespace gpu { namespace imgproc
         }\r
     }\r
 \r
-    void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel)\r
+    void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream)\r
     {\r
         cudaSafeCall(cudaMemcpyToSymbol(c_convolveKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );\r
 \r
         const dim3 block(16, 16);\r
         const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));\r
 \r
-        convolve<<<grid, block>>>(src, dst, kWidth, kHeight);\r
+        convolve<<<grid, block, 0, stream>>>(src, dst, kWidth, kHeight);\r
         cudaSafeCall(cudaGetLastError());\r
 \r
-        cudaSafeCall(cudaDeviceSynchronize());\r
+        if (stream == 0)\r
+            cudaSafeCall(cudaDeviceSynchronize());\r
     }\r
 \r
 \r
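On the host side, the wrappers in modules/gpu/src/imgproc.cpp (changed in this commit but not shown here) presumably fetch the raw cudaStream_t from the cv::gpu::Stream and pass it down to these callers. A hedged sketch of that pattern (argument checking and output allocation elided):

    #include "opencv2/gpu/stream_accessor.hpp"   // StreamAccessor::getStream

    void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
                                     TermCriteria criteria, Stream& stream)
    {
        float eps = (criteria.type & TermCriteria::EPS) ? (float)criteria.epsilon : 1.f;
        imgproc::meanShiftFiltering_gpu(src, dst, sp, sr, criteria.maxCount, eps,
                                        StreamAccessor::getStream(stream));
    }
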
diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu
index 242cf3c..e954a26 100644
@@ -78,11 +78,11 @@ __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a
 __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }\r
 __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }\r
 \r
+//////////////////////////////////////////////////////////////////////\r
+// Naive_CCORR\r
 \r
-template <typename T, int cn>\r
-__global__ void matchTemplateNaiveKernel_CCORR(\r
-        int w, int h, const PtrStepb image, const PtrStepb templ, \r
-        DevMem2Df result)\r
+template <typename T, int cn> \r
+__global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result)\r
 {\r
     typedef typename TypeVec<T, cn>::vec_type Type;\r
     typedef typename TypeVec<float, cn>::vec_type Typef;\r
@@ -106,73 +106,49 @@ __global__ void matchTemplateNaiveKernel_CCORR(
     }\r
 }\r
 \r
+template <typename T, int cn>\r
+void matchTemplateNaive_CCORR(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream)\r
+{\r
+    const dim3 threads(32, 8);\r
+    const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
 \r
-void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ,\r
-                                  DevMem2Df result, int cn)\r
+    matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
+\r
+void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)\r
 {\r
-    dim3 threads(32, 8);\r
-    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+    typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);\r
 \r
-    switch (cn)\r
+    static const caller_t callers[] = \r
     {\r
-    case 1:\r
-        matchTemplateNaiveKernel_CCORR<float, 1><<<grid, threads>>>(\r
-                templ.cols, templ.rows, image, templ, result);\r
-        break;\r
-    case 2:\r
-        matchTemplateNaiveKernel_CCORR<float, 2><<<grid, threads>>>(\r
-                templ.cols, templ.rows, image, templ, result);\r
-        break;\r
-    case 3:\r
-        matchTemplateNaiveKernel_CCORR<float, 3><<<grid, threads>>>(\r
-                templ.cols, templ.rows, image, templ, result);\r
-        break;\r
-    case 4:\r
-        matchTemplateNaiveKernel_CCORR<float, 4><<<grid, threads>>>(\r
-                templ.cols, templ.rows, image, templ, result);\r
-        break;\r
-    }\r
-    cudaSafeCall( cudaGetLastError() );\r
+        0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>\r
+    };\r
 \r
-    cudaSafeCall( cudaDeviceSynchronize() );\r
+    callers[cn](image, templ, result, stream);\r
 }\r
 \r
 \r
-void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ,\r
-                                 DevMem2Df result, int cn)\r
+void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)\r
 {\r
-    dim3 threads(32, 8);\r
-    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+    typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);\r
 \r
-    switch (cn)\r
+    static const caller_t callers[] = \r
     {\r
-    case 1:\r
-        matchTemplateNaiveKernel_CCORR<uchar, 1><<<grid, threads>>>(\r
-                templ.cols, templ.rows, image, templ, result);\r
-        break;\r
-    case 2:\r
-        matchTemplateNaiveKernel_CCORR<uchar, 2><<<grid, threads>>>(\r
-                templ.cols, templ.rows, image, templ, result);\r
-        break;\r
-    case 3:\r
-        matchTemplateNaiveKernel_CCORR<uchar, 3><<<grid, threads>>>(\r
-                templ.cols, templ.rows, image, templ, result);\r
-        break;\r
-    case 4:\r
-        matchTemplateNaiveKernel_CCORR<uchar, 4><<<grid, threads>>>(\r
-                templ.cols, templ.rows, image, templ, result);\r
-        break;\r
-    }\r
-    cudaSafeCall( cudaGetLastError() );\r
+        0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>\r
+    };\r
 \r
-    cudaSafeCall( cudaDeviceSynchronize() );\r
+    callers[cn](image, templ, result, stream);\r
 }\r
 \r
+//////////////////////////////////////////////////////////////////////\r
+// Naive_SQDIFF\r
 \r
 template <typename T, int cn>\r
-__global__ void matchTemplateNaiveKernel_SQDIFF(\r
-        int w, int h, const PtrStepb image, const PtrStepb templ, \r
-        DevMem2Df result)\r
+__global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result)\r
 {\r
     typedef typename TypeVec<T, cn>::vec_type Type;\r
     typedef typename TypeVec<float, cn>::vec_type Typef;\r
@@ -200,73 +176,48 @@ __global__ void matchTemplateNaiveKernel_SQDIFF(
     }\r
 }\r
 \r
-\r
-void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ,\r
-                                   DevMem2Df result, int cn)\r
+template <typename T, int cn>\r
+void matchTemplateNaive_SQDIFF(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream)\r
 {\r
-    dim3 threads(32, 8);\r
-    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+    const dim3 threads(32, 8);\r
+    const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
 \r
-    switch (cn)\r
-    {\r
-    case 1:\r
-        matchTemplateNaiveKernel_SQDIFF<float, 1><<<grid, threads>>>(\r
-                templ.cols, templ.rows, image, templ, result);\r
-        break;\r
-    case 2:\r
-        matchTemplateNaiveKernel_SQDIFF<float, 2><<<grid, threads>>>(\r
-                templ.cols, templ.rows, image, templ, result);\r
-        break;\r
-    case 3:\r
-        matchTemplateNaiveKernel_SQDIFF<float, 3><<<grid, threads>>>(\r
-                templ.cols, templ.rows, image, templ, result);\r
-        break;\r
-    case 4:\r
-        matchTemplateNaiveKernel_SQDIFF<float, 4><<<grid, threads>>>(\r
-                templ.cols, templ.rows, image, templ, result);\r
-        break;\r
-    }\r
+    matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);\r
     cudaSafeCall( cudaGetLastError() );\r
 \r
-    cudaSafeCall( cudaDeviceSynchronize() );\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
 }\r
 \r
+void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)\r
+{\r
+    typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);\r
+\r
+    static const caller_t callers[] = \r
+    {\r
+        0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>\r
+    };\r
+\r
+    callers[cn](image, templ, result, stream);\r
+}\r
 \r
-void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ,\r
-                                  DevMem2Df result, int cn)\r
+void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)\r
 {\r
-    dim3 threads(32, 8);\r
-    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+    typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);\r
 \r
-    switch (cn)\r
+    static const caller_t callers[] = \r
     {\r
-    case 1:\r
-        matchTemplateNaiveKernel_SQDIFF<uchar, 1><<<grid, threads>>>(\r
-                templ.cols, templ.rows, image, templ, result);\r
-        break;\r
-    case 2:\r
-        matchTemplateNaiveKernel_SQDIFF<uchar, 2><<<grid, threads>>>(\r
-                templ.cols, templ.rows, image, templ, result);\r
-        break;\r
-    case 3:\r
-        matchTemplateNaiveKernel_SQDIFF<uchar, 3><<<grid, threads>>>(\r
-                templ.cols, templ.rows, image, templ, result);\r
-        break;\r
-    case 4:\r
-        matchTemplateNaiveKernel_SQDIFF<uchar, 4><<<grid, threads>>>(\r
-                templ.cols, templ.rows, image, templ, result);\r
-        break;\r
-    }\r
-    cudaSafeCall( cudaGetLastError() );\r
+        0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>\r
+    };\r
 \r
-    cudaSafeCall( cudaDeviceSynchronize() );\r
+    callers[cn](image, templ, result, stream);\r
 }\r
 \r
+//////////////////////////////////////////////////////////////////////\r
+// Prepared_SQDIFF\r
 \r
 template <int cn>\r
-__global__ void matchTemplatePreparedKernel_SQDIFF_8U(\r
-        int w, int h, const PtrStep<unsigned long long> image_sqsum, \r
-        unsigned int templ_sqsum, DevMem2Df result)\r
+__global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result)\r
 {\r
     const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
     const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
@@ -281,37 +232,34 @@ __global__ void matchTemplatePreparedKernel_SQDIFF_8U(
     }\r
 }\r
 \r
+template <int cn>\r
+void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream)\r
+{\r
+    const dim3 threads(32, 8);\r
+    const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+\r
+    matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
+    cudaSafeCall( cudaGetLastError() );\r
 \r
-void matchTemplatePrepared_SQDIFF_8U(\r
-        int w, int h, const DevMem2D_<unsigned long long> image_sqsum, \r
-        unsigned int templ_sqsum, DevMem2Df result, int cn)\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
+\r
+void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, int cn, \r
+                                     cudaStream_t stream)\r
 {\r
-    dim3 threads(32, 8);\r
-    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
-    switch (cn)\r
+    typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream);\r
+\r
+    static const caller_t callers[] = \r
     {\r
-    case 1:\r
-        matchTemplatePreparedKernel_SQDIFF_8U<1><<<grid, threads>>>(\r
-                w, h, image_sqsum, templ_sqsum, result);\r
-        break;\r
-    case 2:\r
-        matchTemplatePreparedKernel_SQDIFF_8U<2><<<grid, threads>>>(\r
-                w, h, image_sqsum, templ_sqsum, result);\r
-        break;\r
-    case 3:\r
-        matchTemplatePreparedKernel_SQDIFF_8U<3><<<grid, threads>>>(\r
-                w, h, image_sqsum, templ_sqsum, result);\r
-        break;\r
-    case 4:\r
-        matchTemplatePreparedKernel_SQDIFF_8U<4><<<grid, threads>>>(\r
-                w, h, image_sqsum, templ_sqsum, result);\r
-        break;\r
-    }\r
-    cudaSafeCall( cudaGetLastError() );\r
+        0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>\r
+    };\r
 \r
-    cudaSafeCall( cudaDeviceSynchronize() );\r
+    callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);\r
 }\r
 \r
+//////////////////////////////////////////////////////////////////////\r
+// Prepared_SQDIFF_NORMED\r
 \r
 // normAcc* are accurate normalization routines which make GPU matchTemplate\r
 // consistent with CPU one\r
@@ -337,9 +285,7 @@ __device__ float normAcc_SQDIFF(float num, float denum)
 \r
 \r
 template <int cn>\r
-__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(\r
-        int w, int h, const PtrStep<unsigned long long> image_sqsum, \r
-        unsigned int templ_sqsum, DevMem2Df result)\r
+__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result)\r
 {\r
     const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
     const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
@@ -355,41 +301,37 @@ __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
     }\r
 }\r
 \r
+template <int cn>\r
+void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, \r
+                                            DevMem2Df result, cudaStream_t stream)\r
+{\r
+    const dim3 threads(32, 8);\r
+    const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+\r
+    matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
+\r
 \r
-void matchTemplatePrepared_SQDIFF_NORMED_8U(\r
-        int w, int h, const DevMem2D_<unsigned long long> image_sqsum, \r
-        unsigned int templ_sqsum, DevMem2Df result, int cn)\r
+void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, \r
+                                            DevMem2Df result, int cn, cudaStream_t stream)\r
 {\r
-    dim3 threads(32, 8);\r
-    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
-    switch (cn)\r
+    typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream);\r
+    static const caller_t callers[] = \r
     {\r
-    case 1:\r
-        matchTemplatePreparedKernel_SQDIFF_NORMED_8U<1><<<grid, threads>>>(\r
-                w, h, image_sqsum, templ_sqsum, result);\r
-        break;\r
-    case 2:\r
-        matchTemplatePreparedKernel_SQDIFF_NORMED_8U<2><<<grid, threads>>>(\r
-                w, h, image_sqsum, templ_sqsum, result);\r
-        break;\r
-    case 3:\r
-        matchTemplatePreparedKernel_SQDIFF_NORMED_8U<3><<<grid, threads>>>(\r
-                w, h, image_sqsum, templ_sqsum, result);\r
-        break;\r
-    case 4:\r
-        matchTemplatePreparedKernel_SQDIFF_NORMED_8U<4><<<grid, threads>>>(\r
-                w, h, image_sqsum, templ_sqsum, result);\r
-        break;\r
-    }\r
-    cudaSafeCall( cudaGetLastError() );\r
+        0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>\r
+    };\r
 \r
-    cudaSafeCall( cudaDeviceSynchronize() );\r
+    callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);\r
 }\r
 \r
+//////////////////////////////////////////////////////////////////////\r
+// Prepared_CCOFF\r
 \r
-__global__ void matchTemplatePreparedKernel_CCOFF_8U(\r
-        int w, int h, float templ_sum_scale, \r
-        const PtrStep<unsigned int> image_sum, DevMem2Df result)\r
+__global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, DevMem2Df result)\r
 {\r
     const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
     const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
@@ -404,21 +346,20 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8U(
     }\r
 }\r
 \r
-\r
-void matchTemplatePrepared_CCOFF_8U(\r
-        int w, int h, const DevMem2D_<unsigned int> image_sum,\r
-        unsigned int templ_sum, DevMem2Df result)\r
+void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream)\r
 {\r
     dim3 threads(32, 8);\r
     dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
-    matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads>>>(\r
-            w, h, (float)templ_sum / (w * h), image_sum, result);\r
+\r
+    matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);\r
     cudaSafeCall( cudaGetLastError() );\r
 \r
-    cudaSafeCall( cudaDeviceSynchronize() );\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
 }\r
 \r
 \r
+\r
 __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(\r
         int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,\r
         const PtrStep<unsigned int> image_sum_r,\r
@@ -442,25 +383,27 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
     }\r
 }\r
 \r
-\r
 void matchTemplatePrepared_CCOFF_8UC2(\r
         int w, int h, \r
         const DevMem2D_<unsigned int> image_sum_r, \r
         const DevMem2D_<unsigned int> image_sum_g,\r
         unsigned int templ_sum_r, unsigned int templ_sum_g, \r
-        DevMem2Df result)\r
+        DevMem2Df result, cudaStream_t stream)\r
 {\r
     dim3 threads(32, 8);\r
     dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
-    matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads>>>(\r
+\r
+    matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(\r
             w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),\r
             image_sum_r, image_sum_g, result);\r
     cudaSafeCall( cudaGetLastError() );\r
 \r
-    cudaSafeCall( cudaDeviceSynchronize() );\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
 }\r
 \r
 \r
+\r
 __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(\r
         int w, int h, \r
         float templ_sum_scale_r,\r
@@ -492,7 +435,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
     }\r
 }\r
 \r
-\r
 void matchTemplatePrepared_CCOFF_8UC3(\r
         int w, int h, \r
         const DevMem2D_<unsigned int> image_sum_r, \r
@@ -501,11 +443,12 @@ void matchTemplatePrepared_CCOFF_8UC3(
         unsigned int templ_sum_r, \r
         unsigned int templ_sum_g, \r
         unsigned int templ_sum_b, \r
-        DevMem2Df result)\r
+        DevMem2Df result, cudaStream_t stream)\r
 {\r
     dim3 threads(32, 8);\r
     dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
-    matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads>>>(\r
+\r
+    matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(\r
             w, h, \r
             (float)templ_sum_r / (w * h),\r
             (float)templ_sum_g / (w * h),\r
@@ -513,10 +456,12 @@ void matchTemplatePrepared_CCOFF_8UC3(
             image_sum_r, image_sum_g, image_sum_b, result);\r
     cudaSafeCall( cudaGetLastError() );\r
 \r
-    cudaSafeCall( cudaDeviceSynchronize() );\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
 }\r
 \r
 \r
+\r
 __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(\r
         int w, int h, \r
         float templ_sum_scale_r, \r
@@ -554,7 +499,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
     }\r
 }\r
 \r
-\r
 void matchTemplatePrepared_CCOFF_8UC4(\r
         int w, int h, \r
         const DevMem2D_<unsigned int> image_sum_r, \r
@@ -565,11 +509,12 @@ void matchTemplatePrepared_CCOFF_8UC4(
         unsigned int templ_sum_g, \r
         unsigned int templ_sum_b, \r
         unsigned int templ_sum_a, \r
-        DevMem2Df result)\r
+        DevMem2Df result, cudaStream_t stream)\r
 {\r
     dim3 threads(32, 8);\r
     dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
-    matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads>>>(\r
+\r
+    matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(\r
             w, h, \r
             (float)templ_sum_r / (w * h), \r
             (float)templ_sum_g / (w * h), \r
@@ -579,9 +524,12 @@ void matchTemplatePrepared_CCOFF_8UC4(
             result);\r
     cudaSafeCall( cudaGetLastError() );\r
 \r
-    cudaSafeCall( cudaDeviceSynchronize() );\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
 }\r
 \r
+//////////////////////////////////////////////////////////////////////\r
+// Prepared_CCOFF_NORMED\r
 \r
 __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(\r
         int w, int h, float weight, \r
@@ -607,12 +555,11 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
     }\r
 }\r
 \r
-\r
 void matchTemplatePrepared_CCOFF_NORMED_8U(\r
             int w, int h, const DevMem2D_<unsigned int> image_sum, \r
             const DevMem2D_<unsigned long long> image_sqsum,\r
             unsigned int templ_sum, unsigned int templ_sqsum,\r
-            DevMem2Df result)\r
+            DevMem2Df result, cudaStream_t stream)\r
 {\r
     dim3 threads(32, 8);\r
     dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
@@ -620,15 +567,18 @@ void matchTemplatePrepared_CCOFF_NORMED_8U(
     float weight = 1.f / (w * h);\r
     float templ_sum_scale = templ_sum * weight;\r
     float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;\r
-    matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads>>>(\r
+\r
+    matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(\r
             w, h, weight, templ_sum_scale, templ_sqsum_scale, \r
             image_sum, image_sqsum, result);\r
     cudaSafeCall( cudaGetLastError() );\r
 \r
-    cudaSafeCall( cudaDeviceSynchronize() );\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
 }\r
 \r
 \r
+\r
 __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(\r
         int w, int h, float weight, \r
         float templ_sum_scale_r, float templ_sum_scale_g, \r
@@ -663,14 +613,13 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
     }\r
 }\r
 \r
-\r
 void matchTemplatePrepared_CCOFF_NORMED_8UC2(\r
             int w, int h, \r
             const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
             const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
             unsigned int templ_sum_r, unsigned int templ_sqsum_r,\r
             unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
-            DevMem2Df result)\r
+            DevMem2Df result, cudaStream_t stream)\r
 {\r
     dim3 threads(32, 8);\r
     dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
@@ -680,7 +629,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
     float templ_sum_scale_g = templ_sum_g * weight;\r
     float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r \r
                                + templ_sqsum_g - weight * templ_sum_g * templ_sum_g;\r
-    matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads>>>(\r
+\r
+    matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(\r
             w, h, weight, \r
             templ_sum_scale_r, templ_sum_scale_g,\r
             templ_sqsum_scale,\r
@@ -689,10 +639,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
             result);\r
     cudaSafeCall( cudaGetLastError() );\r
 \r
-    cudaSafeCall( cudaDeviceSynchronize() );\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
 }\r
 \r
 \r
+\r
 __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(\r
         int w, int h, float weight, \r
         float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, \r
@@ -736,7 +688,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
     }\r
 }\r
 \r
-\r
 void matchTemplatePrepared_CCOFF_NORMED_8UC3(\r
             int w, int h, \r
             const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
@@ -745,7 +696,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
             unsigned int templ_sum_r, unsigned int templ_sqsum_r,\r
             unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
             unsigned int templ_sum_b, unsigned int templ_sqsum_b,\r
-            DevMem2Df result)\r
+            DevMem2Df result, cudaStream_t stream)\r
 {\r
     dim3 threads(32, 8);\r
     dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
@@ -757,7 +708,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
     float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r \r
                               + templ_sqsum_g - weight * templ_sum_g * templ_sum_g\r
                               + templ_sqsum_b - weight * templ_sum_b * templ_sum_b;\r
-    matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads>>>(\r
+\r
+    matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(\r
             w, h, weight, \r
             templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, \r
             templ_sqsum_scale, \r
@@ -767,10 +719,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
             result);\r
     cudaSafeCall( cudaGetLastError() );\r
 \r
-    cudaSafeCall( cudaDeviceSynchronize() );\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
 }\r
 \r
 \r
+\r
 __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(\r
         int w, int h, float weight, \r
         float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, \r
@@ -821,7 +775,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
     }\r
 }\r
 \r
-\r
 void matchTemplatePrepared_CCOFF_NORMED_8UC4(\r
             int w, int h, \r
             const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
@@ -832,7 +785,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
             unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
             unsigned int templ_sum_b, unsigned int templ_sqsum_b,\r
             unsigned int templ_sum_a, unsigned int templ_sqsum_a,\r
-            DevMem2Df result)\r
+            DevMem2Df result, cudaStream_t stream)\r
 {\r
     dim3 threads(32, 8);\r
     dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
@@ -846,7 +799,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
                               + templ_sqsum_g - weight * templ_sum_g * templ_sum_g\r
                               + templ_sqsum_b - weight * templ_sum_b * templ_sum_b\r
                               + templ_sqsum_a - weight * templ_sum_a * templ_sum_a;\r
-    matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads>>>(\r
+\r
+    matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(\r
             w, h, weight, \r
             templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a, \r
             templ_sqsum_scale, \r
@@ -857,9 +811,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
             result);\r
     cudaSafeCall( cudaGetLastError() );\r
 \r
-    cudaSafeCall( cudaDeviceSynchronize() );\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
 }\r
 \r
+//////////////////////////////////////////////////////////////////////\r
+// normalize\r
 \r
 template <int cn>\r
 __global__ void normalizeKernel_8U(\r
@@ -878,32 +835,36 @@ __global__ void normalizeKernel_8U(
     }\r
 }\r
 \r
-\r
 void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, \r
-                  unsigned int templ_sqsum, DevMem2Df result, int cn)\r
+                  unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream)\r
 {\r
     dim3 threads(32, 8);\r
     dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+\r
     switch (cn)\r
     {\r
     case 1:\r
-        normalizeKernel_8U<1><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);\r
+        normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
         break;\r
     case 2:\r
-        normalizeKernel_8U<2><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);\r
+        normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
         break;\r
     case 3:\r
-        normalizeKernel_8U<3><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);\r
+        normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
         break;\r
     case 4:\r
-        normalizeKernel_8U<4><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);\r
+        normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);\r
         break;\r
     }\r
+\r
     cudaSafeCall( cudaGetLastError() );\r
 \r
-    cudaSafeCall( cudaDeviceSynchronize() );\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
 }\r
 \r
+//////////////////////////////////////////////////////////////////////\r
+// extractFirstChannel\r
 \r
 template <int cn>\r
 __global__ void extractFirstChannel_32F(const PtrStepb image, DevMem2Df result)\r
@@ -920,8 +881,7 @@ __global__ void extractFirstChannel_32F(const PtrStepb image, DevMem2Df result)
     }\r
 }\r
 \r
-\r
-void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn)\r
+void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream)\r
 {\r
     dim3 threads(32, 8);\r
     dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
@@ -929,23 +889,21 @@ void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn)
     switch (cn)\r
     {\r
     case 1:\r
-        extractFirstChannel_32F<1><<<grid, threads>>>(image, result);\r
+        extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);\r
         break;\r
     case 2:\r
-        extractFirstChannel_32F<2><<<grid, threads>>>(image, result);\r
+        extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);\r
         break;\r
     case 3:\r
-        extractFirstChannel_32F<3><<<grid, threads>>>(image, result);\r
+        extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);\r
         break;\r
     case 4:\r
-        extractFirstChannel_32F<4><<<grid, threads>>>(image, result);\r
+        extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);\r
         break;\r
     }\r
     cudaSafeCall( cudaGetLastError() );\r
 \r
-    cudaSafeCall( cudaDeviceSynchronize() );\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
 }\r
-\r
-\r
 }}}\r
-\r
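
All launchers in this file follow the same stream-aware pattern: the caller's cudaStream_t becomes the fourth launch parameter, and the host blocks only when the default stream (0) is used. A minimal sketch of that pattern, reusing the DevMem2Df, divUp and cudaSafeCall helpers already present in this file; the kernel and caller names are hypothetical:

    __global__ void exampleKernel(DevMem2Df result)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < result.cols && y < result.rows)
            result.ptr(y)[x] = 0.f; // placeholder computation
    }

    void exampleCaller(DevMem2Df result, cudaStream_t stream)
    {
        dim3 threads(32, 8);
        dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

        // Launch on the caller's stream (0 means the default stream)
        exampleKernel<<<grid, threads, 0, stream>>>(result);
        cudaSafeCall( cudaGetLastError() );

        // Block only for the default stream; callers that pass a real stream
        // synchronize later through cv::gpu::Stream
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
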
index e3e00be..6e0c219 100644 (file)
 \r
 #include "cuda_runtime_api.h"\r
 #include "cufft.h"\r
-//#include <nppdefs.h>\r
+#include "NCV.hpp"\r
 \r
 #if defined(__GNUC__)\r
     #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)\r
-    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)\r
     #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__, __func__)\r
+    #define ncvSafeCall(expr)  ___ncvSafeCall(expr, __FILE__, __LINE__, __func__)\r
+    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)\r
 #else /* defined(__CUDACC__) || defined(__MSVC__) */\r
     #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)\r
-    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__)\r
     #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__)\r
+    #define ncvSafeCall(expr)  ___ncvSafeCall(expr, __FILE__, __LINE__)\r
+    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__)\r
 #endif\r
 \r
 namespace cv\r
@@ -62,8 +64,9 @@ namespace cv
     namespace gpu\r
     {\r
         void error(const char *error_string, const char *file, const int line, const char *func = "");\r
-        void nppError(int err, const char *file, const int line, const char *func = "");   \r
-        void cufftError(int err, const char *file, const int line, const char *func = "");   \r
+        void nppError(int err, const char *file, const int line, const char *func = "");\r
+        void ncvError(int err, const char *file, const int line, const char *func = "");\r
+        void cufftError(int err, const char *file, const int line, const char *func = "");\r
 \r
         static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")\r
         {\r
@@ -71,17 +74,23 @@ namespace cv
                 cv::gpu::error(cudaGetErrorString(err), file, line, func);\r
         }\r
 \r
-        static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")\r
-        {\r
-            if (CUFFT_SUCCESS != err)\r
-                cv::gpu::cufftError(err, file, line, func);\r
-        }\r
-\r
         static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")\r
         {\r
             if (err < 0)\r
                 cv::gpu::nppError(err, file, line, func);\r
         }\r
+\r
+        static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")\r
+        {\r
+            if (NCV_SUCCESS != err)\r
+                cv::gpu::ncvError(err, file, line, func);\r
+        }\r
+\r
+        static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")\r
+        {\r
+            if (CUFFT_SUCCESS != err)\r
+                cv::gpu::cufftError(err, file, line, func);\r
+        }\r
     }\r
 }\r
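
All four wrappers are used the same way at call sites: the macro captures file, line and (under GCC) function name, and a non-success status is routed through cv::gpu::error(). A small illustrative sketch using only standard CUDA runtime and cuFFT calls; the surrounding function is hypothetical:

    void exampleSafeCalls(float* d_data, size_t bytes, cudaStream_t stream)
    {
        // Raises cv::Exception through cv::gpu::error() if the runtime call fails
        cudaSafeCall( cudaMemsetAsync(d_data, 0, bytes, stream) );

        cufftHandle plan;
        cufftSafeCall( cufftPlan1d(&plan, 1024, CUFFT_R2C, 1) );
        cufftSafeCall( cufftSetStream(plan, stream) );
        cufftSafeCall( cufftDestroy(plan) );
    }
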
 \r
index 1f38f61..e5090db 100644 (file)
 \r
 #include "precomp.hpp"\r
 \r
-\r
 using namespace cv;\r
 using namespace cv::gpu;\r
+using namespace std;\r
 \r
-\r
-#if !defined (HAVE_CUDA)\r
-\r
-#else /* !defined (HAVE_CUDA) */\r
-\r
+#ifdef HAVE_CUDA\r
 \r
 namespace \r
 {\r
     #define error_entry(entry)  { entry, #entry }\r
 \r
-    //////////////////////////////////////////////////////////////////////////\r
-    // NPP errors\r
-\r
-    struct NppError\r
+    struct ErrorEntry\r
     {\r
-        int error;\r
+        int code;\r
         string str;\r
-    } \r
+    }; \r
+\r
+    struct ErrorEntryComparer\r
+    {\r
+        int code;\r
+        ErrorEntryComparer(int code_) : code(code_) {}\r
+        bool operator()(const ErrorEntry& e) const { return e.code == code; }\r
+    };\r
+\r
+    string getErrorString(int code, const ErrorEntry* errors, size_t n)\r
+    {\r
+        size_t idx = find_if(errors, errors + n, ErrorEntryComparer(code)) - errors;\r
+\r
+        const string& msg = (idx != n) ? errors[idx].str : string("Unknown error code");\r
+\r
+        ostringstream ostr;\r
+        ostr << msg << " [Code = " << code << "]";\r
+\r
+        return ostr.str();\r
+    }\r
+\r
+    //////////////////////////////////////////////////////////////////////////\r
+    // NPP errors\r
     \r
-    npp_errors [] = \r
+    const ErrorEntry npp_errors [] = \r
     {\r
         error_entry( NPP_NOT_SUPPORTED_MODE_ERROR ),\r
         error_entry( NPP_ROUND_MODE_NOT_SUPPORTED_ERROR ),\r
@@ -74,6 +89,7 @@ namespace
 #if defined (_MSC_VER)\r
         error_entry( NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY ),\r
 #endif\r
+\r
         error_entry( NPP_BAD_ARG_ERROR ),\r
         error_entry( NPP_LUT_NUMBER_OF_LEVELS_ERROR ),\r
         error_entry( NPP_TEXTURE_BIND_ERROR ),\r
@@ -110,106 +126,116 @@ namespace
         error_entry( NPP_ODD_ROI_WARNING )\r
     };\r
 \r
-    const size_t error_num = sizeof(npp_errors) / sizeof(npp_errors[0]);\r
+    const size_t npp_error_num = sizeof(npp_errors) / sizeof(npp_errors[0]);\r
 \r
-    struct Searcher\r
+    //////////////////////////////////////////////////////////////////////////\r
+    // NCV errors\r
+    \r
+    const ErrorEntry ncv_errors [] = \r
     {\r
-        int err;\r
-        Searcher(int err_) : err(err_) {};\r
-        bool operator()(const NppError& e) const { return e.error == err; }\r
+        error_entry( NCV_SUCCESS ),\r
+        error_entry( NCV_UNKNOWN_ERROR ),\r
+        error_entry( NCV_CUDA_ERROR ),\r
+        error_entry( NCV_NPP_ERROR ),\r
+        error_entry( NCV_FILE_ERROR ),\r
+        error_entry( NCV_NULL_PTR ),\r
+        error_entry( NCV_INCONSISTENT_INPUT ),\r
+        error_entry( NCV_TEXTURE_BIND_ERROR ),\r
+        error_entry( NCV_DIMENSIONS_INVALID ),\r
+        error_entry( NCV_INVALID_ROI ),\r
+        error_entry( NCV_INVALID_STEP ),\r
+        error_entry( NCV_INVALID_SCALE ),\r
+        error_entry( NCV_ALLOCATOR_NOT_INITIALIZED ),\r
+        error_entry( NCV_ALLOCATOR_BAD_ALLOC ),\r
+        error_entry( NCV_ALLOCATOR_BAD_DEALLOC ),\r
+        error_entry( NCV_ALLOCATOR_INSUFFICIENT_CAPACITY ),\r
+        error_entry( NCV_ALLOCATOR_DEALLOC_ORDER ),\r
+        error_entry( NCV_ALLOCATOR_BAD_REUSE ),\r
+        error_entry( NCV_MEM_COPY_ERROR ),\r
+        error_entry( NCV_MEM_RESIDENCE_ERROR ),\r
+        error_entry( NCV_MEM_INSUFFICIENT_CAPACITY ),\r
+        error_entry( NCV_HAAR_INVALID_PIXEL_STEP ),\r
+        error_entry( NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER ),\r
+        error_entry( NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE ),\r
+        error_entry( NCV_HAAR_TOO_LARGE_FEATURES ),\r
+        error_entry( NCV_HAAR_XML_LOADING_EXCEPTION ),\r
+        error_entry( NCV_NOIMPL_HAAR_TILTED_FEATURES ),\r
+        error_entry( NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW ),\r
+        error_entry( NPPST_SUCCESS ),\r
+        error_entry( NPPST_ERROR ),\r
+        error_entry( NPPST_CUDA_KERNEL_EXECUTION_ERROR ),\r
+        error_entry( NPPST_NULL_POINTER_ERROR ),\r
+        error_entry( NPPST_TEXTURE_BIND_ERROR ),\r
+        error_entry( NPPST_MEMCPY_ERROR ),\r
+        error_entry( NPPST_MEM_ALLOC_ERR ),\r
+        error_entry( NPPST_MEMFREE_ERR ),\r
+        error_entry( NPPST_INVALID_ROI ),\r
+        error_entry( NPPST_INVALID_STEP ),\r
+        error_entry( NPPST_INVALID_SCALE ),\r
+        error_entry( NPPST_MEM_INSUFFICIENT_BUFFER ),\r
+        error_entry( NPPST_MEM_RESIDENCE_ERROR ),\r
+        error_entry( NPPST_MEM_INTERNAL_ERROR )\r
     };\r
 \r
+    const size_t ncv_error_num = sizeof(ncv_errors) / sizeof(ncv_errors[0]);\r
+\r
     //////////////////////////////////////////////////////////////////////////\r
     // CUFFT errors\r
 \r
-    struct CufftError\r
-    {\r
-        int code;\r
-        string message;\r
-    };\r
-\r
-    const CufftError cufft_errors[] = \r
-    {\r
-        error_entry(CUFFT_INVALID_PLAN),\r
-        error_entry(CUFFT_ALLOC_FAILED),\r
-        error_entry(CUFFT_INVALID_TYPE),\r
-        error_entry(CUFFT_INVALID_VALUE),\r
-        error_entry(CUFFT_INTERNAL_ERROR),\r
-        error_entry(CUFFT_EXEC_FAILED),\r
-        error_entry(CUFFT_SETUP_FAILED),\r
-        error_entry(CUFFT_INVALID_SIZE),\r
-        error_entry(CUFFT_UNALIGNED_DATA)\r
-    };\r
-\r
-    struct CufftErrorComparer\r
+    const ErrorEntry cufft_errors[] = \r
     {\r
-        CufftErrorComparer(int code_): code(code_) {}\r
-        bool operator()(const CufftError& other) const \r
-        { \r
-            return other.code == code; \r
-        }\r
-        int code;\r
+        error_entry( CUFFT_INVALID_PLAN ),\r
+        error_entry( CUFFT_ALLOC_FAILED ),\r
+        error_entry( CUFFT_INVALID_TYPE ),\r
+        error_entry( CUFFT_INVALID_VALUE ),\r
+        error_entry( CUFFT_INTERNAL_ERROR ),\r
+        error_entry( CUFFT_EXEC_FAILED ),\r
+        error_entry( CUFFT_SETUP_FAILED ),\r
+        error_entry( CUFFT_INVALID_SIZE ),\r
+        error_entry( CUFFT_UNALIGNED_DATA )\r
     };\r
 \r
     const int cufft_error_num = sizeof(cufft_errors) / sizeof(cufft_errors[0]);\r
-\r
 }\r
 \r
 namespace cv\r
 {\r
     namespace gpu\r
     {\r
-        const string getNppErrorString( int err )\r
-        {\r
-            size_t idx = std::find_if(npp_errors, npp_errors + error_num, Searcher(err)) - npp_errors;\r
-            const string& msg = (idx != error_num) ? npp_errors[idx].str : string("Unknown error code");\r
-\r
-            std::stringstream interpreter;\r
-            interpreter << msg <<" [Code = " << err << "]";\r
-\r
-            return interpreter.str();\r
-        }\r
-\r
-        void nppError( int err, const char *file, const int line, const char *func)\r
-        {                    \r
-            cv::error( cv::Exception(CV_GpuNppCallError, getNppErrorString(err), func, file, line) );                \r
-        }\r
-\r
-        const string getCufftErrorString(int err_code)\r
-        {\r
-            const CufftError* cufft_error = std::find_if(\r
-                    cufft_errors, cufft_errors + cufft_error_num, \r
-                    CufftErrorComparer(err_code));\r
-\r
-            bool found = cufft_error != cufft_errors + cufft_error_num;\r
-\r
-            std::stringstream ss;\r
-            ss << (found ? cufft_error->message : "Unknown error code");\r
-            ss << " [Code = " << err_code << "]";\r
-\r
-            return ss.str();\r
-        }\r
-\r
-        void cufftError(int err, const char *file, const int line, const char *func)\r
-        {\r
-            cv::error(cv::Exception(CV_GpuCufftCallError, getCufftErrorString(err), func, file, line));\r
-        }\r
-\r
         void error(const char *error_string, const char *file, const int line, const char *func)\r
         {          \r
             int code = CV_GpuApiCallError;\r
 \r
-            if (std::uncaught_exception())\r
+            if (uncaught_exception())\r
             {\r
                 const char* errorStr = cvErrorStr(code);            \r
                 const char* function = func ? func : "unknown function";    \r
 \r
-                std::cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line;\r
-                std::cerr.flush();            \r
+                cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line;\r
+                cerr.flush();            \r
             }\r
             else    \r
                 cv::error( cv::Exception(code, error_string, func, file, line) );\r
         }\r
+\r
+        void nppError(int code, const char *file, const int line, const char *func)\r
+        {\r
+            string msg = getErrorString(code, npp_errors, npp_error_num);\r
+            cv::gpu::error(msg.c_str(), file, line, func);\r
+        }\r
+\r
+        void ncvError(int code, const char *file, const int line, const char *func)\r
+        {\r
+            string msg = getErrorString(code, ncv_errors, ncv_error_num);\r
+            cv::gpu::error(msg.c_str(), file, line, func);\r
+        }\r
+\r
+        void cufftError(int code, const char *file, const int line, const char *func)\r
+        {\r
+            string msg = getErrorString(code, cufft_errors, cufft_error_num);\r
+            cv::gpu::error(msg.c_str(), file, line, func);\r
+        }\r
     }\r
 }\r
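
The per-library error structs are folded into a single ErrorEntry shape, one table per library and one getErrorString() lookup. A standalone sketch of the same technique, compilable outside OpenCV; the DEMO_* codes are made-up stand-ins for the NPP/NCV/CUFFT status enums:

    #include <algorithm>
    #include <iostream>
    #include <sstream>
    #include <string>

    #define error_entry(entry)  { entry, #entry }

    struct ErrorEntry { int code; std::string str; };

    struct ErrorEntryComparer
    {
        int code;
        ErrorEntryComparer(int code_) : code(code_) {}
        bool operator()(const ErrorEntry& e) const { return e.code == code; }
    };

    // Hypothetical status codes standing in for a library's error enum
    enum { DEMO_BAD_ARG = -1, DEMO_OUT_OF_MEMORY = -2 };

    const ErrorEntry demo_errors[] =
    {
        error_entry( DEMO_BAD_ARG ),
        error_entry( DEMO_OUT_OF_MEMORY )
    };

    const size_t demo_error_num = sizeof(demo_errors) / sizeof(demo_errors[0]);

    std::string getErrorString(int code, const ErrorEntry* errors, size_t n)
    {
        // Linear scan over the table; idx == n means the code was not found
        size_t idx = std::find_if(errors, errors + n, ErrorEntryComparer(code)) - errors;

        std::ostringstream ostr;
        ostr << (idx != n ? errors[idx].str : std::string("Unknown error code"))
             << " [Code = " << code << "]";
        return ostr.str();
    }

    int main()
    {
        std::cout << getErrorString(DEMO_OUT_OF_MEMORY, demo_errors, demo_error_num) << std::endl; // DEMO_OUT_OF_MEMORY [Code = -2]
        std::cout << getErrorString(-42, demo_errors, demo_error_num) << std::endl;                // Unknown error code [Code = -42]
        return 0;
    }
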
 \r
index 0aa88e8..528ac65 100644 (file)
@@ -48,8 +48,8 @@ using namespace cv::gpu;
 #if !defined (HAVE_CUDA)\r
 \r
 void cv::gpu::remap(const GpuMat&, GpuMat&, const GpuMat&, const GpuMat&, int, int, const Scalar&, Stream&){ throw_nogpu(); }\r
-void cv::gpu::meanShiftFiltering(const GpuMat&, GpuMat&, int, int, TermCriteria) { throw_nogpu(); }\r
-void cv::gpu::meanShiftProc(const GpuMat&, GpuMat&, GpuMat&, int, int, TermCriteria) { throw_nogpu(); }\r
+void cv::gpu::meanShiftFiltering(const GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_nogpu(); }\r
+void cv::gpu::meanShiftProc(const GpuMat&, GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_nogpu(); }\r
 void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }\r
 void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, Stream&) { throw_nogpu(); }\r
 void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int, Stream&) { throw_nogpu(); }\r
@@ -82,14 +82,16 @@ void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nog
 void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }\r
 void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); }\r
 void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); }\r
+void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int, Stream&) { throw_nogpu(); }\r
 void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); }\r
 void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); }\r
-void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool) { throw_nogpu(); }\r
-void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, float, bool) { throw_nogpu(); }\r
-void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int) { throw_nogpu(); }\r
+void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int, Stream&) { throw_nogpu(); }\r
+void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool, Stream&) { throw_nogpu(); }\r
+void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, float, bool, Stream&) { throw_nogpu(); }\r
+void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_nogpu(); }\r
 void cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); }\r
 void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); }\r
-void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&) { throw_nogpu(); }\r
+void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream&) { throw_nogpu(); }\r
 void cv::gpu::pyrDown(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }\r
 void cv::gpu::pyrUp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }\r
 void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); }\r
@@ -151,10 +153,10 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
 \r
 namespace cv { namespace gpu {  namespace imgproc\r
 {\r
-    extern "C" void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps);\r
+    void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);\r
 }}}\r
 \r
-void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria)\r
+void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria, Stream& stream)\r
 {\r
     if( src.empty() )\r
         CV_Error( CV_StsBadArg, "The input image is empty" );\r
@@ -174,7 +176,7 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
         eps = 1.f;\r
     eps = (float)std::max(criteria.epsilon, 0.0);\r
 \r
-    imgproc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps);\r
+    imgproc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));\r
 }\r
 \r
 ////////////////////////////////////////////////////////////////////////\r
@@ -182,10 +184,10 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
 \r
 namespace cv { namespace gpu {  namespace imgproc\r
 {\r
-    extern "C" void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps);\r
+    void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream);\r
 }}}\r
 \r
-void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria)\r
+void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria, Stream& stream)\r
 {\r
     if( src.empty() )\r
         CV_Error( CV_StsBadArg, "The input image is empty" );\r
@@ -206,7 +208,7 @@ void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int
         eps = 1.f;\r
     eps = (float)std::max(criteria.epsilon, 0.0);\r
 \r
-    imgproc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps);\r
+    imgproc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));\r
 }\r
 \r
 ////////////////////////////////////////////////////////////////////////\r
@@ -766,14 +768,14 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
        cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );\r
 \r
     Ncv32u bufSize;\r
-    nppSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );\r
+    ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );\r
     ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);\r
 \r
     cudaStream_t stream = StreamAccessor::getStream(s);\r
 \r
     NppStStreamHandler h(stream);\r
 \r
-    nppSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step), \r
+    ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step), \r
         sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );\r
 \r
     if (stream == 0)\r
@@ -819,7 +821,7 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
        cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );\r
 \r
     Ncv32u bufSize;\r
-    nppSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));      \r
+    ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));      \r
     GpuMat buf(1, bufSize, CV_8U);\r
 \r
     cudaStream_t stream = StreamAccessor::getStream(s);\r
@@ -827,7 +829,7 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
     NppStStreamHandler h(stream);\r
 \r
     sqsum.create(src.rows + 1, src.cols + 1, CV_64F);\r
-    nppSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step), \r
+    ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step), \r
             sqsum.ptr<Ncv64u>(0), static_cast<int>(sqsum.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));\r
 \r
     if (stream == 0)\r
@@ -1260,16 +1262,16 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat&
 \r
 namespace cv { namespace gpu { namespace imgproc {\r
 \r
-    void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst);\r
-    void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type);\r
-    void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type);\r
+    void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream);\r
+    void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);\r
+    void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);\r
 \r
 }}}\r
 \r
 namespace \r
 {\r
     template <typename T>\r
-    void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType)\r
+    void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)\r
     {   \r
         double scale = (double)(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;\r
         if (ksize < 0) \r
@@ -1283,25 +1285,25 @@ namespace
 \r
         if (ksize > 0)\r
         {\r
-            Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, borderType);\r
-            Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, borderType);\r
+            Sobel(src, Dx, CV_32F, 1, 0, buf, ksize, scale, borderType, -1, stream);\r
+            Sobel(src, Dy, CV_32F, 0, 1, buf, ksize, scale, borderType, -1, stream);\r
         }\r
         else\r
         {\r
-            Scharr(src, Dx, CV_32F, 1, 0, scale, borderType);\r
-            Scharr(src, Dy, CV_32F, 0, 1, scale, borderType);\r
+            Scharr(src, Dx, CV_32F, 1, 0, buf, scale, borderType, -1, stream);\r
+            Scharr(src, Dy, CV_32F, 0, 1, buf, scale, borderType, -1, stream);\r
         }\r
     }\r
 \r
-    void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType)\r
+    void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)\r
     {\r
         switch (src.type())\r
         {\r
         case CV_8U:\r
-            extractCovData<unsigned char>(src, Dx, Dy, blockSize, ksize, borderType);\r
+            extractCovData<unsigned char>(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);\r
             break;\r
         case CV_32F:\r
-            extractCovData<float>(src, Dx, Dy, blockSize, ksize, borderType);\r
+            extractCovData<float>(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);\r
             break;\r
         default:\r
             CV_Error(CV_StsBadArg, "extractCovData: unsupported type of the source matrix");\r
@@ -1344,15 +1346,21 @@ void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ks
 \r
 void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType)\r
 {\r
+    GpuMat buf;\r
+    cornerHarris(src, dst, Dx, Dy, buf, blockSize, ksize, k, borderType);\r
+}\r
+\r
+void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, int borderType, Stream& stream)\r
+{\r
     CV_Assert(borderType == cv::BORDER_REFLECT101 ||\r
               borderType == cv::BORDER_REPLICATE);\r
 \r
     int gpuBorderType;\r
     CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));\r
 \r
-    extractCovData(src, Dx, Dy, blockSize, ksize, borderType);\r
+    extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);\r
     dst.create(src.size(), CV_32F);\r
-    imgproc::cornerHarris_caller(blockSize, (float)k, Dx, Dy, dst, gpuBorderType);\r
+    imgproc::cornerHarris_caller(blockSize, (float)k, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));\r
 }\r
 \r
 void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType)\r
@@ -1362,6 +1370,12 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, i
 }\r
 \r
 void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType)\r
+{\r
+    GpuMat buf;\r
+    cornerMinEigenVal(src, dst, Dx, Dy, buf, blockSize, ksize, borderType);\r
+}\r
+\r
+void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)\r
 {  \r
     CV_Assert(borderType == cv::BORDER_REFLECT101 ||\r
               borderType == cv::BORDER_REPLICATE);\r
@@ -1369,9 +1383,9 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM
     int gpuBorderType;\r
     CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));\r
 \r
-    extractCovData(src, Dx, Dy, blockSize, ksize, borderType);    \r
+    extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);    \r
     dst.create(src.size(), CV_32F);\r
-    imgproc::cornerMinEigenVal_caller(blockSize, Dx, Dy, dst, gpuBorderType);\r
+    imgproc::cornerMinEigenVal_caller(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));\r
 }\r
 \r
 //////////////////////////////////////////////////////////////////////////////\r
@@ -1379,21 +1393,16 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM
 \r
 namespace cv { namespace gpu { namespace imgproc \r
 {\r
-    void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, \r
-                      DevMem2D_<cufftComplex> c);\r
+    void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);\r
 \r
-    void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, \r
-                           DevMem2D_<cufftComplex> c);\r
+    void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);\r
 }}}\r
 \r
 \r
-void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, \r
-                           int flags, bool conjB) \r
+void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream) \r
 {\r
-    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, \r
-                           DevMem2D_<cufftComplex>);\r
-    static Caller callers[] = { imgproc::mulSpectrums, \r
-                                imgproc::mulSpectrums_CONJ };\r
+    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, DevMem2D_<cufftComplex>, cudaStream_t stream);\r
+    static Caller callers[] = { imgproc::mulSpectrums, imgproc::mulSpectrums_CONJ };\r
 \r
     CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);\r
     CV_Assert(a.size() == b.size());\r
@@ -1401,7 +1410,7 @@ void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c,
     c.create(a.size(), CV_32FC2);\r
 \r
     Caller caller = callers[(int)conjB];\r
-    caller(a, b, c);\r
+    caller(a, b, c, StreamAccessor::getStream(stream));\r
 }\r
 \r
 //////////////////////////////////////////////////////////////////////////////\r
@@ -1409,21 +1418,16 @@ void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c,
 \r
 namespace cv { namespace gpu { namespace imgproc \r
 {\r
-    void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,\r
-                             float scale, DevMem2D_<cufftComplex> c);\r
+    void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);\r
 \r
-    void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,\r
-                                  float scale, DevMem2D_<cufftComplex> c);\r
+    void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);\r
 }}}\r
 \r
 \r
-void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c,\r
-                                  int flags, float scale, bool conjB) \r
+void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream) \r
 {\r
-    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>,\r
-                           float scale, DevMem2D_<cufftComplex>);\r
-    static Caller callers[] = { imgproc::mulAndScaleSpectrums, \r
-                                imgproc::mulAndScaleSpectrums_CONJ };\r
+    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, DevMem2D_<cufftComplex>, cudaStream_t stream);\r
+    static Caller callers[] = { imgproc::mulAndScaleSpectrums, imgproc::mulAndScaleSpectrums_CONJ };\r
 \r
     CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);\r
     CV_Assert(a.size() == b.size());\r
@@ -1431,14 +1435,26 @@ void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c,
     c.create(a.size(), CV_32FC2);\r
 \r
     Caller caller = callers[(int)conjB];\r
-    caller(a, b, scale, c);\r
+    caller(a, b, scale, c, StreamAccessor::getStream(stream));\r
 }\r
 \r
 //////////////////////////////////////////////////////////////////////////////\r
 // dft\r
 \r
-void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags)\r
+void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stream& stream)\r
 {\r
+#ifndef HAVE_CUFFT\r
+\r
+    OPENCV_GPU_UNUSED(src);\r
+    OPENCV_GPU_UNUSED(dst);\r
+    OPENCV_GPU_UNUSED(dft_size);\r
+    OPENCV_GPU_UNUSED(flags);\r
+    OPENCV_GPU_UNUSED(stream);\r
+\r
+    throw_nogpu();\r
+\r
+#else\r
+\r
     CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2);\r
 \r
     // We don't support unpacked output (in the case of real input)\r
@@ -1483,6 +1499,8 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags)
     else\r
         cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type);\r
 \r
+    cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );\r
+\r
     if (is_complex_input)\r
     {\r
         if (is_complex_output)\r
@@ -1514,7 +1532,9 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags)
     cufftSafeCall(cufftDestroy(plan));\r
 \r
     if (is_scaled_dft)\r
-        multiply(dst, Scalar::all(1. / dft_size.area()), dst);\r
+        multiply(dst, Scalar::all(1. / dft_size.area()), dst, 1, -1, stream);\r
+\r
+#endif\r
 }\r
 \r
 //////////////////////////////////////////////////////////////////////////////\r
@@ -1563,8 +1583,7 @@ Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size templ_size)
 }\r
 \r
 \r
-void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, \r
-                       bool ccorr)\r
+void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr)\r
 {\r
     ConvolveBuf buf;\r
     convolve(image, templ, result, ccorr, buf);\r
@@ -1572,12 +1591,37 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
 \r
 namespace cv { namespace gpu { namespace imgproc\r
 {\r
-    void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel);\r
+    void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream);\r
 }}}\r
 \r
-void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, \r
-                       bool ccorr, ConvolveBuf& buf)\r
+void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream)\r
 {\r
+#ifndef HAVE_CUFFT\r
+\r
+    CV_Assert(image.type() == CV_32F);\r
+    CV_Assert(templ.type() == CV_32F);\r
+    CV_Assert(templ.cols <= 17 && templ.rows <= 17);\r
+    \r
+    result.create(image.size(), CV_32F);\r
+\r
+    GpuMat& contKernel = buf.templ_block;\r
+\r
+    if (templ.isContinuous())\r
+        contKernel = templ;\r
+    else\r
+    {\r
+        contKernel = createContinuous(templ.size(), templ.type());\r
+\r
+        if (stream)\r
+            stream.enqueueCopy(templ, contKernel);\r
+        else\r
+            templ.copyTo(contKernel);\r
+    }\r
+\r
+    imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));\r
+\r
+#else\r
+\r
     StaticAssert<sizeof(float) == sizeof(cufftReal)>::check();\r
     StaticAssert<sizeof(float) * 2 == sizeof(cufftComplex)>::check();\r
 \r
@@ -1587,77 +1631,91 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
     if (templ.cols < 13 && templ.rows < 13)\r
     {\r
         result.create(image.size(), CV_32F);\r
-        GpuMat contKernel;\r
+\r
+        GpuMat& contKernel = buf.templ_block;\r
 \r
         if (templ.isContinuous())\r
             contKernel = templ;\r
         else\r
         {\r
             contKernel = createContinuous(templ.size(), templ.type());\r
-            templ.copyTo(contKernel);\r
-        }\r
 \r
-        imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>());\r
+            if (stream)\r
+                stream.enqueueCopy(templ, contKernel);\r
+            else\r
+                templ.copyTo(contKernel);\r
+        }\r
 \r
-        return;\r
+        imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));\r
     }\r
+    else\r
+    {\r
+        buf.create(image.size(), templ.size());\r
+        result.create(buf.result_size, CV_32F);\r
 \r
-    buf.create(image.size(), templ.size());\r
-    result.create(buf.result_size, CV_32F);\r
+        Size& block_size = buf.block_size;\r
+        Size& dft_size = buf.dft_size;\r
 \r
-    Size& block_size = buf.block_size;\r
-    Size& dft_size = buf.dft_size;\r
+        GpuMat& image_block = buf.image_block;\r
+        GpuMat& templ_block = buf.templ_block;\r
+        GpuMat& result_data = buf.result_data;\r
 \r
-    GpuMat& image_block = buf.image_block;\r
-    GpuMat& templ_block = buf.templ_block;\r
-    GpuMat& result_data = buf.result_data;\r
+        GpuMat& image_spect = buf.image_spect;\r
+        GpuMat& templ_spect = buf.templ_spect;\r
+        GpuMat& result_spect = buf.result_spect;\r
 \r
-    GpuMat& image_spect = buf.image_spect;\r
-    GpuMat& templ_spect = buf.templ_spect;\r
-    GpuMat& result_spect = buf.result_spect;\r
+        cufftHandle planR2C, planC2R;\r
+        cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));\r
+        cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));\r
 \r
-    cufftHandle planR2C, planC2R;\r
-    cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));\r
-    cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));\r
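+        // Attach both plans to the caller's stream so the cufftExec* calls below are issued on it\r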
+        cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) );\r
+        cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );\r
 \r
-    GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step);\r
-    copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, \r
-                   templ_block.cols - templ_roi.cols, 0);\r
+        GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step);\r
+        copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, \r
+                       templ_block.cols - templ_roi.cols, 0, Scalar(), stream);\r
 \r
-    cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(), \r
-                               templ_spect.ptr<cufftComplex>()));\r
+        cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(), \r
+                                   templ_spect.ptr<cufftComplex>()));\r
 \r
-    // Process all blocks of the result matrix\r
-    for (int y = 0; y < result.rows; y += block_size.height)\r
-    {\r
-        for (int x = 0; x < result.cols; x += block_size.width)\r
+        // Process all blocks of the result matrix\r
+        for (int y = 0; y < result.rows; y += block_size.height)\r
         {\r
-            Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,\r
-                                std::min(y + dft_size.height, image.rows) - y);\r
-            GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x), \r
-                             image.step);\r
-            copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,\r
-                           0, image_block.cols - image_roi.cols, 0);\r
-\r
-            cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(), \r
-                                       image_spect.ptr<cufftComplex>()));\r
-            mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,\r
-                                 1.f / dft_size.area(), ccorr);\r
-            cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(), \r
-                                       result_data.ptr<cufftReal>()));\r
-\r
-            Size result_roi_size(std::min(x + block_size.width, result.cols) - x,\r
-                                 std::min(y + block_size.height, result.rows) - y);\r
-            GpuMat result_roi(result_roi_size, result.type(), \r
-                              (void*)(result.ptr<float>(y) + x), result.step);\r
-            GpuMat result_block(result_roi_size, result_data.type(), \r
-                                result_data.ptr(), result_data.step);\r
-            result_block.copyTo(result_roi);\r
+            for (int x = 0; x < result.cols; x += block_size.width)\r
+            {\r
+                Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,\r
+                                    std::min(y + dft_size.height, image.rows) - y);\r
+                GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x), \r
+                                 image.step);\r
+                copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,\r
+                               0, image_block.cols - image_roi.cols, 0, Scalar(), stream);\r
+\r
+                cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(), \r
+                                           image_spect.ptr<cufftComplex>()));\r
+                mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,\r
+                                     1.f / dft_size.area(), ccorr, stream);\r
+                cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(), \r
+                                           result_data.ptr<cufftReal>()));\r
+\r
+                Size result_roi_size(std::min(x + block_size.width, result.cols) - x,\r
+                                     std::min(y + block_size.height, result.rows) - y);\r
+                GpuMat result_roi(result_roi_size, result.type(), \r
+                                  (void*)(result.ptr<float>(y) + x), result.step);\r
+                GpuMat result_block(result_roi_size, result_data.type(), \r
+                                    result_data.ptr(), result_data.step);\r
+\r
+                if (stream)\r
+                    stream.enqueueCopy(result_block, result_roi);\r
+                else\r
+                    result_block.copyTo(result_roi);\r
+            }\r
         }\r
+\r
+        cufftSafeCall(cufftDestroy(planR2C));\r
+        cufftSafeCall(cufftDestroy(planC2R));\r
     }\r
 \r
-    cufftSafeCall(cufftDestroy(planR2C));\r
-    cufftSafeCall(cufftDestroy(planC2R));\r
+#endif\r
 }\r
 \r
 //////////////////////////////////////////////////////////////////////////////\r
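
With the Stream overloads added above, the FFT-related routines can be issued without blocking the host. A short usage sketch; src_a and src_b are assumed to be same-sized CV_32F GpuMats on a CUDA-capable device, and the call sequence is illustrative rather than taken from the patch:

    #include <opencv2/gpu/gpu.hpp>

    void exampleAsyncSpectrumProduct(const cv::gpu::GpuMat& src_a, const cv::gpu::GpuMat& src_b)
    {
        cv::gpu::Stream stream;
        cv::gpu::GpuMat spect_a, spect_b, prod;

        // Both forward transforms and the spectrum product are enqueued on the
        // same stream; the host is free to continue while the GPU works
        cv::gpu::dft(src_a, spect_a, src_a.size(), 0, stream);
        cv::gpu::dft(src_b, spect_b, src_b.size(), 0, stream);
        cv::gpu::mulSpectrums(spect_a, spect_b, prod, 0, false, stream);

        stream.waitForCompletion(); // block only when the results are needed
    }
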
index 58bc8bb..e74d0fd 100644 (file)
@@ -47,43 +47,32 @@ using namespace cv::gpu;
 \r
 #if !defined (HAVE_CUDA)\r
 \r
-void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int) { throw_nogpu(); }\r
+void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }\r
 \r
 #else\r
 \r
 namespace cv { namespace gpu { namespace imgproc \r
 {  \r
-    void matchTemplateNaive_CCORR_8U(\r
-            const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn);\r
+    void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);\r
+    void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);\r
 \r
-    void matchTemplateNaive_CCORR_32F(\r
-            const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn);\r
+    void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);\r
+    void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);\r
 \r
-    void matchTemplateNaive_SQDIFF_8U(\r
-            const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn);\r
+    void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, \r
+        int cn, cudaStream_t stream);\r
 \r
-    void matchTemplateNaive_SQDIFF_32F(\r
-            const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn);\r
-\r
-    void matchTemplatePrepared_SQDIFF_8U(\r
-            int w, int h, const DevMem2D_<unsigned long long> image_sqsum, \r
-            unsigned int templ_sqsum, DevMem2Df result, int cn);\r
-\r
-    void matchTemplatePrepared_SQDIFF_NORMED_8U(\r
-            int w, int h, const DevMem2D_<unsigned long long> image_sqsum, \r
-            unsigned int templ_sqsum, DevMem2Df result, int cn);\r
-\r
-    void matchTemplatePrepared_CCOFF_8U(\r
-            int w, int h, const DevMem2D_<unsigned int> image_sum,\r
-            unsigned int templ_sum, DevMem2Df result);\r
+    void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, \r
+        int cn, cudaStream_t stream);\r
 \r
+    void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream);\r
     void matchTemplatePrepared_CCOFF_8UC2(\r
-            int w, int h, \r
-            const DevMem2D_<unsigned int> image_sum_r, \r
-            const DevMem2D_<unsigned int> image_sum_g,\r
-            unsigned int templ_sum_r, unsigned int templ_sum_g, \r
-            DevMem2Df result);\r
-\r
+        int w, int h,\r
+        const DevMem2D_<unsigned int> image_sum_r, \r
+        const DevMem2D_<unsigned int> image_sum_g, \r
+        unsigned int templ_sum_r,\r
+        unsigned int templ_sum_g, \r
+        DevMem2Df result, cudaStream_t stream);\r
     void matchTemplatePrepared_CCOFF_8UC3(\r
             int w, int h, \r
             const DevMem2D_<unsigned int> image_sum_r, \r
@@ -92,8 +81,7 @@ namespace cv { namespace gpu { namespace imgproc
             unsigned int templ_sum_r, \r
             unsigned int templ_sum_g, \r
             unsigned int templ_sum_b, \r
-            DevMem2Df result);\r
-\r
+            DevMem2Df result, cudaStream_t stream);\r
     void matchTemplatePrepared_CCOFF_8UC4(\r
             int w, int h, \r
             const DevMem2D_<unsigned int> image_sum_r, \r
@@ -104,22 +92,21 @@ namespace cv { namespace gpu { namespace imgproc
             unsigned int templ_sum_g, \r
             unsigned int templ_sum_b, \r
             unsigned int templ_sum_a, \r
-            DevMem2Df result);\r
+            DevMem2Df result, cudaStream_t stream);\r
+\r
 \r
     void matchTemplatePrepared_CCOFF_NORMED_8U(\r
             int w, int h, const DevMem2D_<unsigned int> image_sum, \r
             const DevMem2D_<unsigned long long> image_sqsum,\r
             unsigned int templ_sum, unsigned int templ_sqsum,\r
-            DevMem2Df result);\r
-\r
+            DevMem2Df result, cudaStream_t stream);\r
     void matchTemplatePrepared_CCOFF_NORMED_8UC2(\r
             int w, int h, \r
             const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
             const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,\r
             unsigned int templ_sum_r, unsigned int templ_sqsum_r,\r
             unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
-            DevMem2Df result);\r
-\r
+            DevMem2Df result, cudaStream_t stream);\r
     void matchTemplatePrepared_CCOFF_NORMED_8UC3(\r
             int w, int h, \r
             const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
@@ -128,8 +115,7 @@ namespace cv { namespace gpu { namespace imgproc
             unsigned int templ_sum_r, unsigned int templ_sqsum_r,\r
             unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
             unsigned int templ_sum_b, unsigned int templ_sqsum_b,\r
-            DevMem2Df result);\r
-\r
+            DevMem2Df result, cudaStream_t stream);\r
     void matchTemplatePrepared_CCOFF_NORMED_8UC4(\r
             int w, int h, \r
             const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,\r
@@ -140,12 +126,12 @@ namespace cv { namespace gpu { namespace imgproc
             unsigned int templ_sum_g, unsigned int templ_sqsum_g,\r
             unsigned int templ_sum_b, unsigned int templ_sqsum_b,\r
             unsigned int templ_sum_a, unsigned int templ_sqsum_a,\r
-            DevMem2Df result);\r
+            DevMem2Df result, cudaStream_t stream);\r
 \r
     void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, \r
-                      unsigned int templ_sqsum, DevMem2Df result, int cn);\r
+                      unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream);\r
 \r
-    void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn);\r
+    void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream);\r
 }}}\r
 \r
 \r
@@ -186,103 +172,111 @@ namespace
     }\r
 \r
     \r
-    void matchTemplate_CCORR_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result)\r
+    void matchTemplate_CCORR_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)\r
     {\r
         result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);\r
         if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_32F))\r
         {\r
-            imgproc::matchTemplateNaive_CCORR_32F(image, templ, result, image.channels());\r
+            imgproc::matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));\r
             return;\r
         }\r
 \r
         GpuMat result_;\r
-        convolve(image.reshape(1), templ.reshape(1), result_, true);\r
-        imgproc::extractFirstChannel_32F(result_, result, image.channels());\r
+        ConvolveBuf buf;\r
+        convolve(image.reshape(1), templ.reshape(1), result_, true, buf, stream);\r
+        imgproc::extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));\r
     }\r
 \r
 \r
-    void matchTemplate_CCORR_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result)\r
+    void matchTemplate_CCORR_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)\r
     {\r
         if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_8U))\r
         {\r
             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);\r
-            imgproc::matchTemplateNaive_CCORR_8U(image, templ, result, image.channels());\r
+            imgproc::matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));\r
             return;\r
         }\r
 \r
         GpuMat imagef, templf;\r
-        image.convertTo(imagef, CV_32F);\r
-        templ.convertTo(templf, CV_32F);\r
-        matchTemplate_CCORR_32F(imagef, templf, result);\r
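+        // Convert to 32F on the supplied stream when there is one; otherwise fall back to a blocking convert\r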
+        if (stream)\r
+        {\r
+            stream.enqueueConvert(image, imagef, CV_32F);\r
+            stream.enqueueConvert(templ, templf, CV_32F);\r
+        }\r
+        else\r
+        {\r
+            image.convertTo(imagef, CV_32F);\r
+            templ.convertTo(templf, CV_32F);\r
+        }\r
+        matchTemplate_CCORR_32F(imagef, templf, result, stream);\r
     }\r
 \r
 \r
-    void matchTemplate_CCORR_NORMED_8U(const GpuMat& image, const GpuMat& templ, \r
-                                       GpuMat& result)\r
+    void matchTemplate_CCORR_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)\r
     {\r
-        matchTemplate_CCORR_8U(image, templ, result);\r
+        matchTemplate_CCORR_8U(image, templ, result, stream);\r
 \r
         GpuMat img_sqsum;\r
-        sqrIntegral(image.reshape(1), img_sqsum);\r
+        sqrIntegral(image.reshape(1), img_sqsum, stream);\r
 \r
         unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];\r
         imgproc::normalize_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, \r
-                              result, image.channels());\r
+                              result, image.channels(), StreamAccessor::getStream(stream));\r
     }\r
 \r
     \r
-    void matchTemplate_SQDIFF_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result)\r
+    void matchTemplate_SQDIFF_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)\r
     {\r
         result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);\r
-        imgproc::matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels());\r
+        imgproc::matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));\r
     }\r
 \r
 \r
-    void matchTemplate_SQDIFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result)\r
+    void matchTemplate_SQDIFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)\r
     {\r
         if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, CV_8U))\r
         {\r
             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);\r
-            imgproc::matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels());\r
+            imgproc::matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));\r
             return;\r
         }\r
 \r
         GpuMat img_sqsum;\r
-        sqrIntegral(image.reshape(1), img_sqsum);\r
+        sqrIntegral(image.reshape(1), img_sqsum, stream);\r
 \r
         unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];\r
 \r
-        matchTemplate_CCORR_8U(image, templ, result);\r
+        matchTemplate_CCORR_8U(image, templ, result, stream);\r
         imgproc::matchTemplatePrepared_SQDIFF_8U(\r
-                templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels());\r
+                templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));\r
     }\r
 \r
 \r
-    void matchTemplate_SQDIFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result)\r
+    void matchTemplate_SQDIFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)\r
     {\r
         GpuMat img_sqsum;\r
-        sqrIntegral(image.reshape(1), img_sqsum);\r
+        sqrIntegral(image.reshape(1), img_sqsum, stream);\r
 \r
         unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];\r
 \r
-        matchTemplate_CCORR_8U(image, templ, result);\r
+        matchTemplate_CCORR_8U(image, templ, result, stream);\r
         imgproc::matchTemplatePrepared_SQDIFF_NORMED_8U(\r
-                templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels());\r
+                templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));\r
     }\r
 \r
 \r
-    void matchTemplate_CCOFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result)\r
+    void matchTemplate_CCOFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)\r
     {\r
-        matchTemplate_CCORR_8U(image, templ, result);\r
+        matchTemplate_CCORR_8U(image, templ, result, stream);\r
 \r
         if (image.channels() == 1)\r
         {\r
             GpuMat image_sum;\r
-            integral(image, image_sum);\r
+            integral(image, image_sum, stream);\r
 \r
             unsigned int templ_sum = (unsigned int)sum(templ)[0];\r
             imgproc::matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, \r
-                                                    image_sum, templ_sum, result);\r
+                                                    image_sum, templ_sum, result, StreamAccessor::getStream(stream));\r
         }\r
         else\r
         {\r
@@ -291,7 +285,7 @@ namespace
 \r
             split(image, images);\r
             for (int i = 0; i < image.channels(); ++i)\r
-                integral(images[i], image_sums[i]);\r
+                integral(images[i], image_sums[i], stream);\r
 \r
             Scalar templ_sum = sum(templ);\r
 \r
@@ -301,19 +295,19 @@ namespace
                 imgproc::matchTemplatePrepared_CCOFF_8UC2(\r
                         templ.cols, templ.rows, image_sums[0], image_sums[1],\r
                         (unsigned int)templ_sum[0], (unsigned int)templ_sum[1],\r
-                        result);\r
+                        result, StreamAccessor::getStream(stream));\r
                 break;\r
             case 3:\r
                 imgproc::matchTemplatePrepared_CCOFF_8UC3(\r
                         templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2],\r
                         (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],\r
-                        result);\r
+                        result, StreamAccessor::getStream(stream));\r
                 break;\r
             case 4:\r
                 imgproc::matchTemplatePrepared_CCOFF_8UC4(\r
                         templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2], image_sums[3],\r
                         (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],\r
-                        (unsigned int)templ_sum[3], result);\r
+                        (unsigned int)templ_sum[3], result, StreamAccessor::getStream(stream));\r
                 break;\r
             default:\r
                 CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");\r
@@ -322,25 +316,34 @@ namespace
     }\r
 \r
 \r
-    void matchTemplate_CCOFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result)\r
+    void matchTemplate_CCOFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)\r
     {\r
         GpuMat imagef, templf;\r
-        image.convertTo(imagef, CV_32F);\r
-        templ.convertTo(templf, CV_32F);\r
-        matchTemplate_CCORR_32F(imagef, templf, result);\r
+        if (stream)\r
+        {\r
+            stream.enqueueConvert(image, imagef, CV_32F);\r
+            stream.enqueueConvert(templ, templf, CV_32F);\r
+        }\r
+        else\r
+        {\r
+            image.convertTo(imagef, CV_32F);\r
+            templ.convertTo(templf, CV_32F);\r
+        }\r
+\r
+        matchTemplate_CCORR_32F(imagef, templf, result, stream);\r
 \r
         if (image.channels() == 1)\r
         {\r
             GpuMat image_sum, image_sqsum;\r
-            integral(image, image_sum);\r
-            sqrIntegral(image, image_sqsum);\r
+            integral(image, image_sum, stream);\r
+            sqrIntegral(image, image_sqsum, stream);\r
 \r
             unsigned int templ_sum = (unsigned int)sum(templ)[0];\r
             unsigned int templ_sqsum = (unsigned int)sqrSum(templ)[0];\r
 \r
             imgproc::matchTemplatePrepared_CCOFF_NORMED_8U(\r
                     templ.cols, templ.rows, image_sum, image_sqsum, \r
-                    templ_sum, templ_sqsum, result);\r
+                    templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream));\r
         }\r
         else\r
         {\r
@@ -351,8 +354,8 @@ namespace
             split(image, images);\r
             for (int i = 0; i < image.channels(); ++i)\r
             {\r
-                integral(images[i], image_sums[i]);\r
-                sqrIntegral(images[i], image_sqsums[i]);\r
+                integral(images[i], image_sums[i], stream);\r
+                sqrIntegral(images[i], image_sqsums[i], stream);\r
             }\r
 \r
             Scalar templ_sum = sum(templ);\r
@@ -367,7 +370,7 @@ namespace
                         image_sums[1], image_sqsums[1],\r
                         (unsigned int)templ_sum[0], (unsigned int)templ_sqsum[0],\r
                         (unsigned int)templ_sum[1], (unsigned int)templ_sqsum[1],\r
-                        result);\r
+                        result, StreamAccessor::getStream(stream));\r
                 break;\r
             case 3:\r
                 imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC3(\r
@@ -378,7 +381,7 @@ namespace
                         (unsigned int)templ_sum[0], (unsigned int)templ_sqsum[0],\r
                         (unsigned int)templ_sum[1], (unsigned int)templ_sqsum[1],\r
                         (unsigned int)templ_sum[2], (unsigned int)templ_sqsum[2],\r
-                        result);\r
+                        result, StreamAccessor::getStream(stream));\r
                 break;\r
             case 4:\r
                 imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC4(\r
@@ -391,7 +394,7 @@ namespace
                         (unsigned int)templ_sum[1], (unsigned int)templ_sqsum[1],\r
                         (unsigned int)templ_sum[2], (unsigned int)templ_sqsum[2],\r
                         (unsigned int)templ_sum[3], (unsigned int)templ_sqsum[3],\r
-                        result);                \r
+                        result, StreamAccessor::getStream(stream));                \r
                 break;\r
             default:\r
                 CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");\r
@@ -401,12 +404,12 @@ namespace
 }\r
 \r
 \r
-void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method)\r
+void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream)\r
 {\r
     CV_Assert(image.type() == templ.type());\r
     CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);\r
 \r
-    typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&);\r
+    typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&, Stream& stream);\r
 \r
     static const Caller callers8U[] = { ::matchTemplate_SQDIFF_8U, ::matchTemplate_SQDIFF_NORMED_8U, \r
                                         ::matchTemplate_CCORR_8U, ::matchTemplate_CCORR_NORMED_8U, \r
@@ -424,7 +427,7 @@ void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& re
 \r
     Caller caller = callers[method];\r
     CV_Assert(caller);\r
-    caller(image, templ, result);\r
+    caller(image, templ, result, stream);\r
 }\r
 \r
 #endif\r
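Below is a minimal usage sketch for the asynchronous matchTemplate overload introduced in the hunks above. It assumes a CUDA-enabled build; the file names and the choice of CV_TM_CCORR are illustrative only, not part of this change.

    #include <opencv2/core/core.hpp>
    #include <opencv2/imgproc/imgproc.hpp>
    #include <opencv2/highgui/highgui.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        // Load 8-bit single-channel inputs (paths are illustrative).
        cv::Mat h_image = cv::imread("scene.png", 0);
        cv::Mat h_templ = cv::imread("patch.png", 0);

        cv::gpu::GpuMat d_image(h_image), d_templ(h_templ), d_result;

        // Enqueue the match on a user-created stream; the call returns
        // without waiting for the kernels to finish.
        cv::gpu::Stream stream;
        cv::gpu::matchTemplate(d_image, d_templ, d_result, CV_TM_CCORR, stream);

        // Other host work can overlap here; synchronize before reading back.
        stream.waitForCompletion();

        cv::Mat h_result;
        d_result.download(h_result);
        return 0;
    }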
index 7891ef5..19754c0 100644 (file)
@@ -59,10 +59,8 @@ namespace
                       NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v, const cudaDeviceProp& devProp)\r
     {\r
         NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));\r
-        CV_Assert(gpuCounter.isInitialized());\r
 \r
-        NCVStatus ncvStat = NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0);\r
-        CV_Assert(ncvStat == NCV_SUCCESS);\r
+        ncvSafeCall( NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0) );\r
 \r
         return gpuCounter.maxSize();\r
     }\r
@@ -130,10 +128,8 @@ void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& f
     ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);\r
 \r
     NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), buf.ptr());\r
-    CV_Assert(gpuAllocator.isInitialized());\r
     \r
-    NCVStatus ncvStat = NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream);\r
-    CV_Assert(ncvStat == NCV_SUCCESS);\r
+    ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream) );\r
 }\r
 \r
 void cv::gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, const GpuMat& fu, const GpuMat& fv, const GpuMat& bu, const GpuMat& bv, \r
@@ -189,7 +185,7 @@ void cv::gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, cons
     state.ppBuffers[4] = bui.ptr<Ncv32f>();\r
     state.ppBuffers[5] = bvi.ptr<Ncv32f>();\r
 \r
-    nppSafeCall( nppiStInterpolateFrames(&state) );\r
+    ncvSafeCall( nppiStInterpolateFrames(&state) );\r
 \r
     if (stream == 0)\r
         cudaSafeCall( cudaDeviceSynchronize() );\r
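The Brox optical flow path above now routes NCV failures through ncvSafeCall rather than plain CV_Assert checks, so they surface as ordinary OpenCV exceptions. A minimal usage sketch under that assumption follows; the constructor parameters and file names are illustrative values, not defaults mandated by this change.

    #include <opencv2/core/core.hpp>
    #include <opencv2/highgui/highgui.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        // Brox flow expects single-channel 32-bit float frames, here scaled to [0, 1].
        cv::Mat f0 = cv::imread("frame0.png", 0), f1 = cv::imread("frame1.png", 0);
        cv::Mat f0f, f1f;
        f0.convertTo(f0f, CV_32F, 1.0 / 255.0);
        f1.convertTo(f1f, CV_32F, 1.0 / 255.0);

        cv::gpu::GpuMat d_f0(f0f), d_f1(f1f), d_u, d_v;

        // alpha, gamma, pyramid scale factor, inner/outer/solver iterations.
        cv::gpu::BroxOpticalFlow flow(0.197f, 50.0f, 0.8f, 10, 77, 10);

        try
        {
            flow(d_f0, d_f1, d_u, d_v);   // an NCV failure now throws cv::Exception
        }
        catch (const cv::Exception& e)
        {
            // e.code and e.what() carry the GPU API error details.
            return 1;
        }

        cv::Mat u, v;
        d_u.download(u);
        d_v.download(v);
        return 0;
    }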
index ea5259b..3c84c93 100644 (file)
 // the use of this software, even if advised of the possibility of such damage.\r
 //\r
 //M*/\r
+\r
 #ifndef __OPENCV_PRECOMP_H__\r
 #define __OPENCV_PRECOMP_H__\r
 \r
 #if _MSC_VER >= 1200\r
-#pragma warning( disable: 4251 4710 4711 4514 4996 )\r
+    #pragma warning( disable: 4251 4710 4711 4514 4996 )\r
 #endif\r
 \r
 #ifdef HAVE_CVCONFIG_H\r
-#include "cvconfig.h"\r
+    #include "cvconfig.h"\r
 #endif\r
 \r
 #include <iostream>\r
 #include "opencv2/calib3d/calib3d.hpp"\r
 #include "opencv2/core/internal.hpp"\r
 \r
-#if defined(HAVE_CUDA)\r
+#define OPENCV_GPU_UNUSED(x) (void)x\r
+\r
+#ifdef HAVE_CUDA\r
 \r
-    #include "internal_shared.hpp"\r
     #include "cuda_runtime_api.h"\r
-    #include "cufft.h"\r
+    #include "npp.h"\r
+    \r
+    #ifdef HAVE_CUFFT\r
+        #include "cufft.h"\r
+    #endif\r
+\r
+    #ifdef HAVE_CUBLAS\r
+        #include "cublas.h"\r
+    #endif\r
+\r
+    #include "internal_shared.hpp"\r
     #include "opencv2/gpu/stream_accessor.hpp"\r
-    #include "npp.h"    \r
     \r
     #include "nvidia/core/NCV.hpp"\r
     #include "nvidia/NPP_staging/NPP_staging.hpp"\r
     #include "nvidia/NCVHaarObjectDetection.hpp"\r
     #include "nvidia/NCVBroxOpticalFlow.hpp"\r
 \r
-#define CUDART_MINIMUM_REQUIRED_VERSION 4000\r
-#define NPP_MINIMUM_REQUIRED_VERSION 4000\r
+    #define CUDART_MINIMUM_REQUIRED_VERSION 4000\r
+    #define NPP_MINIMUM_REQUIRED_VERSION 4000\r
 \r
-#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)\r
-    #error "Insufficient Cuda Runtime library version, please update it."\r
-#endif\r
+    #if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)\r
+        #error "Insufficient Cuda Runtime library version, please update it."\r
+    #endif\r
 \r
-#if (NPP_VERSION_MAJOR*1000+NPP_VERSION_MINOR*100+NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION)\r
-    #error "Insufficient NPP version, please update it."\r
-#endif\r
+    #if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION)\r
+        #error "Insufficient NPP version, please update it."\r
+    #endif\r
 \r
-#if defined(CUDA_ARCH_BIN_OR_PTX_10)\r
-    #error "OpenCV GPU module doesn't support NVIDIA compute capability 1.0"\r
-#endif\r
+    #if defined(CUDA_ARCH_BIN_OR_PTX_10)\r
+        #error "OpenCV GPU module doesn't support NVIDIA compute capability 1.0"\r
+    #endif\r
 \r
     static inline void throw_nogpu() { CV_Error(CV_GpuNotSupported, "The called functionality is disabled for current build or platform"); }\r
 \r