Merge branch 'master' into gpu-cuda-rename
author Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Wed, 4 Sep 2013 05:58:32 +0000 (09:58 +0400)
committer Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Wed, 4 Sep 2013 05:58:32 +0000 (09:58 +0400)
Conflicts:
modules/cudaoptflow/perf/perf_optflow.cpp
modules/cudaoptflow/src/tvl1flow.cpp
samples/gpu/stereo_multi.cpp

1  2 
modules/cudaoptflow/perf/perf_optflow.cpp
modules/cudaoptflow/src/cuda/tvl1flow.cu
modules/cudaoptflow/src/tvl1flow.cpp
samples/gpu/stereo_multi.cpp

@@@ -368,8 -368,8 +368,8 @@@ PERF_TEST_P(ImagePair, OpticalFlowDual_
  
          TEST_CYCLE() d_alg(d_frame0, d_frame1, u, v);
  
-         CUDA_SANITY_CHECK(u, 1e-2);
-         CUDA_SANITY_CHECK(v, 1e-2);
 -        GPU_SANITY_CHECK(u, 1e-1);
 -        GPU_SANITY_CHECK(v, 1e-1);
++        CUDA_SANITY_CHECK(u, 1e-1);
++        CUDA_SANITY_CHECK(v, 1e-1);
      }
      else
      {
@@@ -218,12 -218,24 +218,24 @@@ void cv::cuda::OpticalFlowDual_TVL1_CUD
          warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c);
  
          double error = std::numeric_limits<double>::max();
+         double prevError = 0.0;
          for (int n = 0; error > scaledEpsilon && n < iterations; ++n)
          {
-             estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, diff, l_t, static_cast<float>(theta));
+             // some tweaks to make sum operation less frequently
+             bool calcError = (epsilon > 0) && (n & 0x1) && (prevError < scaledEpsilon);
  
-             if (epsilon > 0)
+             estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, diff, l_t, static_cast<float>(theta), calcError);
+             if (calcError)
+             {
 -                error = gpu::sum(diff, norm_buf)[0];
 +                error = cuda::sum(diff, norm_buf)[0];
+                 prevError = error;
+             }
+             else
+             {
+                 error = std::numeric_limits<double>::max();
+                 prevError -= scaledEpsilon;
+             }
  
              estimateDualVariables(u1, u2, p11, p12, p21, p22, taut);
          }
  #endif
  
  #include <iostream>
- #include "cvconfig.h"
+ #include <iomanip>
  #include "opencv2/core.hpp"
  #include "opencv2/highgui.hpp"
 -#include "opencv2/gpustereo.hpp"
+ #include "opencv2/imgproc.hpp"
+ #include "opencv2/contrib.hpp"
 +#include "opencv2/cudastereo.hpp"
  
- #ifdef HAVE_TBB
- #  include "tbb/tbb_stddef.h"
- #  if TBB_VERSION_MAJOR*100 + TBB_VERSION_MINOR >= 202
- #    include "tbb/tbb.h"
- #    include "tbb/task.h"
- #    undef min
- #    undef max
- #  else
- #    undef HAVE_TBB
- #  endif
- #endif
+ using namespace std;
+ using namespace cv;
 -using namespace cv::gpu;
++using namespace cv::cuda;
  
- #if !defined(HAVE_CUDA) || !defined(HAVE_TBB)
+ ///////////////////////////////////////////////////////////
+ // Thread
+ // OS-specific wrappers for multi-threading
  
- int main()
+ #ifdef WIN32
+ class Thread
  {
- #if !defined(HAVE_CUDA)
-     std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true).\n";
- #endif
+     struct UserData
+     {
+         void (*func)(void* userData);
+         void* param;
+     };
+     static DWORD WINAPI WinThreadFunction(LPVOID lpParam)
+     {
+         UserData* userData = static_cast<UserData*>(lpParam);
+         userData->func(userData->param);
+         return 0;
+     }
+     UserData userData_;
+     HANDLE thread_;
+     DWORD threadId_;
+ public:
+     Thread(void (*func)(void* userData), void* userData)
+     {
+         userData_.func = func;
+         userData_.param = userData;
+         thread_ = CreateThread(
+             NULL,                   // default security attributes
+             0,                      // use default stack size
+             WinThreadFunction,      // thread function name
+             &userData_,             // argument to thread function
+             0,                      // use default creation flags
+             &threadId_);            // returns the thread identifier
+     }
+     ~Thread()
+     {
+         CloseHandle(thread_);
+     }
+     void wait()
+     {
+         WaitForSingleObject(thread_, INFINITE);
+     }
+ };
+ #else
+ class Thread
+ {
+     struct UserData
+     {
+         void (*func)(void* userData);
+         void* param;
+     };
+     static void* PThreadFunction(void* lpParam)
+     {
+         UserData* userData = static_cast<UserData*>(lpParam);
+         userData->func(userData->param);
+         return 0;
+     }
+     pthread_t thread_;
+     UserData userData_;
+ public:
+     Thread(void (*func)(void* userData), void* userData)
+     {
+         userData_.func = func;
+         userData_.param = userData;
+         pthread_create(&thread_, NULL, PThreadFunction, &userData_);
+     }
  
- #if !defined(HAVE_TBB)
-     std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
+     ~Thread()
+     {
+         pthread_detach(thread_);
+     }
+     void wait()
+     {
+         pthread_join(thread_, NULL);
+     }
+ };
  #endif
  
-     return 0;
+ ///////////////////////////////////////////////////////////
+ // StereoSingleGpu
+ // Run Stereo algorithm on single GPU
+ class StereoSingleGpu
+ {
+ public:
+     explicit StereoSingleGpu(int deviceId = 0);
+     ~StereoSingleGpu();
+     void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity);
+ private:
+     int deviceId_;
+     GpuMat d_leftFrame;
+     GpuMat d_rightFrame;
+     GpuMat d_disparity;
 -    Ptr<gpu::StereoBM> d_alg;
++    Ptr<cuda::StereoBM> d_alg;
+ };
+ StereoSingleGpu::StereoSingleGpu(int deviceId) : deviceId_(deviceId)
+ {
 -    gpu::setDevice(deviceId_);
 -    d_alg = gpu::createStereoBM(256);
++    cuda::setDevice(deviceId_);
++    d_alg = cuda::createStereoBM(256);
  }
  
- #else
+ StereoSingleGpu::~StereoSingleGpu()
+ {
 -    gpu::setDevice(deviceId_);
++    cuda::setDevice(deviceId_);
+     d_leftFrame.release();
+     d_rightFrame.release();
+     d_disparity.release();
+     d_alg.release();
+ }
  
- using namespace std;
- using namespace cv;
- using namespace cv::cuda;
+ void StereoSingleGpu::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity)
+ {
 -    gpu::setDevice(deviceId_);
++    cuda::setDevice(deviceId_);
+     d_leftFrame.upload(leftFrame);
+     d_rightFrame.upload(rightFrame);
+     d_alg->compute(d_leftFrame, d_rightFrame, d_disparity);
+     d_disparity.download(disparity);
+ }
+ ///////////////////////////////////////////////////////////
+ // StereoMultiGpuThread
+ // Run Stereo algorithm on two GPUs using different host threads
+ class StereoMultiGpuThread
+ {
+ public:
+     StereoMultiGpuThread();
+     ~StereoMultiGpuThread();
+     void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity);
+ private:
+     GpuMat d_leftFrames[2];
+     GpuMat d_rightFrames[2];
+     GpuMat d_disparities[2];
 -    Ptr<gpu::StereoBM> d_algs[2];
++    Ptr<cuda::StereoBM> d_algs[2];
+     struct StereoLaunchData
+     {
+         int deviceId;
+         Mat leftFrame;
+         Mat rightFrame;
+         Mat disparity;
+         GpuMat* d_leftFrame;
+         GpuMat* d_rightFrame;
+         GpuMat* d_disparity;
 -        Ptr<gpu::StereoBM> d_alg;
++        Ptr<cuda::StereoBM> d_alg;
+     };
+     static void launchGpuStereoAlg(void* userData);
+ };
  
- struct Worker { void operator()(int device_id) const; };
+ StereoMultiGpuThread::StereoMultiGpuThread()
+ {
 -    gpu::setDevice(0);
 -    d_algs[0] = gpu::createStereoBM(256);
++    cuda::setDevice(0);
++    d_algs[0] = cuda::createStereoBM(256);
  
- // GPUs data
- GpuMat d_left[2];
- GpuMat d_right[2];
- Ptr<cuda::StereoBM> bm[2];
- GpuMat d_result[2];
 -    gpu::setDevice(1);
 -    d_algs[1] = gpu::createStereoBM(256);
++    cuda::setDevice(1);
++    d_algs[1] = cuda::createStereoBM(256);
+ }
  
- static void printHelp()
+ StereoMultiGpuThread::~StereoMultiGpuThread()
  {
-     std::cout << "Usage: stereo_multi_gpu --left <image> --right <image>\n";
 -    gpu::setDevice(0);
++    cuda::setDevice(0);
+     d_leftFrames[0].release();
+     d_rightFrames[0].release();
+     d_disparities[0].release();
+     d_algs[0].release();
 -    gpu::setDevice(1);
++    cuda::setDevice(1);
+     d_leftFrames[1].release();
+     d_rightFrames[1].release();
+     d_disparities[1].release();
+     d_algs[1].release();
  }
  
 -    gpu::setDevice(data->deviceId);
+ void StereoMultiGpuThread::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity)
+ {
+     disparity.create(leftFrame.size(), CV_8UC1);
+     // Split input data onto two parts for each GPUs.
+     // We add small border for each part,
+     // because original algorithm doesn't calculate disparity on image borders.
+     // With such padding we will get output in the middle of final result.
+     StereoLaunchData launchDatas[2];
+     launchDatas[0].deviceId = 0;
+     launchDatas[0].leftFrame = leftFrame.rowRange(0, leftFrame.rows / 2 + 32);
+     launchDatas[0].rightFrame = rightFrame.rowRange(0, rightFrame.rows / 2 + 32);
+     launchDatas[0].disparity = disparity.rowRange(0, leftFrame.rows / 2);
+     launchDatas[0].d_leftFrame = &d_leftFrames[0];
+     launchDatas[0].d_rightFrame = &d_rightFrames[0];
+     launchDatas[0].d_disparity = &d_disparities[0];
+     launchDatas[0].d_alg = d_algs[0];
+     launchDatas[1].deviceId = 1;
+     launchDatas[1].leftFrame = leftFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows);
+     launchDatas[1].rightFrame = rightFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows);
+     launchDatas[1].disparity = disparity.rowRange(leftFrame.rows / 2, leftFrame.rows);
+     launchDatas[1].d_leftFrame = &d_leftFrames[1];
+     launchDatas[1].d_rightFrame = &d_rightFrames[1];
+     launchDatas[1].d_disparity = &d_disparities[1];
+     launchDatas[1].d_alg = d_algs[1];
+     Thread thread0(launchGpuStereoAlg, &launchDatas[0]);
+     Thread thread1(launchGpuStereoAlg, &launchDatas[1]);
+     thread0.wait();
+     thread1.wait();
+ }
+ void StereoMultiGpuThread::launchGpuStereoAlg(void* userData)
+ {
+     StereoLaunchData* data = static_cast<StereoLaunchData*>(userData);
 -    Ptr<gpu::StereoBM> d_algs[2];
++    cuda::setDevice(data->deviceId);
+     data->d_leftFrame->upload(data->leftFrame);
+     data->d_rightFrame->upload(data->rightFrame);
+     data->d_alg->compute(*data->d_leftFrame, *data->d_rightFrame, *data->d_disparity);
+     if (data->deviceId == 0)
+         data->d_disparity->rowRange(0, data->d_disparity->rows - 32).download(data->disparity);
+     else
+         data->d_disparity->rowRange(32, data->d_disparity->rows).download(data->disparity);
+ }
+ ///////////////////////////////////////////////////////////
+ // StereoMultiGpuStream
+ // Run Stereo algorithm on two GPUs from single host thread using async API
+ class StereoMultiGpuStream
+ {
+ public:
+     StereoMultiGpuStream();
+     ~StereoMultiGpuStream();
+     void compute(const CudaMem& leftFrame, const CudaMem& rightFrame, CudaMem& disparity);
+ private:
+     GpuMat d_leftFrames[2];
+     GpuMat d_rightFrames[2];
+     GpuMat d_disparities[2];
 -    gpu::setDevice(0);
 -    d_algs[0] = gpu::createStereoBM(256);
++    Ptr<cuda::StereoBM> d_algs[2];
+     Ptr<Stream> streams[2];
+ };
+ StereoMultiGpuStream::StereoMultiGpuStream()
+ {
 -    gpu::setDevice(1);
 -    d_algs[1] = gpu::createStereoBM(256);
++    cuda::setDevice(0);
++    d_algs[0] = cuda::createStereoBM(256);
+     streams[0] = new Stream;
 -    gpu::setDevice(0);
++    cuda::setDevice(1);
++    d_algs[1] = cuda::createStereoBM(256);
+     streams[1] = new Stream;
+ }
+ StereoMultiGpuStream::~StereoMultiGpuStream()
+ {
 -    gpu::setDevice(1);
++    cuda::setDevice(0);
+     d_leftFrames[0].release();
+     d_rightFrames[0].release();
+     d_disparities[0].release();
+     d_algs[0].release();
+     streams[0].release();
 -    gpu::setDevice(0);
++    cuda::setDevice(1);
+     d_leftFrames[1].release();
+     d_rightFrames[1].release();
+     d_disparities[1].release();
+     d_algs[1].release();
+     streams[1].release();
+ }
+ void StereoMultiGpuStream::compute(const CudaMem& leftFrame, const CudaMem& rightFrame, CudaMem& disparity)
+ {
+     disparity.create(leftFrame.size(), CV_8UC1);
+     // Split input data onto two parts for each GPUs.
+     // We add small border for each part,
+     // because original algorithm doesn't calculate disparity on image borders.
+     // With such padding we will get output in the middle of final result.
+     Mat leftFrameHdr = leftFrame.createMatHeader();
+     Mat rightFrameHdr = rightFrame.createMatHeader();
+     Mat disparityHdr = disparity.createMatHeader();
+     Mat disparityPart0 = disparityHdr.rowRange(0, leftFrame.rows / 2);
+     Mat disparityPart1 = disparityHdr.rowRange(leftFrame.rows / 2, leftFrame.rows);
 -    gpu::setDevice(1);
++    cuda::setDevice(0);
+     d_leftFrames[0].upload(leftFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]);
+     d_rightFrames[0].upload(rightFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]);
+     d_algs[0]->compute(d_leftFrames[0], d_rightFrames[0], d_disparities[0], *streams[0]);
+     d_disparities[0].rowRange(0, leftFrame.rows / 2).download(disparityPart0, *streams[0]);
 -    gpu::setDevice(0);
++    cuda::setDevice(1);
+     d_leftFrames[1].upload(leftFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]);
+     d_rightFrames[1].upload(rightFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]);
+     d_algs[1]->compute(d_leftFrames[1], d_rightFrames[1], d_disparities[1], *streams[1]);
+     d_disparities[1].rowRange(32, d_disparities[1].rows).download(disparityPart1, *streams[1]);
 -    gpu::setDevice(1);
++    cuda::setDevice(0);
+     streams[0]->waitForCompletion();
++    cuda::setDevice(1);
+     streams[1]->waitForCompletion();
+ }
+ ///////////////////////////////////////////////////////////
+ // main
  int main(int argc, char** argv)
  {
-     if (argc < 5)
+     if (argc != 3)
      {
-         printHelp();
+         cerr << "Usage: stereo_multi_gpu <left_video> <right_video>" << endl;
          return -1;
      }
  
-     int num_devices = getCudaEnabledDeviceCount();
-     if (num_devices < 2)
+     const int numDevices = getCudaEnabledDeviceCount();
+     if (numDevices != 2)
      {
-         std::cout << "Two or more GPUs are required\n";
+         cerr << "Two GPUs are required" << endl;
          return -1;
      }
-     for (int i = 0; i < num_devices; ++i)
-     {
-         cv::cuda::printShortCudaDeviceInfo(i);
  
-         DeviceInfo dev_info(i);
-         if (!dev_info.isCompatible())
+     for (int i = 0; i < numDevices; ++i)
+     {
+         DeviceInfo devInfo(i);
+         if (!devInfo.isCompatible())
          {
-             std::cout << "CUDA module isn't built for GPU #" << i << " ("
-                  << dev_info.name() << ", CC " << dev_info.majorVersion()
-                  << dev_info.minorVersion() << "\n";
 -            cerr << "GPU module was't built for GPU #" << i << " ("
++            cerr << "CUDA module was't built for GPU #" << i << " ("
+                  << devInfo.name() << ", CC " << devInfo.majorVersion()
+                  << devInfo.minorVersion() << endl;
              return -1;
          }
+         printShortCudaDeviceInfo(i);
      }
  
-     // Load input data
-     Mat left, right;
-     for (int i = 1; i < argc; ++i)
+     VideoCapture leftVideo(argv[1]);
+     VideoCapture rightVideo(argv[2]);
+     if (!leftVideo.isOpened())
      {
-         if (string(argv[i]) == "--left")
-         {
-             left = imread(argv[++i], cv::IMREAD_GRAYSCALE);
-             CV_Assert(!left.empty());
-         }
-         else if (string(argv[i]) == "--right")
-         {
-             right = imread(argv[++i], cv::IMREAD_GRAYSCALE);
-             CV_Assert(!right.empty());
-         }
-         else if (string(argv[i]) == "--help")
+          cerr << "Can't open " << argv[1] << " video file" << endl;
+          return -1;
+     }
+     if (!rightVideo.isOpened())
+     {
+          cerr << "Can't open " << argv[2] << " video file" << endl;
+          return -1;
+     }
+     cout << endl;
+     cout << "This sample demonstrates working on one piece of data using two GPUs." << endl;
+     cout << "It splits input into two parts and processes them separately on different GPUs." << endl;
+     cout << endl;
+     Mat leftFrame, rightFrame;
+     CudaMem leftGrayFrame, rightGrayFrame;
+     StereoSingleGpu gpu0Alg(0);
+     StereoSingleGpu gpu1Alg(1);
+     StereoMultiGpuThread multiThreadAlg;
+     StereoMultiGpuStream multiStreamAlg;
+     Mat disparityGpu0;
+     Mat disparityGpu1;
+     Mat disparityMultiThread;
+     CudaMem disparityMultiStream;
+     Mat disparityGpu0Show;
+     Mat disparityGpu1Show;
+     Mat disparityMultiThreadShow;
+     Mat disparityMultiStreamShow;
+     TickMeter tm;
+     cout << "-------------------------------------------------------------------" << endl;
+     cout << "| Frame | GPU 0 ms | GPU 1 ms | Multi Thread ms | Multi Stream ms |" << endl;
+     cout << "-------------------------------------------------------------------" << endl;
+     for (int i = 0;; ++i)
+     {
+         leftVideo >> leftFrame;
+         rightVideo >> rightFrame;
+         if (leftFrame.empty() || rightFrame.empty())
+             break;
+         if (leftFrame.size() != rightFrame.size())
          {
-             printHelp();
+             cerr << "Frames have different sizes" << endl;
              return -1;
          }
-     }
  
-     // Split source images for processing on the GPU #0
-     setDevice(0);
-     d_left[0].upload(left.rowRange(0, left.rows / 2));
-     d_right[0].upload(right.rowRange(0, right.rows / 2));
-     bm[0] = cuda::createStereoBM();
-     // Split source images for processing on the GPU #1
-     setDevice(1);
-     d_left[1].upload(left.rowRange(left.rows / 2, left.rows));
-     d_right[1].upload(right.rowRange(right.rows / 2, right.rows));
-     bm[1] = cuda::createStereoBM();
-     // Execute calculation in two threads using two GPUs
-     int devices[] = {0, 1};
-     tbb::parallel_do(devices, devices + 2, Worker());
-     // Release the first GPU resources
-     setDevice(0);
-     imshow("GPU #0 result", Mat(d_result[0]));
-     d_left[0].release();
-     d_right[0].release();
-     d_result[0].release();
-     bm[0].release();
-     // Release the second GPU resources
-     setDevice(1);
-     imshow("GPU #1 result", Mat(d_result[1]));
-     d_left[1].release();
-     d_right[1].release();
-     d_result[1].release();
-     bm[1].release();
-     waitKey();
-     return 0;
- }
+         leftGrayFrame.create(leftFrame.size(), CV_8UC1);
+         rightGrayFrame.create(leftFrame.size(), CV_8UC1);
  
+         cvtColor(leftFrame, leftGrayFrame.createMatHeader(), COLOR_BGR2GRAY);
+         cvtColor(rightFrame, rightGrayFrame.createMatHeader(), COLOR_BGR2GRAY);
  
- void Worker::operator()(int device_id) const
- {
-     setDevice(device_id);
+         tm.reset(); tm.start();
+         gpu0Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
+                         disparityGpu0);
+         tm.stop();
  
-     bm[device_id]->compute(d_left[device_id], d_right[device_id], d_result[device_id]);
+         const double gpu0Time = tm.getTimeMilli();
  
-     std::cout << "GPU #" << device_id << " (" << DeviceInfo().name()
-         << "): finished\n";
- }
+         tm.reset(); tm.start();
+         gpu1Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
+                         disparityGpu1);
+         tm.stop();
  
- #endif
+         const double gpu1Time = tm.getTimeMilli();
+         tm.reset(); tm.start();
+         multiThreadAlg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
+                                disparityMultiThread);
+         tm.stop();
+         const double multiThreadTime = tm.getTimeMilli();
+         tm.reset(); tm.start();
+         multiStreamAlg.compute(leftGrayFrame, rightGrayFrame, disparityMultiStream);
+         tm.stop();
+         const double multiStreamTime = tm.getTimeMilli();
+         cout << "| " << setw(5) << i << " | "
+              << setw(8) << setprecision(1) << fixed << gpu0Time << " | "
+              << setw(8) << setprecision(1) << fixed << gpu1Time << " | "
+              << setw(15) << setprecision(1) << fixed << multiThreadTime << " | "
+              << setw(15) << setprecision(1) << fixed << multiStreamTime << " |" << endl;
+         resize(disparityGpu0, disparityGpu0Show, Size(1024, 768), 0, 0, INTER_AREA);
+         resize(disparityGpu1, disparityGpu1Show, Size(1024, 768), 0, 0, INTER_AREA);
+         resize(disparityMultiThread, disparityMultiThreadShow, Size(1024, 768), 0, 0, INTER_AREA);
+         resize(disparityMultiStream.createMatHeader(), disparityMultiStreamShow, Size(1024, 768), 0, 0, INTER_AREA);
+         imshow("disparityGpu0", disparityGpu0Show);
+         imshow("disparityGpu1", disparityGpu1Show);
+         imshow("disparityMultiThread", disparityMultiThreadShow);
+         imshow("disparityMultiStream", disparityMultiStreamShow);
+         const int key = waitKey(30) & 0xff;
+         if (key == 27)
+             break;
+     }
+     cout << "-------------------------------------------------------------------" << endl;
+     return 0;
+ }