#endif
#include <iostream>
- #include "cvconfig.h"
+ #include <iomanip>
+
#include "opencv2/core.hpp"
#include "opencv2/highgui.hpp"
-#include "opencv2/gpustereo.hpp"
+ #include "opencv2/imgproc.hpp"
+ #include "opencv2/contrib.hpp"
+#include "opencv2/cudastereo.hpp"
- #ifdef HAVE_TBB
- # include "tbb/tbb_stddef.h"
- # if TBB_VERSION_MAJOR*100 + TBB_VERSION_MINOR >= 202
- # include "tbb/tbb.h"
- # include "tbb/task.h"
- # undef min
- # undef max
- # else
- # undef HAVE_TBB
- # endif
- #endif
+ using namespace std;
+ using namespace cv;
-using namespace cv::gpu;
++using namespace cv::cuda;
- #if !defined(HAVE_CUDA) || !defined(HAVE_TBB)
+ ///////////////////////////////////////////////////////////
+ // Thread
+ // OS-specific wrappers for multi-threading
- int main()
+ #ifdef WIN32
// Win32 variant of Thread: runs func(userData) on a thread created with
// CreateThread in the constructor. wait() blocks on the thread handle and the
// destructor closes that handle.
// NOTE(review): the class is implicitly copyable; a copy would make ~Thread()
// call CloseHandle on the same handle twice. The only visible use
// (StereoMultiGpuThread::compute) creates local objects and never copies them.
+ class Thread
{
- #if !defined(HAVE_CUDA)
- std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true).\n";
- #endif
// Callback and its argument, bundled so they fit through the single
// LPVOID parameter of the thread entry point.
+ struct UserData
+ {
+ void (*func)(void* userData);
+ void* param;
+ };
+
// Thread entry point: unpacks UserData, invokes the callback, returns 0.
+ static DWORD WINAPI WinThreadFunction(LPVOID lpParam)
+ {
+ UserData* userData = static_cast<UserData*>(lpParam);
+
+ userData->func(userData->param);
+
+ return 0;
+ }
+
// userData_ is a member and its address is handed to the new thread, so this
// object has to stay alive (and in place) while the thread may still read it.
+ UserData userData_;
+ HANDLE thread_;
+ DWORD threadId_;
+
+ public:
// Stores the callback and starts the thread immediately.
// NOTE(review): the result of CreateThread is not checked for failure.
+ Thread(void (*func)(void* userData), void* userData)
+ {
+ userData_.func = func;
+ userData_.param = userData;
+
+ thread_ = CreateThread(
+ NULL, // default security attributes
+ 0, // use default stack size
+ WinThreadFunction, // thread function name
+ &userData_, // argument to thread function
+ 0, // use default creation flags
+ &threadId_); // returns the thread identifier
+ }
+
+ ~Thread()
+ {
+ CloseHandle(thread_);
+ }
+
// Blocks, without timeout, until the thread handle is signaled.
+ void wait()
+ {
+ WaitForSingleObject(thread_, INFINITE);
+ }
+ };
+ #else
// POSIX variant of Thread: runs func(userData) on a thread started with
// pthread_create in the constructor. wait() joins the thread; the destructor
// detaches it.
// NOTE(review): the visible caller (StereoMultiGpuThread::compute) calls wait()
// before the object is destroyed, so ~Thread() ends up calling pthread_detach
// on a thread that has already been joined. POSIX appears to leave that case
// undefined - consider remembering that wait() ran and skipping the detach.
// TODO confirm against the pthread_detach specification.
+ class Thread
+ {
// Callback and its argument, bundled so they fit through the single
// void* parameter of the thread entry point.
+ struct UserData
+ {
+ void (*func)(void* userData);
+ void* param;
+ };
+
// Thread entry point: unpacks UserData, invokes the callback, returns 0.
+ static void* PThreadFunction(void* lpParam)
+ {
+ UserData* userData = static_cast<UserData*>(lpParam);
+
+ userData->func(userData->param);
+
+ return 0;
+ }
+
// userData_ is a member and its address is handed to the new thread, so this
// object has to stay alive (and in place) while the thread may still read it.
+ pthread_t thread_;
+ UserData userData_;
+
+ public:
// Stores the callback and starts the thread immediately.
// NOTE(review): the return code of pthread_create is not checked.
+ Thread(void (*func)(void* userData), void* userData)
+ {
+ userData_.func = func;
+ userData_.param = userData;
+
+ pthread_create(&thread_, NULL, PThreadFunction, &userData_);
+ }
- #if !defined(HAVE_TBB)
- std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
+ ~Thread()
+ {
+ pthread_detach(thread_);
+ }
+
// Blocks until the thread has finished; the thread's return value is discarded.
+ void wait()
+ {
+ pthread_join(thread_, NULL);
+ }
+ };
#endif
- return 0;
+ ///////////////////////////////////////////////////////////
+ // StereoSingleGpu
+ // Run Stereo algorithm on single GPU
+
// Runs the StereoBM algorithm on a single CUDA device chosen at construction.
// Owns the device-side buffers for both input frames and the disparity output,
// plus the algorithm instance created for that device.
+ class StereoSingleGpu
+ {
+ public:
+ explicit StereoSingleGpu(int deviceId = 0);
+ ~StereoSingleGpu();
+
// Uploads both host frames, computes the disparity on the device and
// downloads the result into `disparity`.
+ void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity);
+
+ private:
// Device index made current before every operation of this object.
+ int deviceId_;
+ GpuMat d_leftFrame;
+ GpuMat d_rightFrame;
+ GpuMat d_disparity;
- Ptr<gpu::StereoBM> d_alg;
++ Ptr<cuda::StereoBM> d_alg;
+ };
+
// Remembers the device index, makes that device current and creates the
// StereoBM instance while it is current.
// NOTE(review): 256 is the first argument of createStereoBM - presumably the
// number of disparities; confirm against the cudastereo API.
+ StereoSingleGpu::StereoSingleGpu(int deviceId) : deviceId_(deviceId)
+ {
- gpu::setDevice(deviceId_);
- d_alg = gpu::createStereoBM(256);
++ cuda::setDevice(deviceId_);
++ d_alg = cuda::createStereoBM(256);
}
- #else
// Makes this object's device current and then releases the device buffers and
// the algorithm explicitly.
// Presumably this is done so the resources are freed while their own device is
// current, instead of in the implicit member destructors - TODO confirm.
+ StereoSingleGpu::~StereoSingleGpu()
+ {
- gpu::setDevice(deviceId_);
++ cuda::setDevice(deviceId_);
+ d_leftFrame.release();
+ d_rightFrame.release();
+ d_disparity.release();
+ d_alg.release();
+ }
- using namespace std;
- using namespace cv;
- using namespace cv::cuda;
// Computes the disparity of one stereo pair on this object's device:
// select device, upload both frames, run the algorithm, download the result.
// All four calls use the overloads without a stream argument.
+ void StereoSingleGpu::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity)
+ {
- gpu::setDevice(deviceId_);
++ cuda::setDevice(deviceId_);
+ d_leftFrame.upload(leftFrame);
+ d_rightFrame.upload(rightFrame);
+ d_alg->compute(d_leftFrame, d_rightFrame, d_disparity);
+ d_disparity.download(disparity);
+ }
+
+ ///////////////////////////////////////////////////////////
+ // StereoMultiGpuThread
+ // Run Stereo algorithm on two GPUs using different host threads
+
// Runs the StereoBM algorithm on two CUDA devices (indices 0 and 1), one host
// thread per device. Each array below has one slot per device.
+ class StereoMultiGpuThread
+ {
+ public:
+ StereoMultiGpuThread();
+ ~StereoMultiGpuThread();
+
// Splits the frames into an upper and a lower part, processes each part on
// its own device in its own thread, and assembles the result in `disparity`.
+ void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity);
+
+ private:
+ GpuMat d_leftFrames[2];
+ GpuMat d_rightFrames[2];
+ GpuMat d_disparities[2];
- Ptr<gpu::StereoBM> d_algs[2];
++ Ptr<cuda::StereoBM> d_algs[2];
+
// Everything one worker thread needs: the device index, the host-side input
// slices and output slice, and pointers to that device's buffers/algorithm.
+ struct StereoLaunchData
+ {
+ int deviceId;
+ Mat leftFrame;
+ Mat rightFrame;
+ Mat disparity;
+ GpuMat* d_leftFrame;
+ GpuMat* d_rightFrame;
+ GpuMat* d_disparity;
- Ptr<gpu::StereoBM> d_alg;
++ Ptr<cuda::StereoBM> d_alg;
+ };
+
// Thread entry point; userData points to a StereoLaunchData.
+ static void launchGpuStereoAlg(void* userData);
+ };
- struct Worker { void operator()(int device_id) const; };
// Creates one StereoBM instance per device: slot 0 while device 0 is current,
// slot 1 while device 1 is current. Device indices 0 and 1 are hard-coded.
+ StereoMultiGpuThread::StereoMultiGpuThread()
+ {
- gpu::setDevice(0);
- d_algs[0] = gpu::createStereoBM(256);
++ cuda::setDevice(0);
++ d_algs[0] = cuda::createStereoBM(256);
- // GPUs data
- GpuMat d_left[2];
- GpuMat d_right[2];
- Ptr<cuda::StereoBM> bm[2];
- GpuMat d_result[2];
- gpu::setDevice(1);
- d_algs[1] = gpu::createStereoBM(256);
++ cuda::setDevice(1);
++ d_algs[1] = cuda::createStereoBM(256);
+ }
- static void printHelp()
// Releases the per-device buffers and algorithms explicitly, each group after
// making its own device (0, then 1) current.
+ StereoMultiGpuThread::~StereoMultiGpuThread()
{
- std::cout << "Usage: stereo_multi_gpu --left <image> --right <image>\n";
- gpu::setDevice(0);
++ cuda::setDevice(0);
+ d_leftFrames[0].release();
+ d_rightFrames[0].release();
+ d_disparities[0].release();
+ d_algs[0].release();
+
- gpu::setDevice(1);
++ cuda::setDevice(1);
+ d_leftFrames[1].release();
+ d_rightFrames[1].release();
+ d_disparities[1].release();
+ d_algs[1].release();
}
- gpu::setDevice(data->deviceId);
// Computes the disparity of one stereo pair using both devices in parallel.
// `disparity` is (re)allocated as CV_8UC1 with the size of leftFrame; its upper
// half is filled by device 0 and its lower half by device 1.
// NOTE(review): the row ranges reach 32 rows past the midline in both
// directions, so they assume at least 64 rows; nothing here checks that.
// NOTE(review): the right-frame slice for part 1 is indexed with
// leftFrame.rows; main() rejects frame pairs of different sizes before calling.
+ void StereoMultiGpuThread::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity)
+ {
+ disparity.create(leftFrame.size(), CV_8UC1);
+
+ // Split input data into two parts, one for each GPU.
+ // We add small border for each part,
+ // because original algorithm doesn't calculate disparity on image borders.
+ // With such padding we will get output in the middle of final result.
+
+ StereoLaunchData launchDatas[2];
+
// Part 0: input rows [0, rows/2 + 32), output rows [0, rows/2).
+ launchDatas[0].deviceId = 0;
+ launchDatas[0].leftFrame = leftFrame.rowRange(0, leftFrame.rows / 2 + 32);
+ launchDatas[0].rightFrame = rightFrame.rowRange(0, rightFrame.rows / 2 + 32);
+ launchDatas[0].disparity = disparity.rowRange(0, leftFrame.rows / 2);
+ launchDatas[0].d_leftFrame = &d_leftFrames[0];
+ launchDatas[0].d_rightFrame = &d_rightFrames[0];
+ launchDatas[0].d_disparity = &d_disparities[0];
+ launchDatas[0].d_alg = d_algs[0];
+
// Part 1: input rows [rows/2 - 32, rows), output rows [rows/2, rows).
+ launchDatas[1].deviceId = 1;
+ launchDatas[1].leftFrame = leftFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows);
+ launchDatas[1].rightFrame = rightFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows);
+ launchDatas[1].disparity = disparity.rowRange(leftFrame.rows / 2, leftFrame.rows);
+ launchDatas[1].d_leftFrame = &d_leftFrames[1];
+ launchDatas[1].d_rightFrame = &d_rightFrames[1];
+ launchDatas[1].d_disparity = &d_disparities[1];
+ launchDatas[1].d_alg = d_algs[1];
+
// Both threads start in their constructors; launchDatas lives on this stack
// frame, so both are waited for before returning.
+ Thread thread0(launchGpuStereoAlg, &launchDatas[0]);
+ Thread thread1(launchGpuStereoAlg, &launchDatas[1]);
+
+ thread0.wait();
+ thread1.wait();
+ }
+
// Worker-thread body. userData must point to a StereoLaunchData. Makes the
// requested device current, uploads the two input slices, runs the algorithm
// and downloads the part of the device result without the 32 overlap rows.
+ void StereoMultiGpuThread::launchGpuStereoAlg(void* userData)
+ {
+ StereoLaunchData* data = static_cast<StereoLaunchData*>(userData);
+
- Ptr<gpu::StereoBM> d_algs[2];
++ cuda::setDevice(data->deviceId);
+ data->d_leftFrame->upload(data->leftFrame);
+ data->d_rightFrame->upload(data->rightFrame);
+ data->d_alg->compute(*data->d_leftFrame, *data->d_rightFrame, *data->d_disparity);
+
// Device 0 holds the upper part, so its overlap is the last 32 rows;
// any other device id is treated as the lower part and drops the first 32.
+ if (data->deviceId == 0)
+ data->d_disparity->rowRange(0, data->d_disparity->rows - 32).download(data->disparity);
+ else
+ data->d_disparity->rowRange(32, data->d_disparity->rows).download(data->disparity);
+ }
+
+ ///////////////////////////////////////////////////////////
+ // StereoMultiGpuStream
+ // Run Stereo algorithm on two GPUs from single host thread using async API
+
// Runs the StereoBM algorithm on two CUDA devices (indices 0 and 1) from a
// single host thread, using one Stream per device. Each array below has one
// slot per device.
+ class StereoMultiGpuStream
+ {
+ public:
+ StereoMultiGpuStream();
+ ~StereoMultiGpuStream();
+
// Inputs and output are CudaMem rather than Mat.
// Presumably this is page-locked host memory needed for the stream-based
// transfers - TODO confirm against the CudaMem documentation.
+ void compute(const CudaMem& leftFrame, const CudaMem& rightFrame, CudaMem& disparity);
+
+ private:
+ GpuMat d_leftFrames[2];
+ GpuMat d_rightFrames[2];
+ GpuMat d_disparities[2];
- gpu::setDevice(0);
- d_algs[0] = gpu::createStereoBM(256);
++ Ptr<cuda::StereoBM> d_algs[2];
+ Ptr<Stream> streams[2];
+ };
+
// For each of devices 0 and 1: makes the device current, then creates its
// StereoBM instance and its Stream while that device is current.
+ StereoMultiGpuStream::StereoMultiGpuStream()
+ {
- gpu::setDevice(1);
- d_algs[1] = gpu::createStereoBM(256);
++ cuda::setDevice(0);
++ d_algs[0] = cuda::createStereoBM(256);
+ streams[0] = new Stream;
+
- gpu::setDevice(0);
++ cuda::setDevice(1);
++ d_algs[1] = cuda::createStereoBM(256);
+ streams[1] = new Stream;
+ }
+
// Releases the per-device buffers, algorithms and streams explicitly, each
// group after making its own device (0, then 1) current.
+ StereoMultiGpuStream::~StereoMultiGpuStream()
+ {
- gpu::setDevice(1);
++ cuda::setDevice(0);
+ d_leftFrames[0].release();
+ d_rightFrames[0].release();
+ d_disparities[0].release();
+ d_algs[0].release();
+ streams[0].release();
+
- gpu::setDevice(0);
++ cuda::setDevice(1);
+ d_leftFrames[1].release();
+ d_rightFrames[1].release();
+ d_disparities[1].release();
+ d_algs[1].release();
+ streams[1].release();
+ }
+
// Computes the disparity of one stereo pair on both devices from this thread.
// Work for device 0 is issued on streams[0], then work for device 1 on
// streams[1], and only afterwards are both streams waited for.
// `disparity` is (re)allocated as CV_8UC1 with the size of leftFrame; its upper
// half comes from device 0 and its lower half from device 1.
// NOTE(review): as in the threaded variant, the row ranges assume at least
// 64 rows and the right frame is sliced using leftFrame.rows.
+ void StereoMultiGpuStream::compute(const CudaMem& leftFrame, const CudaMem& rightFrame, CudaMem& disparity)
+ {
+ disparity.create(leftFrame.size(), CV_8UC1);
+
+ // Split input data into two parts, one for each GPU.
+ // We add small border for each part,
+ // because original algorithm doesn't calculate disparity on image borders.
+ // With such padding we will get output in the middle of final result.
+
// Mat headers over the CudaMem buffers, used for the row slicing below.
+ Mat leftFrameHdr = leftFrame.createMatHeader();
+ Mat rightFrameHdr = rightFrame.createMatHeader();
+ Mat disparityHdr = disparity.createMatHeader();
+ Mat disparityPart0 = disparityHdr.rowRange(0, leftFrame.rows / 2);
+ Mat disparityPart1 = disparityHdr.rowRange(leftFrame.rows / 2, leftFrame.rows);
+
// Device 0: input rows [0, rows/2 + 32); the first rows/2 result rows are kept.
- gpu::setDevice(1);
++ cuda::setDevice(0);
+ d_leftFrames[0].upload(leftFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]);
+ d_rightFrames[0].upload(rightFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]);
+ d_algs[0]->compute(d_leftFrames[0], d_rightFrames[0], d_disparities[0], *streams[0]);
+ d_disparities[0].rowRange(0, leftFrame.rows / 2).download(disparityPart0, *streams[0]);
+
// Device 1: input rows [rows/2 - 32, rows); the first 32 result rows are dropped.
- gpu::setDevice(0);
++ cuda::setDevice(1);
+ d_leftFrames[1].upload(leftFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]);
+ d_rightFrames[1].upload(rightFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]);
+ d_algs[1]->compute(d_leftFrames[1], d_rightFrames[1], d_disparities[1], *streams[1]);
+ d_disparities[1].rowRange(32, d_disparities[1].rows).download(disparityPart1, *streams[1]);
+
// Wait for each stream with its own device made current first.
- gpu::setDevice(1);
++ cuda::setDevice(0);
+ streams[0]->waitForCompletion();
+
++ cuda::setDevice(1);
+ streams[1]->waitForCompletion();
+ }
+
+ ///////////////////////////////////////////////////////////
+ // main
+
// Entry point of the sample. Expects exactly two arguments: the left and the
// right video. For every frame pair it converts both frames to grayscale, runs
// the four stereo variants (GPU 0 only, GPU 1 only, two threads, two streams),
// prints one table row with the measured times and shows the four disparity
// images. Returns -1 on bad usage, an unsupported device setup, an unreadable
// video or mismatching frame sizes; returns 0 when a video ends or ESC (key
// code 27) is pressed.
int main(int argc, char** argv)
{
- if (argc < 5)
+ if (argc != 3)
{
- printHelp();
+ cerr << "Usage: stereo_multi_gpu <left_video> <right_video>" << endl;
return -1;
}
// NOTE(review): the check below rejects machines with more than two devices,
// although the classes above only ever use device indices 0 and 1.
- int num_devices = getCudaEnabledDeviceCount();
- if (num_devices < 2)
+ const int numDevices = getCudaEnabledDeviceCount();
+ if (numDevices != 2)
{
- std::cout << "Two or more GPUs are required\n";
+ cerr << "Two GPUs are required" << endl;
return -1;
}
- for (int i = 0; i < num_devices; ++i)
- {
- cv::cuda::printShortCudaDeviceInfo(i);
- DeviceInfo dev_info(i);
- if (!dev_info.isCompatible())
+ for (int i = 0; i < numDevices; ++i)
+ {
+ DeviceInfo devInfo(i);
+ if (!devInfo.isCompatible())
{
- std::cout << "CUDA module isn't built for GPU #" << i << " ("
- << dev_info.name() << ", CC " << dev_info.majorVersion()
- << dev_info.minorVersion() << "\n";
- cerr << "GPU module was't built for GPU #" << i << " ("
// NOTE(review): the message below contains the typo "was't" and never prints
// the closing parenthesis; left untouched here because it is runtime output.
++ cerr << "CUDA module was't built for GPU #" << i << " ("
+ << devInfo.name() << ", CC " << devInfo.majorVersion()
+ << devInfo.minorVersion() << endl;
return -1;
}
+
+ printShortCudaDeviceInfo(i);
}
- // Load input data
- Mat left, right;
- for (int i = 1; i < argc; ++i)
+ VideoCapture leftVideo(argv[1]);
+ VideoCapture rightVideo(argv[2]);
+
+ if (!leftVideo.isOpened())
{
- if (string(argv[i]) == "--left")
- {
- left = imread(argv[++i], cv::IMREAD_GRAYSCALE);
- CV_Assert(!left.empty());
- }
- else if (string(argv[i]) == "--right")
- {
- right = imread(argv[++i], cv::IMREAD_GRAYSCALE);
- CV_Assert(!right.empty());
- }
- else if (string(argv[i]) == "--help")
+ cerr << "Can't open " << argv[1] << " video file" << endl;
+ return -1;
+ }
+
+ if (!rightVideo.isOpened())
+ {
+ cerr << "Can't open " << argv[2] << " video file" << endl;
+ return -1;
+ }
+
+ cout << endl;
+ cout << "This sample demonstrates working on one piece of data using two GPUs." << endl;
+ cout << "It splits input into two parts and processes them separately on different GPUs." << endl;
+ cout << endl;
+
// Grayscale frames are kept in CudaMem so the same buffers can feed both the
// Mat-based variants (through createMatHeader) and the stream variant.
+ Mat leftFrame, rightFrame;
+ CudaMem leftGrayFrame, rightGrayFrame;
+
+ StereoSingleGpu gpu0Alg(0);
+ StereoSingleGpu gpu1Alg(1);
+ StereoMultiGpuThread multiThreadAlg;
+ StereoMultiGpuStream multiStreamAlg;
+
+ Mat disparityGpu0;
+ Mat disparityGpu1;
+ Mat disparityMultiThread;
+ CudaMem disparityMultiStream;
+
+ Mat disparityGpu0Show;
+ Mat disparityGpu1Show;
+ Mat disparityMultiThreadShow;
+ Mat disparityMultiStreamShow;
+
+ TickMeter tm;
+
+ cout << "-------------------------------------------------------------------" << endl;
+ cout << "| Frame | GPU 0 ms | GPU 1 ms | Multi Thread ms | Multi Stream ms |" << endl;
+ cout << "-------------------------------------------------------------------" << endl;
+
// One iteration per frame pair; i is only used as the frame number in the table.
+ for (int i = 0;; ++i)
+ {
+ leftVideo >> leftFrame;
+ rightVideo >> rightFrame;
+
+ if (leftFrame.empty() || rightFrame.empty())
+ break;
+
+ if (leftFrame.size() != rightFrame.size())
{
- printHelp();
+ cerr << "Frames have different sizes" << endl;
return -1;
}
- }
- // Split source images for processing on the GPU #0
- setDevice(0);
- d_left[0].upload(left.rowRange(0, left.rows / 2));
- d_right[0].upload(right.rowRange(0, right.rows / 2));
- bm[0] = cuda::createStereoBM();
-
- // Split source images for processing on the GPU #1
- setDevice(1);
- d_left[1].upload(left.rowRange(left.rows / 2, left.rows));
- d_right[1].upload(right.rowRange(right.rows / 2, right.rows));
- bm[1] = cuda::createStereoBM();
-
- // Execute calculation in two threads using two GPUs
- int devices[] = {0, 1};
- tbb::parallel_do(devices, devices + 2, Worker());
-
- // Release the first GPU resources
- setDevice(0);
- imshow("GPU #0 result", Mat(d_result[0]));
- d_left[0].release();
- d_right[0].release();
- d_result[0].release();
- bm[0].release();
-
- // Release the second GPU resources
- setDevice(1);
- imshow("GPU #1 result", Mat(d_result[1]));
- d_left[1].release();
- d_right[1].release();
- d_result[1].release();
- bm[1].release();
-
- waitKey();
- return 0;
- }
// Both gray buffers are sized from leftFrame; the sizes were checked equal above.
+ leftGrayFrame.create(leftFrame.size(), CV_8UC1);
+ rightGrayFrame.create(leftFrame.size(), CV_8UC1);
+ cvtColor(leftFrame, leftGrayFrame.createMatHeader(), COLOR_BGR2GRAY);
+ cvtColor(rightFrame, rightGrayFrame.createMatHeader(), COLOR_BGR2GRAY);
- void Worker::operator()(int device_id) const
- {
- setDevice(device_id);
// Each variant is timed separately: the meter is reset before every run.
+ tm.reset(); tm.start();
+ gpu0Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
+ disparityGpu0);
+ tm.stop();
- bm[device_id]->compute(d_left[device_id], d_right[device_id], d_result[device_id]);
+ const double gpu0Time = tm.getTimeMilli();
- std::cout << "GPU #" << device_id << " (" << DeviceInfo().name()
- << "): finished\n";
- }
+ tm.reset(); tm.start();
+ gpu1Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
+ disparityGpu1);
+ tm.stop();
- #endif
+ const double gpu1Time = tm.getTimeMilli();
+
+ tm.reset(); tm.start();
+ multiThreadAlg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
+ disparityMultiThread);
+ tm.stop();
+
+ const double multiThreadTime = tm.getTimeMilli();
+
+ tm.reset(); tm.start();
+ multiStreamAlg.compute(leftGrayFrame, rightGrayFrame, disparityMultiStream);
+ tm.stop();
+
+ const double multiStreamTime = tm.getTimeMilli();
+
// Column widths match the table header printed before the loop.
+ cout << "| " << setw(5) << i << " | "
+ << setw(8) << setprecision(1) << fixed << gpu0Time << " | "
+ << setw(8) << setprecision(1) << fixed << gpu1Time << " | "
+ << setw(15) << setprecision(1) << fixed << multiThreadTime << " | "
+ << setw(15) << setprecision(1) << fixed << multiStreamTime << " |" << endl;
+
// All results are scaled to a fixed 1024x768 for display only.
+ resize(disparityGpu0, disparityGpu0Show, Size(1024, 768), 0, 0, INTER_AREA);
+ resize(disparityGpu1, disparityGpu1Show, Size(1024, 768), 0, 0, INTER_AREA);
+ resize(disparityMultiThread, disparityMultiThreadShow, Size(1024, 768), 0, 0, INTER_AREA);
+ resize(disparityMultiStream.createMatHeader(), disparityMultiStreamShow, Size(1024, 768), 0, 0, INTER_AREA);
+
+ imshow("disparityGpu0", disparityGpu0Show);
+ imshow("disparityGpu1", disparityGpu1Show);
+ imshow("disparityMultiThread", disparityMultiThreadShow);
+ imshow("disparityMultiStream", disparityMultiStreamShow);
+
// Stop when key code 27 (ESC) is read within the 30 ms wait.
+ const int key = waitKey(30) & 0xff;
+ if (key == 27)
+ break;
+ }
+
+ cout << "-------------------------------------------------------------------" << endl;
+
+ return 0;
+ }