#include "opencv2/features2d.hpp"
namespace cv { namespace gpu {
-
-//////////////////////////////// CudaMem ////////////////////////////////
-// CudaMem is limited cv::Mat with page locked memory allocation.
-// Page locked memory is only needed for async and faster coping to GPU.
-// It is convertable to cv::Mat header without reference counting
-// so you can use it with other opencv functions.
-
-// Page-locks the matrix m memory and maps it for the device(s)
-CV_EXPORTS void registerPageLocked(Mat& m);
-// Unmaps the memory of matrix m, and makes it pageable again.
-CV_EXPORTS void unregisterPageLocked(Mat& m);
-
-class CV_EXPORTS CudaMem
-{
-public:
- enum { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2, ALLOC_WRITE_COMBINED = 4 };
-
- CudaMem();
- CudaMem(const CudaMem& m);
-
- CudaMem(int rows, int cols, int type, int _alloc_type = ALLOC_PAGE_LOCKED);
- CudaMem(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
-
-
- //! creates from cv::Mat with coping data
- explicit CudaMem(const Mat& m, int alloc_type = ALLOC_PAGE_LOCKED);
-
- ~CudaMem();
-
- CudaMem& operator = (const CudaMem& m);
-
- //! returns deep copy of the matrix, i.e. the data is copied
- CudaMem clone() const;
-
- //! allocates new matrix data unless the matrix already has specified size and type.
- void create(int rows, int cols, int type, int alloc_type = ALLOC_PAGE_LOCKED);
- void create(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
-
- //! decrements reference counter and released memory if needed.
- void release();
-
- //! returns matrix header with disabled reference counting for CudaMem data.
- Mat createMatHeader() const;
- operator Mat() const;
-
- //! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware.
- GpuMat createGpuMatHeader() const;
- operator GpuMat() const;
-
- //returns if host memory can be mapperd to gpu address space;
- static bool canMapHostMemory();
-
- // Please see cv::Mat for descriptions
- bool isContinuous() const;
- size_t elemSize() const;
- size_t elemSize1() const;
- int type() const;
- int depth() const;
- int channels() const;
- size_t step1() const;
- Size size() const;
- bool empty() const;
-
-
- // Please see cv::Mat for descriptions
- int flags;
- int rows, cols;
- size_t step;
-
- uchar* data;
- int* refcount;
-
- uchar* datastart;
- uchar* dataend;
-
- int alloc_type;
-};
-
-//////////////////////////////// CudaStream ////////////////////////////////
-// Encapculates Cuda Stream. Provides interface for async coping.
-// Passed to each function that supports async kernel execution.
-// Reference counting is enabled
-
-class CV_EXPORTS Stream
-{
-public:
- Stream();
- ~Stream();
-
- Stream(const Stream&);
- Stream& operator =(const Stream&);
-
- bool queryIfComplete();
- void waitForCompletion();
-
- //! downloads asynchronously
- // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat)
- void enqueueDownload(const GpuMat& src, CudaMem& dst);
- void enqueueDownload(const GpuMat& src, Mat& dst);
-
- //! uploads asynchronously
- // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI)
- void enqueueUpload(const CudaMem& src, GpuMat& dst);
- void enqueueUpload(const Mat& src, GpuMat& dst);
-
- //! copy asynchronously
- void enqueueCopy(const GpuMat& src, GpuMat& dst);
-
- //! memory set asynchronously
- void enqueueMemSet(GpuMat& src, Scalar val);
- void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);
-
- //! converts matrix type, ex from float to uchar depending on type
- void enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double a = 1, double b = 0);
-
- //! adds a callback to be called on the host after all currently enqueued items in the stream have completed
- typedef void (*StreamCallback)(Stream& stream, int status, void* userData);
- void enqueueHostCallback(StreamCallback callback, void* userData);
-
- static Stream& Null();
-
- operator bool() const;
-
-private:
- struct Impl;
-
- explicit Stream(Impl* impl);
- void create();
- void release();
-
- Impl *impl;
-
- friend struct StreamAccessor;
-};
-
-
//////////////////////////////// Filter Engine ////////////////////////////////
/*!
friend class CascadeClassifier_GPU_LBP;
};
-// ======================== GPU version for soft cascade ===================== //
-
-class CV_EXPORTS ChannelsProcessor
-{
-public:
- enum
- {
- GENERIC = 1 << 4,
- SEPARABLE = 2 << 4
- };
-
- // Appends specified number of HOG first-order features integrals into given vector.
- // Param frame is an input 3-channel bgr image.
- // Param channels is a GPU matrix of optionally shrinked channels
- // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution.
- virtual void apply(InputArray frame, OutputArray channels, Stream& stream = Stream::Null()) = 0;
-
- // Creates a specific preprocessor implementation.
- // Param shrinkage is a resizing factor. Resize is applied before the computing integral sum
- // Param bins is a number of HOG-like channels.
- // Param flags is a channel computing extra flags.
- static cv::Ptr<ChannelsProcessor> create(const int shrinkage, const int bins, const int flags = GENERIC);
-
- virtual ~ChannelsProcessor();
-
-protected:
- ChannelsProcessor();
-};
-
-// Implementation of soft (stage-less) cascaded detector.
-class CV_EXPORTS SCascade : public cv::Algorithm
-{
-public:
-
- // Representation of detectors result.
- struct CV_EXPORTS Detection
- {
- ushort x;
- ushort y;
- ushort w;
- ushort h;
- float confidence;
- int kind;
-
- enum {PEDESTRIAN = 0};
- };
-
- enum { NO_REJECT = 1, DOLLAR = 2, /*PASCAL = 4,*/ DEFAULT = NO_REJECT, NMS_MASK = 0xF};
-
- // An empty cascade will be created.
- // Param minScale is a minimum scale relative to the original size of the image on which cascade will be applied.
- // Param minScale is a maximum scale relative to the original size of the image on which cascade will be applied.
- // Param scales is a number of scales from minScale to maxScale.
- // Param flags is an extra tuning flags.
- SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55,
- const int flags = NO_REJECT || ChannelsProcessor::GENERIC);
-
- virtual ~SCascade();
-
- cv::AlgorithmInfo* info() const;
-
- // Load cascade from FileNode.
- // Param fn is a root node for cascade. Should be <cascade>.
- virtual bool load(const FileNode& fn);
-
- // Load cascade config.
- virtual void read(const FileNode& fn);
-
- // Return the matrix of of detected objects.
- // Param image is a frame on which detector will be applied.
- // Param rois is a regions of interests mask generated by genRoi.
- // Only the objects that fall into one of the regions will be returned.
- // Param objects is an output array of Detections represented as GpuMat of detections (SCascade::Detection)
- // The first element of the matrix is actually a count of detections.
- // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution
- virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
-
-private:
-
- struct Fields;
- Fields* fields;
-
- double minScale;
- double maxScale;
- int scales;
-
- int flags;
-};
-
-CV_EXPORTS bool initModule_gpu(void);
-
////////////////////////////////// SURF //////////////////////////////////////////
class CV_EXPORTS SURF_GPU
#include "opencv2/core/cuda_devptrs.hpp"
+namespace cv { namespace softcascade { namespace internal {
+void error(const char *error_string, const char *file, const int line, const char *func);
+}}}
#if defined(__GNUC__)
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
- // if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func);
+ if (cudaSuccess != err) cv::softcascade::internal::error(cudaGetErrorString(err), file, line, func);
}
__host__ __device__ __forceinline__ int divUp(int total, int grain)
B2Y = 1868
};
- template <int bidx> static __device__ __forceinline__ unsigned char RGB2GrayConvert(uint src)
+ template <int bidx> static __device__ __forceinline__ unsigned char RGB2GrayConvert(unsigned char b, unsigned char g, unsigned char r)
{
- uint b = 0xffu & (src >> (bidx * 8));
- uint g = 0xffu & (src >> 8);
- uint r = 0xffu & (src >> ((bidx ^ 2) * 8));
+ // uint b = 0xffu & (src >> (bidx * 8));
+ // uint g = 0xffu & (src >> 8);
+ // uint r = 0xffu & (src >> ((bidx ^ 2) * 8));
return CV_DESCALE((uint)(b * B2Y + g * G2Y + r * R2Y), yuv_shift);
}
- void transform(const cv::gpu::PtrStepSz<uchar3>& bgr, cv::gpu::PtrStepSzb gray)
+ __global__ void device_transform(const cv::gpu::PtrStepSz<uchar3> bgr, cv::gpu::PtrStepSzb gray)
{
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+ const uchar3 colored = (uchar3)(bgr.ptr(y))[x];
+
+ gray.ptr(y)[x] = RGB2GrayConvert<0>(colored.x, colored.y, colored.z);
+ }
+ ///////
+ void transform(const cv::gpu::PtrStepSz<uchar3>& bgr, cv::gpu::PtrStepSzb gray)
+ {
+ const dim3 block(32, 8);
+ const dim3 grid(divUp(bgr.cols, block.x), divUp(bgr.rows, block.y));
+ device_transform<<<grid, block>>>(bgr, gray);
+ cudaSafeCall(cudaDeviceSynchronize());
}
}}}
\ No newline at end of file
flds.mask.create( rois.cols / shr, rois.rows / shr, rois.type());
- device::shrink(rois, flds.genRoiTmp);
+ device::shrink(rois, flds.mask);
//cv::gpu::transpose(flds.genRoiTmp, flds.mask, s);
if (type == CV_8UC3)
virtual void apply(InputArray _frame, OutputArray _shrunk, cv::gpu::Stream& s = cv::gpu::Stream::Null())
{
- const cv::gpu::GpuMat frame = _frame.getGpuMat();
+ bgr = _frame.getGpuMat();
//cv::gpu::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0);
- _shrunk.create(frame.rows * (4 + bins) / shrinkage, frame.cols / shrinkage, CV_8UC1);
+ _shrunk.create(bgr.rows * (4 + bins) / shrinkage, bgr.cols / shrinkage, CV_8UC1);
cv::gpu::GpuMat shrunk = _shrunk.getGpuMat();
- channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1);
+ channels.create(bgr.rows * (4 + bins), bgr.cols, CV_8UC1);
setZero(channels, s);
+ gray.create(bgr.size(), CV_8UC1);
cv::softcascade::device::transform(bgr, gray); //cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY);
cv::softcascade::device::gray2hog(gray, channels(cv::Rect(0, 0, bgr.cols, bgr.rows * (bins + 1))), bins);