Added a GPU Haar cascade classifier on top of NPP_staging (VS2008 only for now) and a face detection sample for it.
Updated the NPP_staging interface (VS2008 only).
enum NppStStatus\r
{\r
//already present in NPP\r
- /* NPP_SUCCESS = 0, ///< Successful operation (same as NPP_NO_ERROR)\r
- NPP_ERROR = -1, ///< Unknown error\r
- NPP_CUDA_KERNEL_EXECUTION_ERROR = -3, ///< CUDA kernel execution error\r
- NPP_NULL_POINTER_ERROR = -4, ///< NULL pointer argument error\r
- NPP_TEXTURE_BIND_ERROR = -24, ///< CUDA texture binding error or non-zero offset returned\r
- NPP_MEMCPY_ERROR = -13, ///< CUDA memory copy error\r
- NPP_MEM_ALLOC_ERR = -12, ///< CUDA memory allocation error\r
- NPP_MEMFREE_ERR = -15, ///< CUDA memory deallocation error*/\r
+ //NPP_SUCCESS = 0, ///< Successful operation (same as NPP_NO_ERROR)\r
+ //NPP_ERROR = -1, ///< Unknown error\r
+ //NPP_CUDA_KERNEL_EXECUTION_ERROR = -3, ///< CUDA kernel execution error\r
+ //NPP_NULL_POINTER_ERROR = -4, ///< NULL pointer argument error\r
+ //NPP_TEXTURE_BIND_ERROR = -24, ///< CUDA texture binding error or non-zero offset returned\r
+ //NPP_MEMCPY_ERROR = -13, ///< CUDA memory copy error\r
+ //NPP_MEM_ALLOC_ERR = -12, ///< CUDA memory allocation error\r
+ //NPP_MEMFREE_ERR = -15, ///< CUDA memory deallocation error\r
\r
//to be added\r
NPP_INVALID_ROI, ///< Invalid region of interest argument\r
\r
/** \defgroup core_npp NPP Core\r
* Basic functions for CUDA streams management.\r
- * WARNING: These functions couldn't be exported from NPP_staging library, so they can't be used\r
+ * WARNING: These functions cannot be exported from a DLL, so they are available only when linking the static version of NPP_staging\r
* @{\r
*/\r
\r
\r
\r
/**\r
+ * Calculates the size of the temporary buffer for integral image creation\r
+ * \see nppiStIntegralGetSize_8u32u\r
+ */\r
+NppStStatus nppiStIntegralGetSize_32f32f(NppStSize32u roiSize, NppSt32u *pBufsize);\r
+\r
+\r
+/**\r
* Creates an integral image representation for the input image\r
*\r
* \param d_src [IN] Source image pointer (CUDA device memory)\r
\r
\r
/**\r
+ * Creates an integral image representation for the input image\r
+ * \see nppiStIntegral_8u32u_C1R\r
+ */\r
+NppStStatus nppiStIntegral_32f32f_C1R(NppSt32f *d_src, NppSt32u srcStep,\r
+ NppSt32f *d_dst, NppSt32u dstStep, NppStSize32u roiSize,\r
+ NppSt8u *pBuffer, NppSt32u bufSize);\r
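
These two additions follow NPP's usual query-then-run pattern: ask the library for the scratch-buffer size first, then hand a device buffer of that size to the actual call. A minimal sketch, assuming NppStSize32u exposes width/height members, the steps are in bytes, and d_src/d_dst are preallocated device pointers (status checks omitted):

    NppStSize32u roi;
    roi.width  = width;
    roi.height = height;

    NppSt32u bufSize = 0;
    nppiStIntegralGetSize_32f32f(roi, &bufSize);   // 1) query scratch size for this ROI

    NppSt8u *pBuffer = 0;
    cudaMalloc((void**)&pBuffer, bufSize);         // 2) allocate device scratch

    nppiStIntegral_32f32f_C1R(d_src, srcStep,      // 3) build the 32f integral image
                              d_dst, dstStep, roi, pBuffer, bufSize);
    cudaFree(pBuffer);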
+\r
+\r
+/**\r
* Creates an integral image representation for the input image. Host implementation\r
*\r
* \param h_src [IN] Source image pointer (Host or pinned memory)\r
\r
\r
/**\r
+ * Creates an integral image representation for the input image. Host implementation\r
+ * \see nppiStIntegral_8u32u_C1R_host\r
+ */\r
+NppStStatus nppiStIntegral_32f32f_C1R_host(NppSt32f *h_src, NppSt32u srcStep,\r
+ NppSt32f *h_dst, NppSt32u dstStep, NppStSize32u roiSize);\r
+\r
+\r
+/**\r
* Calculates the size of the temporary buffer for squared integral image creation\r
*\r
* \param roiSize [IN] Size of the input image\r
file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*")
source_group("Device" FILES ${lib_device_hdrs})
+if (HAVE_CUDA AND MSVC)
+ file(GLOB ncv_srcs "src/nvidia/*.cpp")
+ file(GLOB ncv_hdrs "src/nvidia/*.h*")
+ file(GLOB ncv_cuda "src/nvidia/*.cu")
+ source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
+endif()
+
if (HAVE_CUDA)
get_filename_component(_path_to_findnpp "${CMAKE_CURRENT_LIST_FILE}" PATH)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${_path_to_findnpp})
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
endif()
+
- CUDA_COMPILE(cuda_objs ${lib_cuda})
+ include(FindNPP_staging.cmake)
+ include_directories(${NPPST_INC})
+
+ CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda})
#CUDA_BUILD_CLEAN_TARGET()
endif()
-
-add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${cuda_objs})
-
-IF (HAVE_CUDA)
- include(FindNPP_staging.cmake)
- include_directories(${NPPST_INC})
- target_link_libraries(${the_target} ${NPPST_LIB})
-endif()
+add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp)
if (HAVE_CUDA)
target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES})
+ target_link_libraries(${the_target} ${NPPST_LIB})
CUDA_ADD_CUFFT_TO_TARGET(${the_target})
endif()
explicit BruteForceMatcher_GPU(L2<T> /*d*/) : BruteForceMatcher_GPU_base(L2Dist) {}\r
};\r
\r
- ////////////////////////////////// CascadeClassifier //////////////////////////////////////////\r
+ ////////////////////////////////// CascadeClassifier_GPU //////////////////////////////////////////\r
// The cascade classifier class for object detection.\r
- class CV_EXPORTS CascadeClassifier\r
+ class CV_EXPORTS CascadeClassifier_GPU\r
{\r
- public:\r
- struct CV_EXPORTS DTreeNode\r
- {\r
- int featureIdx;\r
- float threshold; // for ordered features only\r
- int left;\r
- int right;\r
- };\r
-\r
- struct CV_EXPORTS DTree\r
- {\r
- int nodeCount;\r
- };\r
-\r
- struct CV_EXPORTS Stage\r
- {\r
- int first;\r
- int ntrees;\r
- float threshold;\r
- };\r
-\r
- enum { BOOST = 0 };\r
- enum { DO_CANNY_PRUNING = 1, SCALE_IMAGE = 2,FIND_BIGGEST_OBJECT = 4, DO_ROUGH_SEARCH = 8 };\r
-\r
- CascadeClassifier();\r
- CascadeClassifier(const string& filename);\r
- ~CascadeClassifier();\r
+ public: \r
+ CascadeClassifier_GPU();\r
+ CascadeClassifier_GPU(const string& filename);\r
+ ~CascadeClassifier_GPU();\r
\r
bool empty() const;\r
bool load(const string& filename);\r
- bool read(const FileNode& node);\r
-\r
- void detectMultiScale( const Mat& image, vector<Rect>& objects, double scaleFactor=1.1,\r
- int minNeighbors=3, int flags=0, Size minSize=Size(), Size maxSize=Size());\r
-\r
- bool setImage( Ptr<FeatureEvaluator>&, const Mat& );\r
- int runAt( Ptr<FeatureEvaluator>&, Point );\r
-\r
- bool isStumpBased;\r
-\r
- int stageType;\r
- int featureType;\r
- int ncategories;\r
- Size origWinSize;\r
-\r
- vector<Stage> stages;\r
- vector<DTree> classifiers;\r
- vector<DTreeNode> nodes;\r
- vector<float> leaves;\r
- vector<int> subsets;\r
+ void release();\r
+ \r
+ /* returns number of detected objects */\r
+ int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size());\r
+ \r
+ bool findLargestObject;\r
+ bool visualizeInPlace;\r
\r
- Ptr<FeatureEvaluator> feval;\r
- Ptr<CvHaarClassifierCascade> oldCascade;\r
+ Size getClassifierSize() const;\r
+ private:\r
+ \r
+ struct CascadeClassifierImpl; \r
+ CascadeClassifierImpl* impl; \r
};\r
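
For orientation, a minimal usage sketch of the interface declared above; the cascade file name is only an example, and objectsBuf holds one cv::Rect per detection (as created by detectMultiScale when passed empty):

    cv::gpu::CascadeClassifier_GPU cascade("haarcascade_frontalface_alt.xml");

    cv::gpu::GpuMat d_gray;     // 8U grayscale frame already uploaded to the GPU
    cv::gpu::GpuMat d_objects;  // allocated by detectMultiScale if left empty
    int n = cascade.detectMultiScale(d_gray, d_objects);

    cv::Mat h_objects;
    d_objects.colRange(0, n).download(h_objects);  // copy back only the valid detections
    const cv::Rect* faces = h_objects.ptr<cv::Rect>();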
-\r
+ \r
////////////////////////////////// SURF //////////////////////////////////////////\r
\r
struct CV_EXPORTS SURFParams_GPU \r
{\r
- SURFParams_GPU() :\r
- threshold(0.1f), \r
- nOctaves(4),\r
- nIntervals(4),\r
- initialScale(2.f),\r
-\r
- l1(3.f/1.5f),\r
- l2(5.f/1.5f),\r
- l3(3.f/1.5f),\r
- l4(1.f/1.5f),\r
- edgeScale(0.81f),\r
- initialStep(1),\r
-\r
- extended(true),\r
-\r
- featuresRatio(0.01f)\r
- {\r
- }\r
+ SURFParams_GPU() : threshold(0.1f), nOctaves(4), nIntervals(4), initialScale(2.f), \r
+ l1(3.f/1.5f), l2(5.f/1.5f), l3(3.f/1.5f), l4(1.f/1.5f),\r
+ edgeScale(0.81f), initialStep(1), extended(true), featuresRatio(0.01f) {}\r
\r
//! The interest operator threshold\r
float threshold;\r
\r
if (src.type() == CV_8UC1)\r
{\r
- nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz,\r
- nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );\r
+ nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );\r
}\r
else\r
{\r
pValues3[1] = nppLut3[1].ptr<Npp32s>();\r
pValues3[2] = nppLut3[2].ptr<Npp32s>();\r
}\r
- nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz,\r
- pValues3, lvls.pLevels3, lvls.nValues3) );\r
+ nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, pValues3, lvls.pLevels3, lvls.nValues3) );\r
}\r
}\r
\r
\r
#include "precomp.hpp"\r
\r
-\r
-\r
-\r
using namespace cv;\r
using namespace cv::gpu;\r
using namespace std;\r
\r
-#if !defined (HAVE_CUDA)\r
\r
-cv::gpu::CascadeClassifier::CascadeClassifier() { throw_nogpu(); }\r
-cv::gpu::CascadeClassifier::CascadeClassifier(const string&) { throw_nogpu(); }\r
-cv::gpu::CascadeClassifier::~CascadeClassifier() { throw_nogpu(); }\r
+#if !defined (HAVE_CUDA) || !defined(_MSC_VER) || (_MSC_VER != 1500)\r
\r
-bool cv::gpu::CascadeClassifier::empty() const { throw_nogpu(); return true; }\r
-bool cv::gpu::CascadeClassifier::load(const string& filename) { throw_nogpu(); return true; }\r
-bool cv::gpu::CascadeClassifier::read(const FileNode& node) { throw_nogpu(); return true; }\r
+cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU() { throw_nogpu(); }\r
+cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const string&) { throw_nogpu(); }\r
+cv::gpu::CascadeClassifier_GPU::~CascadeClassifier_GPU() { throw_nogpu(); }\r
\r
-void cv::gpu::CascadeClassifier::detectMultiScale( const Mat&, vector<Rect>&, double, int, int, Size, Size) { throw_nogpu(); }\r
+bool cv::gpu::CascadeClassifier_GPU::empty() const { throw_nogpu(); return true; }\r
+bool cv::gpu::CascadeClassifier_GPU::load(const string&) { throw_nogpu(); return true; }\r
+Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const { throw_nogpu(); return Size(); }\r
\r
- \r
+int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& , GpuMat& , double , int , Size) { throw_nogpu(); return 0; }\r
\r
+#if defined (HAVE_CUDA)\r
+ NCVStatus loadFromXML(const string&, HaarClassifierCascadeDescriptor&, vector<HaarStage64>&, \r
+ vector<HaarClassifierNode128>&, vector<HaarFeature64>&) { throw_nogpu(); return NCVStatus(); }\r
\r
+ void groupRectangles(vector<NcvRect32u>&, int, double, vector<Ncv32u>*) { throw_nogpu(); }\r
+#endif\r
\r
#else\r
\r
+struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl\r
+{ \r
+ CascadeClassifierImpl(const string& filename) : lastAllocatedFrameSize(-1, -1)\r
+ {\r
+ ncvSetDebugOutputHandler(NCVDebugOutputHandler); \r
+        if (NCV_SUCCESS != load(filename))\r
+            CV_Error(CV_GpuApiCallError, "Error in GPU cascade load");\r
+ } \r
+ NCVStatus process(const GpuMat& src, GpuMat& objects, float scaleStep, int minNeighbors, bool findLargestObject, bool visualizeInPlace, NcvSize32u ncvMinSize, /*out*/unsigned int& numDetections)\r
+ { \r
+ calculateMemReqsAndAllocate(src.size()); \r
\r
-cv::gpu::CascadeClassifier::CascadeClassifier()\r
-{\r
+ NCVMemPtr src_beg;\r
+ src_beg.ptr = (void*)src.ptr<Ncv8u>();\r
+ src_beg.memtype = NCVMemoryTypeDevice;\r
\r
-}\r
+ NCVMemSegment src_seg;\r
+ src_seg.begin = src_beg;\r
+ src_seg.size = src.step * src.rows;\r
\r
-cv::gpu::CascadeClassifier::CascadeClassifier(const string& filename)\r
-{\r
+ NCVMatrixReuse<Ncv8u> d_src(src_seg, devProp.textureAlignment, src.cols, src.rows, src.step, true); \r
+ \r
+ //NCVMatrixAlloc<Ncv8u> d_src(*gpuAllocator, src.cols, src.rows);\r
+ //ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
\r
-}\r
+ //NCVMatrixAlloc<Ncv8u> h_src(*cpuAllocator, src.cols, src.rows);\r
+ //ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
\r
-cv::gpu::CascadeClassifier::~CascadeClassifier()\r
-{\r
+ CV_Assert(objects.rows == 1);\r
+\r
+ NCVMemPtr objects_beg;\r
+ objects_beg.ptr = (void*)objects.ptr<NcvRect32u>();\r
+ objects_beg.memtype = NCVMemoryTypeDevice;\r
+\r
+ NCVMemSegment objects_seg;\r
+ objects_seg.begin = objects_beg;\r
+ objects_seg.size = objects.step * objects.rows;\r
+ NCVVectorReuse<NcvRect32u> d_rects(objects_seg, objects.cols);\r
+ //NCVVectorAlloc<NcvRect32u> d_rects(*gpuAllocator, 100); \r
+ //ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); \r
+ \r
+ NcvSize32u roi;\r
+ roi.width = d_src.width();\r
+ roi.height = d_src.height();\r
+\r
+ Ncv32u flags = 0;\r
+ flags |= findLargestObject? NCVPipeObjDet_FindLargestObject : 0;\r
+ flags |= visualizeInPlace ? NCVPipeObjDet_VisualizeInPlace : 0;\r
+ \r
+ ncvStat = ncvDetectObjectsMultiScale_device(\r
+ d_src, roi, d_rects, numDetections, haar, *h_haarStages,\r
+ *d_haarStages, *d_haarNodes, *d_haarFeatures,\r
+ ncvMinSize,\r
+ minNeighbors,\r
+ scaleStep, 1,\r
+ flags,\r
+ *gpuAllocator, *cpuAllocator, devProp.major, devProp.minor, 0);\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);\r
+ \r
+ return NCV_SUCCESS;\r
+ }\r
+ ////\r
\r
+ NcvSize32u getClassifierSize() const { return haar.ClassifierSize; }\r
+ cv::Size getClassifierCvSize() const { return cv::Size(haar.ClassifierSize.width, haar.ClassifierSize.height); }\r
+private:\r
+\r
+ static void NCVDebugOutputHandler(const char* msg) { CV_Error(CV_GpuApiCallError, msg); }\r
+\r
+ NCVStatus load(const string& classifierFile)\r
+ { \r
+ int devId = cv::gpu::getDevice(); \r
+ ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR);\r
+\r
+ // Load the classifier from file (assuming its size is about 1 mb) using a simple allocator\r
+ gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice); \r
+ cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned);\r
+\r
+ ncvAssertPrintReturn(gpuCascadeAllocator->isInitialized(), "Error creating cascade GPU allocator", NCV_CUDA_ERROR);\r
+ ncvAssertPrintReturn(cpuCascadeAllocator->isInitialized(), "Error creating cascade CPU allocator", NCV_CUDA_ERROR);\r
+\r
+ Ncv32u haarNumStages, haarNumNodes, haarNumFeatures;\r
+ ncvStat = ncvHaarGetClassifierSize(classifierFile, haarNumStages, haarNumNodes, haarNumFeatures);\r
+ ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", NCV_FILE_ERROR);\r
+\r
+ h_haarStages = new NCVVectorAlloc<HaarStage64>(*cpuCascadeAllocator, haarNumStages); \r
+ h_haarNodes = new NCVVectorAlloc<HaarClassifierNode128>(*cpuCascadeAllocator, haarNumNodes);\r
+ h_haarFeatures = new NCVVectorAlloc<HaarFeature64>(*cpuCascadeAllocator, haarNumFeatures);\r
+\r
+ ncvAssertPrintReturn(h_haarStages->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);\r
+ ncvAssertPrintReturn(h_haarNodes->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR); \r
+ ncvAssertPrintReturn(h_haarFeatures->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);\r
+\r
+ ncvStat = ncvHaarLoadFromFile_host(classifierFile, haar, *h_haarStages, *h_haarNodes, *h_haarFeatures);\r
+ ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", NCV_FILE_ERROR);\r
+\r
+ d_haarStages = new NCVVectorAlloc<HaarStage64>(*gpuCascadeAllocator, haarNumStages);\r
+ d_haarNodes = new NCVVectorAlloc<HaarClassifierNode128>(*gpuCascadeAllocator, haarNumNodes);\r
+ d_haarFeatures = new NCVVectorAlloc<HaarFeature64>(*gpuCascadeAllocator, haarNumFeatures);\r
+\r
+ ncvAssertPrintReturn(d_haarStages->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);\r
+ ncvAssertPrintReturn(d_haarNodes->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR); \r
+ ncvAssertPrintReturn(d_haarFeatures->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);\r
+\r
+ ncvStat = h_haarStages->copySolid(*d_haarStages, 0);\r
+ ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);\r
+ ncvStat = h_haarNodes->copySolid(*d_haarNodes, 0);\r
+ ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);\r
+ ncvStat = h_haarFeatures->copySolid(*d_haarFeatures, 0);\r
+ ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR); \r
+\r
+ return NCV_SUCCESS;\r
+ }\r
+ ////\r
+\r
+ NCVStatus calculateMemReqsAndAllocate(const Size& frameSize)\r
+ { \r
+ if (lastAllocatedFrameSize == frameSize)\r
+ return NCV_SUCCESS;\r
+\r
+ // Calculate memory requirements and create real allocators\r
+ NCVMemStackAllocator gpuCounter(devProp.textureAlignment);\r
+ NCVMemStackAllocator cpuCounter(devProp.textureAlignment);\r
+\r
+ ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", NCV_CUDA_ERROR); \r
+ ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", NCV_CUDA_ERROR);\r
+ \r
+ NCVMatrixAlloc<Ncv8u> d_src(gpuCounter, frameSize.width, frameSize.height);\r
+ NCVMatrixAlloc<Ncv8u> h_src(cpuCounter, frameSize.width, frameSize.height);\r
+\r
+ ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); \r
+ ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+ NCVVectorAlloc<NcvRect32u> d_rects(gpuCounter, 100); \r
+ ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+ NcvSize32u roi;\r
+ roi.width = d_src.width();\r
+ roi.height = d_src.height();\r
+ Ncv32u numDetections;\r
+ ncvStat = ncvDetectObjectsMultiScale_device(d_src, roi, d_rects, numDetections, haar, *h_haarStages,\r
+ *d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp.major, devProp.minor, 0);\r
+\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);\r
+ \r
+ gpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), devProp.textureAlignment); \r
+ cpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), devProp.textureAlignment);\r
+\r
+ ncvAssertPrintReturn(gpuAllocator->isInitialized(), "Error creating GPU memory allocator", NCV_CUDA_ERROR);\r
+ ncvAssertPrintReturn(cpuAllocator->isInitialized(), "Error creating CPU memory allocator", NCV_CUDA_ERROR); \r
+        lastAllocatedFrameSize = frameSize;\r
+        return NCV_SUCCESS;\r
+ }\r
+ //// \r
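
calculateMemReqsAndAllocate is an instance of NCV's two-pass allocation scheme: the pipeline first runs against counting allocators that merely record the peak footprint, and real stack allocators of exactly that size are created afterwards. Condensed to its core, with runDetection as a hypothetical stand-in for the ncvDetectObjectsMultiScale_device call:

    NCVMemStackAllocator gpuCounter(devProp.textureAlignment);  // counting mode: records sizes, allocates nothing
    runDetection(gpuCounter);                                   // dry run measures peak device usage

    NCVMemStackAllocator gpuReal(NCVMemoryTypeDevice,
                                 gpuCounter.maxSize(),          // exactly the measured peak
                                 devProp.textureAlignment);
    runDetection(gpuReal);                                      // real runs reuse one preallocated block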
+\r
+ cudaDeviceProp devProp;\r
+ NCVStatus ncvStat;\r
+\r
+ Ptr<NCVMemNativeAllocator> gpuCascadeAllocator; \r
+ Ptr<NCVMemNativeAllocator> cpuCascadeAllocator;\r
+\r
+ Ptr<NCVVectorAlloc<HaarStage64> > h_haarStages; \r
+ Ptr<NCVVectorAlloc<HaarClassifierNode128> > h_haarNodes;\r
+ Ptr<NCVVectorAlloc<HaarFeature64> > h_haarFeatures;\r
+\r
+ HaarClassifierCascadeDescriptor haar;\r
+\r
+ Ptr<NCVVectorAlloc<HaarStage64> > d_haarStages;\r
+ Ptr<NCVVectorAlloc<HaarClassifierNode128> > d_haarNodes;\r
+ Ptr<NCVVectorAlloc<HaarFeature64> > d_haarFeatures;\r
+\r
+ Size lastAllocatedFrameSize;\r
+\r
+ Ptr<NCVMemStackAllocator> gpuAllocator; \r
+ Ptr<NCVMemStackAllocator> cpuAllocator;\r
+};\r
+\r
+\r
+\r
+cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU() : findLargestObject(false), visualizeInPlace(false), impl(0) {}\r
+cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const string& filename) : findLargestObject(false), visualizeInPlace(false), impl(0) { load(filename); }\r
+cv::gpu::CascadeClassifier_GPU::~CascadeClassifier_GPU() { release(); }\r
+bool cv::gpu::CascadeClassifier_GPU::empty() const { return impl == 0; }\r
+\r
+void cv::gpu::CascadeClassifier_GPU::release() { if (impl) { delete impl; impl = 0; } }\r
+\r
+bool cv::gpu::CascadeClassifier_GPU::load(const string& filename)\r
+{ \r
+ release();\r
+ impl = new CascadeClassifierImpl(filename);\r
+ return !this->empty(); \r
}\r
\r
-bool cv::gpu::CascadeClassifier::empty() const\r
+Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const\r
{\r
- int *a = (int*)&nppiStTranspose_32u_C1R;\r
- return *a == 0xFFFFF;\r
- return true;\r
+ return this->empty() ? Size() : impl->getClassifierCvSize();\r
}\r
+ \r
+int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor, int minNeighbors, Size minSize)\r
+{ \r
+ CV_Assert( scaleFactor > 1 && image.depth() == CV_8U);\r
+ CV_Assert( !this->empty());\r
+ \r
+ const int defaultObjSearchNum = 100;\r
+ if (objectsBuf.empty())\r
+ objectsBuf.create(1, defaultObjSearchNum, DataType<Rect>::type);\r
+ \r
+ NcvSize32u ncvMinSize = impl->getClassifierSize();\r
\r
-bool cv::gpu::CascadeClassifier::load(const string& filename)\r
-{\r
- return true;\r
+ if (ncvMinSize.width < (unsigned)minSize.width && ncvMinSize.height < (unsigned)minSize.height)\r
+ {\r
+ ncvMinSize.width = minSize.width;\r
+ ncvMinSize.height = minSize.height;\r
+ } \r
+ \r
+ unsigned int numDetections;\r
+ NCVStatus ncvStat = impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections); \r
+ if (ncvStat != NCV_SUCCESS)\r
+        CV_Error(CV_GpuApiCallError, "Error in face detection");\r
+\r
+ return numDetections;\r
}\r
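
The two public flags map onto the NCVPipeObjDet_* bits passed through process() above. An illustrative sequence (the file name is an example; the input must be an 8U image):

    cv::gpu::CascadeClassifier_GPU cascade("haarcascade_frontalface_alt.xml");
    cascade.findLargestObject = true;   // keep only the strongest hypothesis
    cascade.visualizeInPlace  = true;   // draw detections directly into the input frame

    cv::gpu::GpuMat d_frame, d_buf;
    int found = cascade.detectMultiScale(d_frame, d_buf, /*scaleFactor*/1.2, /*minNeighbors*/4);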
\r
-bool cv::gpu::CascadeClassifier::read(const FileNode& node)\r
+ struct RectConvert\r
+ {\r
+ Rect operator()(const NcvRect32u& nr) const { return Rect(nr.x, nr.y, nr.width, nr.height); }\r
+ NcvRect32u operator()(const Rect& nr) const \r
+ { \r
+ NcvRect32u rect;\r
+ rect.x = nr.x;\r
+ rect.y = nr.y;\r
+ rect.width = nr.width;\r
+ rect.height = nr.height;\r
+ return rect; \r
+ }\r
+ };\r
+\r
+ void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights)\r
+ {\r
+ vector<Rect> rects(hypotheses.size()); \r
+ std::transform(hypotheses.begin(), hypotheses.end(), rects.begin(), RectConvert());\r
+ \r
+ if (weights) \r
+ {\r
+ vector<int> weights_int;\r
+ weights_int.assign(weights->begin(), weights->end()); \r
+ cv::groupRectangles(rects, weights_int, groupThreshold, eps);\r
+ }\r
+ else\r
+ { \r
+ cv::groupRectangles(rects, groupThreshold, eps);\r
+ }\r
+ std::transform(rects.begin(), rects.end(), hypotheses.begin(), RectConvert()); \r
+ hypotheses.resize(rects.size());\r
+ }\r
+\r
+\r
+#if 1 /* loadFromXML implementation switch */\r
+\r
+NCVStatus loadFromXML(const std::string &filename, \r
+ HaarClassifierCascadeDescriptor &haar, \r
+ std::vector<HaarStage64> &haarStages, \r
+ std::vector<HaarClassifierNode128> &haarClassifierNodes, \r
+ std::vector<HaarFeature64> &haarFeatures)\r
{\r
- return true;\r
+ NCVStatus ncvStat;\r
+\r
+ haar.NumStages = 0;\r
+ haar.NumClassifierRootNodes = 0;\r
+ haar.NumClassifierTotalNodes = 0;\r
+ haar.NumFeatures = 0;\r
+ haar.ClassifierSize.width = 0;\r
+ haar.ClassifierSize.height = 0; \r
+ haar.bHasStumpsOnly = true;\r
+ haar.bNeedsTiltedII = false;\r
+ Ncv32u curMaxTreeDepth;\r
+\r
+ std::vector<char> xmlFileCont; \r
+\r
+ std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;\r
+ haarStages.resize(0);\r
+ haarClassifierNodes.resize(0);\r
+ haarFeatures.resize(0); \r
+ \r
+ Ptr<CvHaarClassifierCascade> oldCascade = (CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0);\r
+ if (oldCascade.empty())\r
+ return NCV_HAAR_XML_LOADING_EXCEPTION;\r
+ \r
+ ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////\r
+ \r
+ haar.ClassifierSize.width = oldCascade->orig_window_size.width;\r
+ haar.ClassifierSize.height = oldCascade->orig_window_size.height;\r
+\r
+    int stagesCount = oldCascade->count;\r
+    for(int s = 0; s < stagesCount; ++s) // by stages\r
+ {\r
+ HaarStage64 curStage;\r
+ curStage.setStartClassifierRootNodeOffset(haarClassifierNodes.size());\r
+\r
+ curStage.setStageThreshold(oldCascade->stage_classifier[s].threshold);\r
+\r
+ int treesCount = oldCascade->stage_classifier[s].count;\r
+        for(int t = 0; t < treesCount; ++t) // by trees\r
+ { \r
+ Ncv32u nodeId = 0;\r
+ CvHaarClassifier* tree = &oldCascade->stage_classifier[s].classifier[t];\r
+\r
+ int nodesCount = tree->count;\r
+ for(int n = 0; n < nodesCount; ++n) //by features \r
+ { \r
+ CvHaarFeature* feature = &tree->haar_feature[n];\r
+\r
+ HaarClassifierNode128 curNode; \r
+ curNode.setThreshold(tree->threshold[n]);\r
+ \r
+ HaarClassifierNodeDescriptor32 nodeLeft;\r
+ if ( tree->left[n] <= 0 )\r
+ { \r
+ Ncv32f leftVal = tree->alpha[-tree->left[n]];\r
+ ncvStat = nodeLeft.create(leftVal);\r
+ ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat); \r
+ }\r
+ else\r
+ { \r
+ Ncv32u leftNodeOffset = tree->left[n]; \r
+ nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1));\r
+ haar.bHasStumpsOnly = false;\r
+ }\r
+ curNode.setLeftNodeDesc(nodeLeft);\r
+ \r
+ HaarClassifierNodeDescriptor32 nodeRight;\r
+ if ( tree->right[n] <= 0 )\r
+ { \r
+ Ncv32f rightVal = tree->alpha[-tree->right[n]]; \r
+ ncvStat = nodeRight.create(rightVal);\r
+ ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);\r
+ }\r
+ else\r
+ { \r
+ Ncv32u rightNodeOffset = tree->right[n]; \r
+ nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1));\r
+ haar.bHasStumpsOnly = false;\r
+ }\r
+ curNode.setRightNodeDesc(nodeRight); \r
+\r
+ Ncv32u tiltedVal = feature->tilted;\r
+                haar.bNeedsTiltedII = haar.bNeedsTiltedII || (tiltedVal != 0);\r
+\r
+ Ncv32u featureId = 0; \r
+ for(int l = 0; l < CV_HAAR_FEATURE_MAX; ++l) //by rects\r
+ { \r
+ Ncv32u rectX = feature->rect[l].r.x; \r
+ Ncv32u rectY = feature->rect[l].r.y;\r
+ Ncv32u rectWidth = feature->rect[l].r.width;\r
+ Ncv32u rectHeight = feature->rect[l].r.height;\r
+\r
+ Ncv32f rectWeight = feature->rect[l].weight;\r
+\r
+ if (rectWeight == 0/* && rectX == 0 &&rectY == 0 && rectWidth == 0 && rectHeight == 0*/)\r
+ break;\r
+\r
+ HaarFeature64 curFeature;\r
+ ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height);\r
+ curFeature.setWeight(rectWeight);\r
+ ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);\r
+ haarFeatures.push_back(curFeature);\r
+\r
+ featureId++;\r
+ }\r
+\r
+ HaarFeatureDescriptor32 tmpFeatureDesc;\r
+ ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, featureId, haarFeatures.size() - featureId);\r
+ ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);\r
+ curNode.setFeatureDesc(tmpFeatureDesc);\r
+\r
+ if (!nodeId)\r
+ {\r
+ //root node\r
+ haarClassifierNodes.push_back(curNode);\r
+ curMaxTreeDepth = 1;\r
+ }\r
+ else\r
+ {\r
+ //other node\r
+ h_TmpClassifierNotRootNodes.push_back(curNode);\r
+ curMaxTreeDepth++;\r
+ }\r
+\r
+ nodeId++;\r
+ } \r
+ }\r
+\r
+ curStage.setNumClassifierRootNodes(treesCount);\r
+ haarStages.push_back(curStage); \r
+ }\r
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////\r
+\r
+\r
+ //fill in cascade stats\r
+ haar.NumStages = haarStages.size();\r
+ haar.NumClassifierRootNodes = haarClassifierNodes.size();\r
+ haar.NumClassifierTotalNodes = haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size();\r
+ haar.NumFeatures = haarFeatures.size();\r
+\r
+ //merge root and leaf nodes in one classifiers array\r
+ Ncv32u offsetRoot = haarClassifierNodes.size();\r
+ for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)\r
+ {\r
+ HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc();\r
+ if (!nodeLeft.isLeaf())\r
+ {\r
+ Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;\r
+ nodeLeft.create(newOffset);\r
+ }\r
+ haarClassifierNodes[i].setLeftNodeDesc(nodeLeft);\r
+\r
+ HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc();\r
+ if (!nodeRight.isLeaf())\r
+ {\r
+ Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;\r
+ nodeRight.create(newOffset);\r
+ }\r
+ haarClassifierNodes[i].setRightNodeDesc(nodeRight);\r
+ }\r
+ for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++)\r
+ {\r
+ HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc();\r
+ if (!nodeLeft.isLeaf())\r
+ {\r
+ Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;\r
+ nodeLeft.create(newOffset);\r
+ }\r
+ h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft);\r
+\r
+ HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc();\r
+ if (!nodeRight.isLeaf())\r
+ {\r
+ Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;\r
+ nodeRight.create(newOffset);\r
+ }\r
+ h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight);\r
+\r
+ haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]);\r
+ }\r
+\r
+ return NCV_SUCCESS;\r
}\r
\r
-void cv::gpu::CascadeClassifier::detectMultiScale( const Mat& image, vector<Rect>& objects, double scaleFactor, \r
- int minNeighbors, int flags, Size minSize, Size maxSize)\r
+////\r
+\r
+#else /* loadFromXML implementation switch */\r
\r
+#include "_rapidxml-1.13/rapidxml.hpp"\r
+\r
+NCVStatus loadFromXML(const std::string &filename,\r
+ HaarClassifierCascadeDescriptor &haar,\r
+ std::vector<HaarStage64> &haarStages,\r
+ std::vector<HaarClassifierNode128> &haarClassifierNodes,\r
+ std::vector<HaarFeature64> &haarFeatures)\r
{\r
+ NCVStatus ncvStat;\r
+\r
+ haar.NumStages = 0;\r
+ haar.NumClassifierRootNodes = 0;\r
+ haar.NumClassifierTotalNodes = 0;\r
+ haar.NumFeatures = 0;\r
+ haar.ClassifierSize.width = 0;\r
+ haar.ClassifierSize.height = 0;\r
+ haar.bNeedsTiltedII = false;\r
+ haar.bHasStumpsOnly = false;\r
+\r
+ FILE *fp;\r
+ fopen_s(&fp, filename.c_str(), "r");\r
+ ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);\r
+\r
+ //get file size\r
+ fseek(fp, 0, SEEK_END);\r
+ Ncv32u xmlSize = ftell(fp);\r
+ fseek(fp, 0, SEEK_SET);\r
+\r
+ //load file to vector\r
+ std::vector<char> xmlFileCont;\r
+ xmlFileCont.resize(xmlSize+1);\r
+ memset(&xmlFileCont[0], 0, xmlSize+1);\r
+ fread_s(&xmlFileCont[0], xmlSize, 1, xmlSize, fp);\r
+ fclose(fp);\r
+\r
+ haar.bHasStumpsOnly = true;\r
+ haar.bNeedsTiltedII = false;\r
+ Ncv32u curMaxTreeDepth;\r
+\r
+ std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;\r
+ haarStages.resize(0);\r
+ haarClassifierNodes.resize(0);\r
+ haarFeatures.resize(0);\r
+\r
+ //XML loading and OpenCV XML classifier syntax verification\r
+ try\r
+ {\r
+ rapidxml::xml_document<> doc;\r
+ doc.parse<0>(&xmlFileCont[0]);\r
+\r
+ //opencv_storage\r
+ rapidxml::xml_node<> *parserGlobal = doc.first_node();\r
+ ncvAssertReturn(!strcmp(parserGlobal->name(), "opencv_storage"), NCV_HAAR_XML_LOADING_EXCEPTION);\r
+\r
+ //classifier type\r
+ parserGlobal = parserGlobal->first_node();\r
+ ncvAssertReturn(parserGlobal, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+ rapidxml::xml_attribute<> *attr = parserGlobal->first_attribute("type_id");\r
+ ncvAssertReturn(!strcmp(attr->value(), "opencv-haar-classifier"), NCV_HAAR_XML_LOADING_EXCEPTION);\r
\r
+ //classifier size\r
+ parserGlobal = parserGlobal->first_node("size");\r
+ ncvAssertReturn(parserGlobal, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+ sscanf_s(parserGlobal->value(), "%d %d", &(haar.ClassifierSize.width), &(haar.ClassifierSize.height));\r
+\r
+ //parse stages\r
+ parserGlobal = parserGlobal->next_sibling("stages");\r
+ ncvAssertReturn(parserGlobal, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+ parserGlobal = parserGlobal->first_node("_");\r
+ ncvAssertReturn(parserGlobal, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+\r
+ while (parserGlobal)\r
+ {\r
+ HaarStage64 curStage;\r
+ curStage.setStartClassifierRootNodeOffset(haarClassifierNodes.size());\r
+ Ncv32u tmpNumClassifierRootNodes = 0;\r
+\r
+ rapidxml::xml_node<> *parserStageThreshold = parserGlobal->first_node("stage_threshold");\r
+ ncvAssertReturn(parserStageThreshold, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+ Ncv32f tmpStageThreshold;\r
+ sscanf_s(parserStageThreshold->value(), "%f", &tmpStageThreshold);\r
+ curStage.setStageThreshold(tmpStageThreshold);\r
+\r
+ //parse trees\r
+ rapidxml::xml_node<> *parserTree;\r
+ parserTree = parserGlobal->first_node("trees");\r
+ ncvAssertReturn(parserTree, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+ parserTree = parserTree->first_node("_");\r
+ ncvAssertReturn(parserTree, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+\r
+ while (parserTree)\r
+ {\r
+ rapidxml::xml_node<> *parserNode;\r
+ parserNode = parserTree->first_node("_");\r
+ ncvAssertReturn(parserNode, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+ Ncv32u nodeId = 0;\r
+\r
+ while (parserNode)\r
+ {\r
+ HaarClassifierNode128 curNode;\r
+\r
+ rapidxml::xml_node<> *parserNodeThreshold = parserNode->first_node("threshold");\r
+ ncvAssertReturn(parserNodeThreshold, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+ Ncv32f tmpThreshold;\r
+ sscanf_s(parserNodeThreshold->value(), "%f", &tmpThreshold);\r
+ curNode.setThreshold(tmpThreshold);\r
+\r
+ rapidxml::xml_node<> *parserNodeLeft = parserNode->first_node("left_val");\r
+ HaarClassifierNodeDescriptor32 nodeLeft;\r
+ if (parserNodeLeft)\r
+ {\r
+ Ncv32f leftVal;\r
+ sscanf_s(parserNodeLeft->value(), "%f", &leftVal);\r
+ ncvStat = nodeLeft.create(leftVal);\r
+ ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);\r
+ }\r
+ else\r
+ {\r
+ parserNodeLeft = parserNode->first_node("left_node");\r
+ ncvAssertReturn(parserNodeLeft, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+ Ncv32u leftNodeOffset;\r
+ sscanf_s(parserNodeLeft->value(), "%d", &leftNodeOffset);\r
+ nodeLeft.create(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1);\r
+ haar.bHasStumpsOnly = false;\r
+ }\r
+ curNode.setLeftNodeDesc(nodeLeft);\r
+\r
+ rapidxml::xml_node<> *parserNodeRight = parserNode->first_node("right_val");\r
+ HaarClassifierNodeDescriptor32 nodeRight;\r
+ if (parserNodeRight)\r
+ {\r
+ Ncv32f rightVal;\r
+ sscanf_s(parserNodeRight->value(), "%f", &rightVal);\r
+ ncvStat = nodeRight.create(rightVal);\r
+ ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);\r
+ }\r
+ else\r
+ {\r
+ parserNodeRight = parserNode->first_node("right_node");\r
+ ncvAssertReturn(parserNodeRight, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+ Ncv32u rightNodeOffset;\r
+ sscanf_s(parserNodeRight->value(), "%d", &rightNodeOffset);\r
+ nodeRight.create(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1);\r
+ haar.bHasStumpsOnly = false;\r
+ }\r
+ curNode.setRightNodeDesc(nodeRight);\r
+\r
+ rapidxml::xml_node<> *parserNodeFeatures = parserNode->first_node("feature");\r
+ ncvAssertReturn(parserNodeFeatures, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+\r
+ rapidxml::xml_node<> *parserNodeFeaturesTilted = parserNodeFeatures->first_node("tilted");\r
+ ncvAssertReturn(parserNodeFeaturesTilted, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+ Ncv32u tiltedVal;\r
+ sscanf_s(parserNodeFeaturesTilted->value(), "%d", &tiltedVal);\r
+                    haar.bNeedsTiltedII = haar.bNeedsTiltedII || (tiltedVal != 0);\r
+\r
+ rapidxml::xml_node<> *parserNodeFeaturesRects = parserNodeFeatures->first_node("rects");\r
+ ncvAssertReturn(parserNodeFeaturesRects, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+ parserNodeFeaturesRects = parserNodeFeaturesRects->first_node("_");\r
+ ncvAssertReturn(parserNodeFeaturesRects, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+ Ncv32u featureId = 0;\r
+\r
+ while (parserNodeFeaturesRects)\r
+ {\r
+ Ncv32u rectX, rectY, rectWidth, rectHeight;\r
+ Ncv32f rectWeight;\r
+ sscanf_s(parserNodeFeaturesRects->value(), "%d %d %d %d %f", &rectX, &rectY, &rectWidth, &rectHeight, &rectWeight);\r
+ HaarFeature64 curFeature;\r
+ ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height);\r
+ curFeature.setWeight(rectWeight);\r
+ ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);\r
+ haarFeatures.push_back(curFeature);\r
+\r
+ parserNodeFeaturesRects = parserNodeFeaturesRects->next_sibling("_");\r
+ featureId++;\r
+ }\r
+\r
+ HaarFeatureDescriptor32 tmpFeatureDesc;\r
+ ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, featureId, haarFeatures.size() - featureId);\r
+ ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);\r
+ curNode.setFeatureDesc(tmpFeatureDesc);\r
+\r
+ if (!nodeId)\r
+ {\r
+ //root node\r
+ haarClassifierNodes.push_back(curNode);\r
+ curMaxTreeDepth = 1;\r
+ }\r
+ else\r
+ {\r
+ //other node\r
+ h_TmpClassifierNotRootNodes.push_back(curNode);\r
+ curMaxTreeDepth++;\r
+ }\r
+\r
+ parserNode = parserNode->next_sibling("_");\r
+ nodeId++;\r
+ }\r
+\r
+ parserTree = parserTree->next_sibling("_");\r
+ tmpNumClassifierRootNodes++;\r
+ }\r
+\r
+ curStage.setNumClassifierRootNodes(tmpNumClassifierRootNodes);\r
+ haarStages.push_back(curStage);\r
+\r
+ parserGlobal = parserGlobal->next_sibling("_");\r
+ }\r
+ }\r
+ catch (...)\r
+ {\r
+ return NCV_HAAR_XML_LOADING_EXCEPTION;\r
+ }\r
+\r
+ //fill in cascade stats\r
+ haar.NumStages = haarStages.size();\r
+ haar.NumClassifierRootNodes = haarClassifierNodes.size();\r
+ haar.NumClassifierTotalNodes = haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size();\r
+ haar.NumFeatures = haarFeatures.size();\r
+\r
+ //merge root and leaf nodes in one classifiers array\r
+ Ncv32u offsetRoot = haarClassifierNodes.size();\r
+ for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)\r
+ {\r
+ HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc();\r
+ if (!nodeLeft.isLeaf())\r
+ {\r
+ Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;\r
+ nodeLeft.create(newOffset);\r
+ }\r
+ haarClassifierNodes[i].setLeftNodeDesc(nodeLeft);\r
+\r
+ HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc();\r
+ if (!nodeRight.isLeaf())\r
+ {\r
+ Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;\r
+ nodeRight.create(newOffset);\r
+ }\r
+ haarClassifierNodes[i].setRightNodeDesc(nodeRight);\r
+ }\r
+ for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++)\r
+ {\r
+ HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc();\r
+ if (!nodeLeft.isLeaf())\r
+ {\r
+ Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;\r
+ nodeLeft.create(newOffset);\r
+ }\r
+ h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft);\r
+\r
+ HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc();\r
+ if (!nodeRight.isLeaf())\r
+ {\r
+ Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;\r
+ nodeRight.create(newOffset);\r
+ }\r
+ h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight);\r
+\r
+ haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]);\r
+ }\r
+\r
+ return NCV_SUCCESS;\r
}\r
\r
-#endif
\ No newline at end of file
+#endif /* loadFromXML implementation switch */\r
+\r
+#endif /* HAVE_CUDA */\r
+\r
+\r
BORDER_REPLICATE_GPU,\r
BORDER_CONSTANT_GPU\r
};\r
- \r
+ \r
// Converts CPU border extrapolation mode into GPU internal analogue.\r
// Returns true if the GPU analogue exists, false otherwise.\r
bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);\r
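
A typical call pattern for this helper (the border constant comes from OpenCV; the error message is illustrative):

    int gpuBorderType;
    if (!tryConvertToGpuBorderType(cv::BORDER_REPLICATE, gpuBorderType))
        CV_Error(CV_StsBadArg, "Unsupported border extrapolation mode");
    // gpuBorderType now holds BORDER_REPLICATE_GPU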
const textureReference* tex; \r
cudaSafeCall( cudaGetTextureReference(&tex, name) ); \r
cudaSafeCall( cudaUnbindTexture(tex) );\r
- } \r
+ } \r
+\r
+ struct KeyPoint_GPU\r
+ {\r
+ float x;\r
+ float y;\r
+ float size;\r
+ float response;\r
+ float angle;\r
+ float octave;\r
+ };\r
\r
+ enum KeypointLayout \r
+ {\r
+ SF_X,\r
+ SF_Y,\r
+ SF_SIZE,\r
+ SF_RESPONSE,\r
+ SF_ANGLE,\r
+ SF_OCTAVE,\r
+ SF_FEATURE_STRIDE\r
+ };\r
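
Judging by the names, KeypointLayout describes a structure-of-arrays keypoint matrix: one row per attribute and SF_FEATURE_STRIDE rows in total, with one column per feature. Under that assumption (d_keypoints is a hypothetical SF_FEATURE_STRIDE x N CV_32F device matrix), reading back the first keypoint's position would look like:

    cv::Mat h_kp;
    d_keypoints.download(h_kp);               // host copy of the device keypoint matrix
    const float* xs = h_kp.ptr<float>(SF_X);  // row of x coordinates
    const float* ys = h_kp.ptr<float>(SF_Y);  // row of y coordinates
    cv::Point2f first(xs[0], ys[0]);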
}\r
}\r
\r
{\r
namespace gpu\r
{\r
- namespace surf\r
- {\r
- struct KeyPoint_GPU\r
- {\r
- float x;\r
- float y;\r
- float size;\r
- float response;\r
- float angle;\r
- float octave;\r
- };\r
-\r
- enum KeypointLayout \r
- {\r
- SF_X,\r
- SF_Y,\r
- SF_SIZE,\r
- SF_RESPONSE,\r
- SF_ANGLE,\r
- SF_OCTAVE,\r
- SF_FEATURE_STRIDE\r
- };\r
- }\r
+ \r
}\r
}\r
\r
\r
namespace\r
{\r
- typedef NppStatus (*npp_arithm_8u_t)(const Npp8u* pSrc1, int nSrc1Step, const Npp8u* pSrc2, int nSrc2Step, Npp8u* pDst, int nDstStep,\r
- NppiSize oSizeROI, int nScaleFactor);\r
- typedef NppStatus (*npp_arithm_32s_t)(const Npp32s* pSrc1, int nSrc1Step, const Npp32s* pSrc2, int nSrc2Step, Npp32s* pDst,\r
- int nDstStep, NppiSize oSizeROI);\r
- typedef NppStatus (*npp_arithm_32f_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst,\r
- int nDstStep, NppiSize oSizeROI);\r
+ typedef NppStatus (*npp_arithm_8u_t)(const Npp8u* pSrc1, int nSrc1Step, const Npp8u* pSrc2, int nSrc2Step, Npp8u* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);\r
+ typedef NppStatus (*npp_arithm_32s_t)(const Npp32s* pSrc1, int nSrc1Step, const Npp32s* pSrc2, int nSrc2Step, Npp32s* pDst, int nDstStep, NppiSize oSizeROI);\r
+ typedef NppStatus (*npp_arithm_32f_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);\r
\r
void nppArithmCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst,\r
npp_arithm_8u_t npp_func_8uc1, npp_arithm_8u_t npp_func_8uc4,\r
npp_arithm_32s_t npp_func_32sc1, npp_arithm_32f_t npp_func_32fc1)\r
{\r
CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());\r
-\r
CV_Assert(src1.type() == CV_8UC1 || src1.type() == CV_8UC4 || src1.type() == CV_32SC1 || src1.type() == CV_32FC1);\r
-\r
dst.create( src1.size(), src1.type() );\r
\r
NppiSize sz;\r
switch (src1.type())\r
{\r
case CV_8UC1:\r
- nppSafeCall( npp_func_8uc1(src1.ptr<Npp8u>(), src1.step,\r
- src2.ptr<Npp8u>(), src2.step,\r
- dst.ptr<Npp8u>(), dst.step, sz, 0) );\r
+ nppSafeCall( npp_func_8uc1(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz, 0) );\r
break;\r
case CV_8UC4:\r
- nppSafeCall( npp_func_8uc4(src1.ptr<Npp8u>(), src1.step,\r
- src2.ptr<Npp8u>(), src2.step,\r
- dst.ptr<Npp8u>(), dst.step, sz, 0) );\r
+ nppSafeCall( npp_func_8uc4(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz, 0) );\r
break;\r
case CV_32SC1:\r
- nppSafeCall( npp_func_32sc1(src1.ptr<Npp32s>(), src1.step,\r
- src2.ptr<Npp32s>(), src2.step,\r
- dst.ptr<Npp32s>(), dst.step, sz) );\r
+ nppSafeCall( npp_func_32sc1(src1.ptr<Npp32s>(), src1.step, src2.ptr<Npp32s>(), src2.step, dst.ptr<Npp32s>(), dst.step, sz) );\r
break;\r
case CV_32FC1:\r
- nppSafeCall( npp_func_32fc1(src1.ptr<Npp32f>(), src1.step,\r
- src2.ptr<Npp32f>(), src2.step,\r
- dst.ptr<Npp32f>(), dst.step, sz) );\r
+ nppSafeCall( npp_func_32fc1(src1.ptr<Npp32f>(), src1.step, src2.ptr<Npp32f>(), src2.step, dst.ptr<Npp32f>(), dst.step, sz) );\r
break;\r
default:\r
CV_Assert(!"Unsupported source type");\r
template<int SCN> struct NppArithmScalarFunc;\r
template<> struct NppArithmScalarFunc<1>\r
{\r
- typedef NppStatus (*func_ptr)(const Npp32f *pSrc, int nSrcStep, Npp32f nValue, Npp32f *pDst,\r
- int nDstStep, NppiSize oSizeROI);\r
+ typedef NppStatus (*func_ptr)(const Npp32f *pSrc, int nSrcStep, Npp32f nValue, Npp32f *pDst, int nDstStep, NppiSize oSizeROI);\r
};\r
template<> struct NppArithmScalarFunc<2>\r
{\r
- typedef NppStatus (*func_ptr)(const Npp32fc *pSrc, int nSrcStep, Npp32fc nValue, Npp32fc *pDst,\r
- int nDstStep, NppiSize oSizeROI);\r
+ typedef NppStatus (*func_ptr)(const Npp32fc *pSrc, int nSrcStep, Npp32fc nValue, Npp32fc *pDst, int nDstStep, NppiSize oSizeROI);\r
};\r
\r
template<int SCN, typename NppArithmScalarFunc<SCN>::func_ptr func> struct NppArithmScalar;\r
+\r
template<typename NppArithmScalarFunc<1>::func_ptr func> struct NppArithmScalar<1, func>\r
{\r
static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst)\r
switch (src1.type())\r
{\r
case CV_8UC1:\r
- nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), src1.step,\r
- src2.ptr<Npp8u>(), src2.step,\r
- dst.ptr<Npp8u>(), dst.step, sz) );\r
+ nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz) );\r
break;\r
case CV_8UC4:\r
- nppSafeCall( nppiAbsDiff_8u_C4R(src1.ptr<Npp8u>(), src1.step,\r
- src2.ptr<Npp8u>(), src2.step,\r
- dst.ptr<Npp8u>(), dst.step, sz) );\r
+ nppSafeCall( nppiAbsDiff_8u_C4R(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz) );\r
break;\r
case CV_32SC1:\r
- nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr<Npp32s>(), src1.step,\r
- src2.ptr<Npp32s>(), src2.step,\r
- dst.ptr<Npp32s>(), dst.step, sz) );\r
+ nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr<Npp32s>(), src1.step, src2.ptr<Npp32s>(), src2.step, dst.ptr<Npp32s>(), dst.step, sz) );\r
break;\r
case CV_32FC1:\r
- nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr<Npp32f>(), src1.step,\r
- src2.ptr<Npp32f>(), src2.step,\r
- dst.ptr<Npp32f>(), dst.step, sz) );\r
+ nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr<Npp32f>(), src1.step, src2.ptr<Npp32f>(), src2.step, dst.ptr<Npp32f>(), dst.step, sz) );\r
break;\r
default:\r
CV_Assert(!"Unsupported source type");\r
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. \r
+// \r
+// By downloading, copying, installing or using the software you agree to this license.\r
+// If you do not agree to this license, do not download, install,\r
+// copy or use the software.\r
+//\r
+//\r
+// License Agreement\r
+// For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+// * Redistribution's of source code must retain the above copyright notice,\r
+// this list of conditions and the following disclaimer.\r
+//\r
+// * Redistribution's in binary form must reproduce the above copyright notice,\r
+// this list of conditions and the following disclaimer in the documentation\r
+// and/or other materials provided with the distribution.\r
+//\r
+// * The name of the copyright holders may not be used to endorse or promote products\r
+// derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+#include <cstdio>\r
+#include <cuda_runtime.h>\r
+\r
+#define CV_NO_BACKWARD_COMPATIBILITY\r
+\r
+#include "opencv2/opencv.hpp"\r
+\r
+#include "NCVHaarObjectDetection.hpp"\r
+\r
+using namespace cv;\r
+using namespace std;\r
+\r
+const Size preferredVideoFrameSize(640, 480);\r
+\r
+string preferredClassifier = "haarcascade_frontalface_alt.xml";\r
+string wndTitle = "NVIDIA Computer Vision SDK :: Face Detection in Video Feed";\r
+\r
+\r
+void printSyntax(void)\r
+{\r
+ printf("Syntax: FaceDetectionFeed.exe [-c cameranum | -v filename] classifier.xml\n");\r
+}\r
+\r
+\r
+void imagePrintf(Mat& img, int lineOffsY, Scalar color, const char *format, ...)\r
+{ \r
+ int fontFace = CV_FONT_HERSHEY_PLAIN;\r
+ double fontScale = 1; \r
+ \r
+ int baseline;\r
+ Size textSize = cv::getTextSize("T", fontFace, fontScale, 1, &baseline);\r
+\r
+ va_list arg_ptr;\r
+ va_start(arg_ptr, format);\r
+ int len = _vscprintf(format, arg_ptr) + 1;\r
+ \r
+ vector<char> strBuf(len); \r
+ vsprintf_s(&strBuf[0], len, format, arg_ptr);\r
+\r
+ Point org(1, 3 * textSize.height * (lineOffsY + 1) / 2); \r
+ putText(img, &strBuf[0], org, fontFace, fontScale, color);\r
+ va_end(arg_ptr); \r
+}\r
+\r
+\r
+NCVStatus process(Mat *srcdst,\r
+ Ncv32u width, Ncv32u height,\r
+ NcvBool bShowAllHypotheses, NcvBool bLargestFace,\r
+ HaarClassifierCascadeDescriptor &haar,\r
+ NCVVector<HaarStage64> &d_haarStages, NCVVector<HaarClassifierNode128> &d_haarNodes,\r
+ NCVVector<HaarFeature64> &d_haarFeatures, NCVVector<HaarStage64> &h_haarStages,\r
+ INCVMemAllocator &gpuAllocator,\r
+ INCVMemAllocator &cpuAllocator,\r
+ cudaDeviceProp &devProp)\r
+{\r
+ ncvAssertReturn(!((srcdst == NULL) ^ gpuAllocator.isCounting()), NCV_NULL_PTR);\r
+\r
+ NCVStatus ncvStat;\r
+\r
+ NCV_SET_SKIP_COND(gpuAllocator.isCounting());\r
+\r
+ NCVMatrixAlloc<Ncv8u> d_src(gpuAllocator, width, height);\r
+ ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+ NCVMatrixAlloc<Ncv8u> h_src(cpuAllocator, width, height);\r
+ ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+ NCVVectorAlloc<NcvRect32u> d_rects(gpuAllocator, 100); \r
+ ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+ Mat h_src_hdr(Size(width, height), CV_8U, h_src.ptr(), h_src.stride());\r
+\r
+ NCV_SKIP_COND_BEGIN \r
+ \r
+ (*srcdst).copyTo(h_src_hdr);\r
+ \r
+ ncvStat = h_src.copySolid(d_src, 0);\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);\r
+\r
+ NCV_SKIP_COND_END\r
+\r
+ NcvSize32u roi;\r
+ roi.width = d_src.width();\r
+ roi.height = d_src.height();\r
+\r
+ Ncv32u numDetections;\r
+ ncvStat = ncvDetectObjectsMultiScale_device(\r
+ d_src, roi, d_rects, numDetections, haar, h_haarStages,\r
+ d_haarStages, d_haarNodes, d_haarFeatures,\r
+ haar.ClassifierSize,\r
+ bShowAllHypotheses ? 0 : 4,\r
+ 1.2f, 1,\r
+ (bLargestFace ? NCVPipeObjDet_FindLargestObject : 0) | NCVPipeObjDet_VisualizeInPlace,\r
+ gpuAllocator, cpuAllocator, devProp.major, devProp.minor, 0);\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);\r
+\r
+ NCV_SKIP_COND_BEGIN\r
+\r
+ ncvStat = d_src.copySolid(h_src, 0);\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);\r
+\r
+ h_src_hdr.copyTo(*srcdst);\r
+ \r
+ NCV_SKIP_COND_END\r
+\r
+ return NCV_SUCCESS;\r
+}\r
+\r
+\r
+int main( int argc, const char** argv )\r
+{\r
+ NCVStatus ncvStat;\r
+\r
+ printf("NVIDIA Computer Vision SDK\n");\r
+ printf("Face Detection in video and live feed\n");\r
+ printf("=========================================\n");\r
+ printf(" Esc - Quit\n");\r
+ printf(" Space - Switch between NCV and OpenCV\n");\r
+ printf(" L - Switch between FullSearch and LargestFace modes\n");\r
+ printf(" U - Toggle unfiltered hypotheses visualization in FullSearch\n");\r
+ \r
+ if (argc != 4 && argc != 1)\r
+ return printSyntax(), -1;\r
+\r
+ VideoCapture capture; \r
+ Size frameSize;\r
+\r
+ if (argc == 1 || strcmp(argv[1], "-c") == 0)\r
+ {\r
+ // Camera input is specified\r
+        int camIdx = (argc == 4) ? atoi(argv[2]) : 0;\r
+ if(!capture.open(camIdx)) \r
+ return printf("Error opening camera\n"), -1; \r
+ \r
+ capture.set(CV_CAP_PROP_FRAME_WIDTH, preferredVideoFrameSize.width);\r
+ capture.set(CV_CAP_PROP_FRAME_HEIGHT, preferredVideoFrameSize.height);\r
+ capture.set(CV_CAP_PROP_FPS, 25);\r
+ frameSize = preferredVideoFrameSize;\r
+ }\r
+ else if (strcmp(argv[1], "-v") == 0)\r
+ {\r
+ // Video file input (avi)\r
+ if(!capture.open(argv[2]))\r
+ return printf("Error opening video file\n"), -1;\r
+\r
+ frameSize.width = (int)capture.get(CV_CAP_PROP_FRAME_WIDTH);\r
+ frameSize.height = (int)capture.get(CV_CAP_PROP_FRAME_HEIGHT);\r
+ }\r
+ else\r
+ return printSyntax(), -1;\r
+\r
+ NcvBool bUseOpenCV = true;\r
+ NcvBool bLargestFace = true;\r
+ NcvBool bShowAllHypotheses = false; \r
+\r
+ string classifierFile = (argc == 1) ? preferredClassifier : argv[3];\r
+ \r
+ CascadeClassifier classifierOpenCV;\r
+ if (!classifierOpenCV.load(classifierFile))\r
+ return printf("Error (in OpenCV) opening classifier\n"), printSyntax(), -1;\r
+\r
+ int devId;\r
+ ncvAssertCUDAReturn(cudaGetDevice(&devId), -1);\r
+ cudaDeviceProp devProp;\r
+ ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), -1);\r
+ printf("Using GPU %d %s, arch=%d.%d\n", devId, devProp.name, devProp.major, devProp.minor);\r
+\r
+ //==============================================================================\r
+ //\r
+ // Load the classifier from file (assuming its size is about 1 mb)\r
+ // using a simple allocator\r
+ //\r
+ //==============================================================================\r
+\r
+ NCVMemNativeAllocator gpuCascadeAllocator(NCVMemoryTypeDevice);\r
+ ncvAssertPrintReturn(gpuCascadeAllocator.isInitialized(), "Error creating cascade GPU allocator", -1);\r
+ NCVMemNativeAllocator cpuCascadeAllocator(NCVMemoryTypeHostPinned);\r
+ ncvAssertPrintReturn(cpuCascadeAllocator.isInitialized(), "Error creating cascade CPU allocator", -1);\r
+\r
+ Ncv32u haarNumStages, haarNumNodes, haarNumFeatures;\r
+ ncvStat = ncvHaarGetClassifierSize(classifierFile, haarNumStages, haarNumNodes, haarNumFeatures);\r
+ ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", -1);\r
+\r
+ NCVVectorAlloc<HaarStage64> h_haarStages(cpuCascadeAllocator, haarNumStages);\r
+ ncvAssertPrintReturn(h_haarStages.isMemAllocated(), "Error in cascade CPU allocator", -1);\r
+ NCVVectorAlloc<HaarClassifierNode128> h_haarNodes(cpuCascadeAllocator, haarNumNodes);\r
+ ncvAssertPrintReturn(h_haarNodes.isMemAllocated(), "Error in cascade CPU allocator", -1);\r
+ NCVVectorAlloc<HaarFeature64> h_haarFeatures(cpuCascadeAllocator, haarNumFeatures);\r
+ ncvAssertPrintReturn(h_haarFeatures.isMemAllocated(), "Error in cascade CPU allocator", -1);\r
+\r
+ HaarClassifierCascadeDescriptor haar;\r
+ ncvStat = ncvHaarLoadFromFile_host(classifierFile, haar, h_haarStages, h_haarNodes, h_haarFeatures);\r
+ ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", -1);\r
+\r
+ NCVVectorAlloc<HaarStage64> d_haarStages(gpuCascadeAllocator, haarNumStages);\r
+ ncvAssertPrintReturn(d_haarStages.isMemAllocated(), "Error in cascade GPU allocator", -1);\r
+ NCVVectorAlloc<HaarClassifierNode128> d_haarNodes(gpuCascadeAllocator, haarNumNodes);\r
+ ncvAssertPrintReturn(d_haarNodes.isMemAllocated(), "Error in cascade GPU allocator", -1);\r
+ NCVVectorAlloc<HaarFeature64> d_haarFeatures(gpuCascadeAllocator, haarNumFeatures);\r
+ ncvAssertPrintReturn(d_haarFeatures.isMemAllocated(), "Error in cascade GPU allocator", -1);\r
+\r
+ ncvStat = h_haarStages.copySolid(d_haarStages, 0);\r
+ ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);\r
+ ncvStat = h_haarNodes.copySolid(d_haarNodes, 0);\r
+ ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);\r
+ ncvStat = h_haarFeatures.copySolid(d_haarFeatures, 0);\r
+ ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);\r
+\r
+ //==============================================================================\r
+ //\r
+ // Calculate memory requirements and create real allocators\r
+ //\r
+ //==============================================================================\r
+\r
+ NCVMemStackAllocator gpuCounter(devProp.textureAlignment);\r
+ ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", -1);\r
+ NCVMemStackAllocator cpuCounter(devProp.textureAlignment);\r
+ ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", -1);\r
+\r
+ ncvStat = process(NULL, frameSize.width, frameSize.height,\r
+ false, false, haar,\r
+ d_haarStages, d_haarNodes,\r
+ d_haarFeatures, h_haarStages,\r
+ gpuCounter, cpuCounter, devProp);\r
+ ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error in memory counting pass", -1);\r
+\r
+ NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), devProp.textureAlignment);\r
+ ncvAssertPrintReturn(gpuAllocator.isInitialized(), "Error creating GPU memory allocator", -1);\r
+ NCVMemStackAllocator cpuAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), devProp.textureAlignment);\r
+ ncvAssertPrintReturn(cpuAllocator.isInitialized(), "Error creating CPU memory allocator", -1);\r
+\r
+ printf("Initialized for frame size [%dx%d]\n", frameSize.width, frameSize.height);\r
+\r
+ //==============================================================================\r
+ //\r
+ // Main processing loop\r
+ //\r
+ //==============================================================================\r
+ \r
+ namedWindow(wndTitle, 1);\r
+\r
+ Mat frame, gray, frameDisp;\r
+\r
+ for(;;)\r
+ {\r
+ // For camera and video file, capture the next image \r
+ capture >> frame;\r
+ if (frame.empty())\r
+ break;\r
+ \r
+ cvtColor(frame, gray, CV_BGR2GRAY);\r
+\r
+ // process\r
+ NcvSize32u minSize = haar.ClassifierSize;\r
+ if (bLargestFace)\r
+ {\r
+ Ncv32u ratioX = preferredVideoFrameSize.width / minSize.width;\r
+ Ncv32u ratioY = preferredVideoFrameSize.height / minSize.height;\r
+ Ncv32u ratioSmallest = std::min(ratioX, ratioY);\r
+ ratioSmallest = (Ncv32u)std::max(ratioSmallest / 2.5f, 1.f);\r
+ minSize.width *= ratioSmallest;\r
+ minSize.height *= ratioSmallest;\r
+ }\r
+ \r
+ NcvTimer timer = ncvStartTimer();\r
+\r
+ if (!bUseOpenCV)\r
+ {\r
+ ncvStat = process(&gray, frameSize.width, frameSize.height,\r
+ bShowAllHypotheses, bLargestFace, haar,\r
+ d_haarStages, d_haarNodes,\r
+ d_haarFeatures, h_haarStages,\r
+ gpuAllocator, cpuAllocator, devProp);\r
+            ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error in NCV face detection", -1);\r
+ }\r
+ else\r
+ {\r
+ vector<Rect> rectsOpenCV;\r
+\r
+ classifierOpenCV.detectMultiScale(\r
+ gray,\r
+ rectsOpenCV,\r
+ 1.2f,\r
+ bShowAllHypotheses && !bLargestFace ? 0 : 4,\r
+ (bLargestFace ? CV_HAAR_FIND_BIGGEST_OBJECT : 0) | CV_HAAR_SCALE_IMAGE,\r
+ Size(minSize.width, minSize.height));\r
+\r
+ for (size_t rt = 0; rt < rectsOpenCV.size(); ++rt)\r
+ rectangle(gray, rectsOpenCV[rt], Scalar(255));\r
+ }\r
+\r
+ Ncv32f avgTime = (Ncv32f)ncvEndQueryTimerMs(timer);\r
+ \r
+ cvtColor(gray, frameDisp, CV_GRAY2BGR);\r
+\r
+ imagePrintf(frameDisp, 0, CV_RGB(255, 0,0), "Space - Switch NCV%s / OpenCV%s", bUseOpenCV?"":" (ON)", bUseOpenCV?" (ON)":"");\r
+ imagePrintf(frameDisp, 1, CV_RGB(255, 0,0), "L - Switch FullSearch%s / LargestFace%s modes", bLargestFace?"":" (ON)", bLargestFace?" (ON)":"");\r
+ imagePrintf(frameDisp, 2, CV_RGB(255, 0,0), "U - Toggle unfiltered hypotheses visualization in FullSearch %s", bShowAllHypotheses?"(ON)":"(OFF)");\r
+ imagePrintf(frameDisp, 3, CV_RGB(118,185,0), " Running at %f FPS on %s", 1000.0f / avgTime, bUseOpenCV?"CPU":"GPU");\r
+\r
+ cv::imshow(wndTitle, frameDisp);\r
+\r
+ switch (cvWaitKey(1))\r
+ {\r
+ case ' ':\r
+ bUseOpenCV = !bUseOpenCV;\r
+ break;\r
+ case 'L':case 'l':\r
+ bLargestFace = !bLargestFace;\r
+ break;\r
+ case 'U':case 'u':\r
+ bShowAllHypotheses = !bShowAllHypotheses;\r
+ break;\r
+ case 27:\r
+ return 0; \r
+ }\r
+ }\r
+ \r
+ return 0;\r
+}\r
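The allocator setup in the sample above is the NCV two-pass pattern: a counting NCVMemStackAllocator (constructed with only an alignment, memory type NCVMemoryTypeNone) dry-runs the pipeline and records the peak demand via maxSize(), and the real allocators are then preallocated to exactly that capacity. A minimal sketch of the pattern, assuming a hypothetical doWork() routine that performs all of its allocations through the INCVMemAllocator interface:

NCVMemStackAllocator gpuCounter(devProp.textureAlignment);   //counting pass: no memory is touched
doWork(gpuCounter);
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice,
                                  gpuCounter.maxSize(),      //exact peak requirement of the dry run
                                  devProp.textureAlignment);
doWork(gpuAllocator);                                        //same code path, now backed by device memory

Because the real allocator grabs its whole arena once in its constructor and then only bumps pointers, the per-frame processing loop runs without any cudaMalloc calls.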
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. \r
+// \r
+// By downloading, copying, installing or using the software you agree to this license.\r
+// If you do not agree to this license, do not download, install,\r
+// copy or use the software.\r
+//\r
+//\r
+// License Agreement\r
+// For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+// * Redistribution's of source code must retain the above copyright notice,\r
+// this list of conditions and the following disclaimer.\r
+//\r
+// * Redistribution's in binary form must reproduce the above copyright notice,\r
+// this list of conditions and the following disclaimer in the documentation\r
+// and/or other materials provided with the distribution.\r
+//\r
+// * The name of the copyright holders may not be used to endorse or promote products\r
+// derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+\r
+#include <precomp.hpp>\r
+\r
+\r
+#if !defined (HAVE_CUDA)\r
+\r
+\r
+#else /* !defined (HAVE_CUDA) */\r
+\r
+\r
+#include <stdarg.h>\r
+#include "NCV.hpp"\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Error handling helpers\r
+//\r
+//==============================================================================\r
+\r
+\r
+static void stdioDebugOutput(const char *msg)\r
+{\r
+ printf("%s", msg);\r
+}\r
+\r
+\r
+static NCVDebugOutputHandler *debugOutputHandler = stdioDebugOutput;\r
+\r
+\r
+void ncvDebugOutput(const char *msg, ...)\r
+{\r
+ const int K_DEBUG_STRING_MAXLEN = 1024;\r
+ char buffer[K_DEBUG_STRING_MAXLEN];\r
+ va_list args;\r
+ va_start(args, msg);\r
+#if defined(_WIN32) || defined(_WIN64)\r
+    vsnprintf_s(buffer, K_DEBUG_STRING_MAXLEN, K_DEBUG_STRING_MAXLEN-1, msg, args);\r
+#else\r
+    vsnprintf(buffer, K_DEBUG_STRING_MAXLEN, msg, args);\r
+#endif\r
+ va_end (args);\r
+ debugOutputHandler(buffer);\r
+}\r
+\r
+\r
+void ncvSetDebugOutputHandler(NCVDebugOutputHandler *func)\r
+{\r
+ debugOutputHandler = func;\r
+}\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Memory wrappers and helpers\r
+//\r
+//==============================================================================\r
+\r
+\r
+NCVStatus GPUAlignmentValue(Ncv32u &alignment)\r
+{\r
+ int curDev;\r
+ cudaDeviceProp curProp;\r
+ ncvAssertCUDAReturn(cudaGetDevice(&curDev), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaGetDeviceProperties(&curProp, curDev), NCV_CUDA_ERROR);\r
+ alignment = curProp.textureAlignment; //GPUAlignmentValue(curProp.major);\r
+ return NCV_SUCCESS;\r
+}\r
+\r
+\r
+Ncv32u alignUp(Ncv32u what, Ncv32u alignment)\r
+{\r
+ Ncv32u alignMask = alignment-1;\r
+ Ncv32u inverseAlignMask = ~alignMask;\r
+ Ncv32u res = (what + alignMask) & inverseAlignMask;\r
+ return res;\r
+}\r
+\r
+\r
+void NCVMemPtr::clear()\r
+{\r
+ ptr = NULL;\r
+ memtype = NCVMemoryTypeNone;\r
+}\r
+\r
+\r
+void NCVMemSegment::clear()\r
+{\r
+ begin.clear();\r
+ size = 0;\r
+}\r
+\r
+\r
+NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType, const void *src, NCVMemoryType srcType, size_t sz, cudaStream_t cuStream)\r
+{\r
+ NCVStatus ncvStat;\r
+ switch (dstType)\r
+ {\r
+ case NCVMemoryTypeHostPageable:\r
+ case NCVMemoryTypeHostPinned:\r
+ switch (srcType)\r
+ {\r
+ case NCVMemoryTypeHostPageable:\r
+ case NCVMemoryTypeHostPinned:\r
+ memcpy(dst, src, sz);\r
+ ncvStat = NCV_SUCCESS;\r
+ break;\r
+ case NCVMemoryTypeDevice:\r
+ if (cuStream != 0)\r
+ {\r
+ ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);\r
+ }\r
+ else\r
+ {\r
+ ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);\r
+ }\r
+ ncvStat = NCV_SUCCESS;\r
+ break;\r
+ default:\r
+ ncvStat = NCV_MEM_RESIDENCE_ERROR;\r
+ }\r
+ break;\r
+ case NCVMemoryTypeDevice:\r
+ switch (srcType)\r
+ {\r
+ case NCVMemoryTypeHostPageable:\r
+ case NCVMemoryTypeHostPinned:\r
+ if (cuStream != 0)\r
+ {\r
+ ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);\r
+ }\r
+ else\r
+ {\r
+ ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyHostToDevice), NCV_CUDA_ERROR);\r
+ }\r
+ ncvStat = NCV_SUCCESS;\r
+ break;\r
+ case NCVMemoryTypeDevice:\r
+ if (cuStream != 0)\r
+ {\r
+ ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyDeviceToDevice, cuStream), NCV_CUDA_ERROR);\r
+ }\r
+ else\r
+ {\r
+ ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyDeviceToDevice), NCV_CUDA_ERROR);\r
+ }\r
+ ncvStat = NCV_SUCCESS;\r
+ break;\r
+ default:\r
+ ncvStat = NCV_MEM_RESIDENCE_ERROR;\r
+ }\r
+ break;\r
+ default:\r
+ ncvStat = NCV_MEM_RESIDENCE_ERROR;\r
+ }\r
+\r
+ return ncvStat;\r
+}\r
+\r
+\r
+//===================================================================\r
+//\r
+// NCVMemStackAllocator class members implementation\r
+//\r
+//===================================================================\r
+\r
+\r
+NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)\r
+ :\r
+ currentSize(0),\r
+ _maxSize(0),\r
+ allocBegin(NULL),\r
+ begin(NULL),\r
+ _memType(NCVMemoryTypeNone),\r
+ _alignment(alignment)\r
+{\r
+ NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;\r
+ ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");\r
+}\r
+\r
+\r
+NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment)\r
+ :\r
+ currentSize(0),\r
+ _maxSize(0),\r
+ allocBegin(NULL),\r
+ _memType(memT),\r
+ _alignment(alignment)\r
+{\r
+ NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;\r
+ ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2");\r
+\r
+ allocBegin = NULL;\r
+\r
+ switch (memT)\r
+ {\r
+ case NCVMemoryTypeDevice:\r
+ ncvAssertCUDAReturn(cudaMalloc(&allocBegin, capacity), );\r
+ break;\r
+ case NCVMemoryTypeHostPinned:\r
+ ncvAssertCUDAReturn(cudaMallocHost(&allocBegin, capacity), );\r
+ break;\r
+ case NCVMemoryTypeHostPageable:\r
+ allocBegin = (Ncv8u *)malloc(capacity);\r
+ break;\r
+ }\r
+\r
+    if (capacity == 0)\r
+    {\r
+        //use a dummy non-NULL pointer so that a zero-capacity allocator still reports isInitialized()\r
+        allocBegin = (Ncv8u *)(0x1);\r
+    }\r
+\r
+ if (!isCounting())\r
+ {\r
+ begin = allocBegin;\r
+ end = begin + capacity;\r
+ }\r
+}\r
+\r
+\r
+NCVMemStackAllocator::~NCVMemStackAllocator()\r
+{\r
+ if (allocBegin != NULL)\r
+ {\r
+ ncvAssertPrintCheck(currentSize == 0, "NCVMemStackAllocator dtor:: not all objects were deallocated properly, forcing destruction");\r
+ switch (_memType)\r
+ {\r
+ case NCVMemoryTypeDevice:\r
+ ncvAssertCUDAReturn(cudaFree(allocBegin), );\r
+ break;\r
+ case NCVMemoryTypeHostPinned:\r
+ ncvAssertCUDAReturn(cudaFreeHost(allocBegin), );\r
+ break;\r
+ case NCVMemoryTypeHostPageable:\r
+ free(allocBegin);\r
+ break;\r
+ }\r
+ allocBegin = NULL;\r
+ }\r
+}\r
+\r
+\r
+NCVStatus NCVMemStackAllocator::alloc(NCVMemSegment &seg, size_t size)\r
+{\r
+ seg.clear();\r
+ ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+ size = alignUp(size, this->_alignment);\r
+ this->currentSize += size;\r
+ this->_maxSize = std::max(this->_maxSize, this->currentSize);\r
+\r
+ if (!isCounting())\r
+ {\r
+ size_t availSize = end - begin;\r
+ ncvAssertReturn(size <= availSize, NCV_ALLOCATOR_INSUFFICIENT_CAPACITY);\r
+ }\r
+\r
+ seg.begin.ptr = begin;\r
+ seg.begin.memtype = this->_memType;\r
+ seg.size = size;\r
+ begin += size;\r
+\r
+ return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NCVStatus NCVMemStackAllocator::dealloc(NCVMemSegment &seg)\r
+{\r
+ ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);\r
+ ncvAssertReturn(seg.begin.memtype == this->_memType, NCV_ALLOCATOR_BAD_DEALLOC);\r
+ ncvAssertReturn(seg.begin.ptr != NULL || isCounting(), NCV_ALLOCATOR_BAD_DEALLOC);\r
+ ncvAssertReturn(seg.begin.ptr == begin - seg.size, NCV_ALLOCATOR_DEALLOC_ORDER);\r
+\r
+ currentSize -= seg.size;\r
+ begin -= seg.size;\r
+\r
+ seg.clear();\r
+\r
+ ncvAssertReturn(allocBegin <= begin, NCV_ALLOCATOR_BAD_DEALLOC);\r
+\r
+ return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NcvBool NCVMemStackAllocator::isInitialized(void) const\r
+{\r
+    return (((this->_alignment & (this->_alignment-1)) == 0) && isCounting()) || this->allocBegin != NULL;\r
+}\r
+\r
+\r
+NcvBool NCVMemStackAllocator::isCounting(void) const\r
+{\r
+ return this->_memType == NCVMemoryTypeNone;\r
+}\r
+\r
+\r
+NCVMemoryType NCVMemStackAllocator::memType(void) const\r
+{\r
+ return this->_memType;\r
+}\r
+\r
+\r
+Ncv32u NCVMemStackAllocator::alignment(void) const\r
+{\r
+ return this->_alignment;\r
+}\r
+\r
+\r
+size_t NCVMemStackAllocator::maxSize(void) const\r
+{\r
+ return this->_maxSize;\r
+}\r
+\r
+\r
+//===================================================================\r
+//\r
+// NCVMemNativeAllocator class members implementation\r
+//\r
+//===================================================================\r
+\r
+\r
+NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT)\r
+ :\r
+ currentSize(0),\r
+ _maxSize(0),\r
+ _memType(memT)\r
+{\r
+ ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );\r
+ ncvAssertPrintReturn(NCV_SUCCESS == GPUAlignmentValue(this->_alignment), "NCVMemNativeAllocator ctor:: couldn't get device _alignment", );\r
+}\r
+\r
+\r
+NCVMemNativeAllocator::~NCVMemNativeAllocator()\r
+{\r
+ ncvAssertPrintCheck(currentSize == 0, "NCVMemNativeAllocator dtor:: detected memory leak");\r
+}\r
+\r
+\r
+NCVStatus NCVMemNativeAllocator::alloc(NCVMemSegment &seg, size_t size)\r
+{\r
+ seg.clear();\r
+ ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+ switch (this->_memType)\r
+ {\r
+ case NCVMemoryTypeDevice:\r
+ ncvAssertCUDAReturn(cudaMalloc(&seg.begin.ptr, size), NCV_CUDA_ERROR);\r
+ break;\r
+ case NCVMemoryTypeHostPinned:\r
+ ncvAssertCUDAReturn(cudaMallocHost(&seg.begin.ptr, size), NCV_CUDA_ERROR);\r
+ break;\r
+ case NCVMemoryTypeHostPageable:\r
+ seg.begin.ptr = (Ncv8u *)malloc(size);\r
+ break;\r
+ }\r
+\r
+ this->currentSize += alignUp(size, this->_alignment);\r
+ this->_maxSize = std::max(this->_maxSize, this->currentSize);\r
+\r
+ seg.begin.memtype = this->_memType;\r
+ seg.size = size;\r
+\r
+ return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NCVStatus NCVMemNativeAllocator::dealloc(NCVMemSegment &seg)\r
+{\r
+ ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);\r
+ ncvAssertReturn(seg.begin.memtype == this->_memType, NCV_ALLOCATOR_BAD_DEALLOC);\r
+ ncvAssertReturn(seg.begin.ptr != NULL, NCV_ALLOCATOR_BAD_DEALLOC);\r
+\r
+ ncvAssertReturn(currentSize >= alignUp(seg.size, this->_alignment), NCV_ALLOCATOR_BAD_DEALLOC);\r
+ currentSize -= alignUp(seg.size, this->_alignment);\r
+\r
+ switch (this->_memType)\r
+ {\r
+ case NCVMemoryTypeDevice:\r
+ ncvAssertCUDAReturn(cudaFree(seg.begin.ptr), NCV_CUDA_ERROR);\r
+ break;\r
+ case NCVMemoryTypeHostPinned:\r
+ ncvAssertCUDAReturn(cudaFreeHost(seg.begin.ptr), NCV_CUDA_ERROR);\r
+ break;\r
+ case NCVMemoryTypeHostPageable:\r
+ free(seg.begin.ptr);\r
+ break;\r
+ }\r
+\r
+ seg.clear();\r
+\r
+ return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NcvBool NCVMemNativeAllocator::isInitialized(void) const\r
+{\r
+ return (this->_alignment != 0);\r
+}\r
+\r
+\r
+NcvBool NCVMemNativeAllocator::isCounting(void) const\r
+{\r
+ return false;\r
+}\r
+\r
+\r
+NCVMemoryType NCVMemNativeAllocator::memType(void) const\r
+{\r
+ return this->_memType;\r
+}\r
+\r
+\r
+Ncv32u NCVMemNativeAllocator::alignment(void) const\r
+{\r
+ return this->_alignment;\r
+}\r
+\r
+\r
+size_t NCVMemNativeAllocator::maxSize(void) const\r
+{\r
+ return this->_maxSize;\r
+}\r
+\r
+\r
+//===================================================================\r
+//\r
+// Time and timer routines\r
+//\r
+//===================================================================\r
+\r
+\r
+typedef struct _NcvTimeMoment NcvTimeMoment;\r
+\r
+#if defined(_WIN32) || defined(_WIN64)\r
+\r
+ #include <Windows.h>\r
+\r
+ typedef struct _NcvTimeMoment\r
+ {\r
+ LONGLONG moment, freq;\r
+ } NcvTimeMoment;\r
+\r
+\r
+ static void _ncvQueryMoment(NcvTimeMoment *t)\r
+ {\r
+ QueryPerformanceFrequency((LARGE_INTEGER *)&(t->freq));\r
+ QueryPerformanceCounter((LARGE_INTEGER *)&(t->moment));\r
+ }\r
+\r
+\r
+ double _ncvMomentToMicroseconds(NcvTimeMoment *t)\r
+ {\r
+ return 1000000.0 * t->moment / t->freq;\r
+ }\r
+\r
+\r
+ double _ncvMomentsDiffToMicroseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)\r
+ {\r
+ return 1000000.0 * 2 * ((t2->moment) - (t1->moment)) / (t1->freq + t2->freq);\r
+ }\r
+\r
+\r
+ double _ncvMomentsDiffToMilliseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)\r
+ {\r
+ return 1000.0 * 2 * ((t2->moment) - (t1->moment)) / (t1->freq + t2->freq);\r
+ }\r
+\r
+#elif defined(__unix__)\r
+\r
+ #include <sys/time.h>\r
+\r
+ typedef struct _NcvTimeMoment\r
+ {\r
+ struct timeval tv; \r
+ struct timezone tz;\r
+ } NcvTimeMoment;\r
+\r
+\r
+ void _ncvQueryMoment(NcvTimeMoment *t)\r
+ {\r
+ gettimeofday(& t->tv, & t->tz);\r
+ }\r
+\r
+\r
+ double _ncvMomentToMicroseconds(NcvTimeMoment *t)\r
+ {\r
+ return 1000000.0 * t->tv.tv_sec + (double)t->tv.tv_usec;\r
+ }\r
+\r
+\r
+ double _ncvMomentsDiffToMicroseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)\r
+ {\r
+ return (((double)t2->tv.tv_sec - (double)t1->tv.tv_sec) * 1000000 + (double)t2->tv.tv_usec - (double)t1->tv.tv_usec);\r
+    }\r
+\r
+\r
+    //Converts a pair of time moments into the elapsed time in milliseconds\r
+    double _ncvMomentsDiffToMilliseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)\r
+    {\r
+        return (((double)t2->tv.tv_sec - (double)t1->tv.tv_sec) * 1000 + ((double)t2->tv.tv_usec - (double)t1->tv.tv_usec) / 1000);\r
+    }\r
+\r
+\r
+#endif //#if defined(_WIN32) || defined(_WIN64)\r
+\r
+\r
+struct _NcvTimer\r
+{\r
+ NcvTimeMoment t1, t2;\r
+};\r
+\r
+\r
+NcvTimer ncvStartTimer(void)\r
+{\r
+ struct _NcvTimer *t;\r
+ t = (struct _NcvTimer *)malloc(sizeof(struct _NcvTimer));\r
+ _ncvQueryMoment(&t->t1);\r
+ return t;\r
+}\r
+\r
+\r
+double ncvEndQueryTimerUs(NcvTimer t)\r
+{\r
+ double res;\r
+ _ncvQueryMoment(&t->t2);\r
+ res = _ncvMomentsDiffToMicroseconds(&t->t1, &t->t2);\r
+ free(t);\r
+ return res;\r
+}\r
+\r
+\r
+double ncvEndQueryTimerMs(NcvTimer t)\r
+{\r
+ double res;\r
+ _ncvQueryMoment(&t->t2);\r
+ res = _ncvMomentsDiffToMilliseconds(&t->t1, &t->t2);\r
+ free(t);\r
+ return res;\r
+}\r
+\r
+#endif /* !defined (HAVE_CUDA) */
\ No newline at end of file
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. \r
+// \r
+// By downloading, copying, installing or using the software you agree to this license.\r
+// If you do not agree to this license, do not download, install,\r
+// copy or use the software.\r
+//\r
+//\r
+// License Agreement\r
+// For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+// * Redistribution's of source code must retain the above copyright notice,\r
+// this list of conditions and the following disclaimer.\r
+//\r
+// * Redistribution's in binary form must reproduce the above copyright notice,\r
+// this list of conditions and the following disclaimer in the documentation\r
+// and/or other materials provided with the distribution.\r
+//\r
+// * The name of the copyright holders may not be used to endorse or promote products\r
+// derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+#ifndef _ncv_hpp_\r
+#define _ncv_hpp_\r
+\r
+#include <cuda_runtime.h>\r
+#include "npp_staging.h"\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Alignment macros\r
+//\r
+//==============================================================================\r
+\r
+\r
+#if !defined(__align__) && !defined(__CUDACC__)\r
+ #if defined(_WIN32) || defined(_WIN64)\r
+ #define __align__(n) __declspec(align(n))\r
+ #elif defined(__unix__)\r
+ #define __align__(n) __attribute__((__aligned__(n)))\r
+ #endif\r
+#endif\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Integral and compound types of guaranteed size\r
+//\r
+//==============================================================================\r
+\r
+\r
+typedef bool NcvBool;\r
+typedef long long Ncv64s;\r
+typedef unsigned long long Ncv64u;\r
+typedef int Ncv32s;\r
+typedef unsigned int Ncv32u;\r
+typedef short Ncv16s;\r
+typedef unsigned short Ncv16u;\r
+typedef signed char Ncv8s;\r
+typedef unsigned char Ncv8u;\r
+typedef float Ncv32f;\r
+typedef double Ncv64f;\r
+\r
+\r
+typedef struct\r
+{\r
+ Ncv8u x;\r
+ Ncv8u y;\r
+ Ncv8u width;\r
+ Ncv8u height;\r
+} NcvRect8u;\r
+\r
+\r
+typedef struct\r
+{\r
+ Ncv32s x; ///< x-coordinate of upper left corner.\r
+ Ncv32s y; ///< y-coordinate of upper left corner.\r
+ Ncv32s width; ///< Rectangle width.\r
+ Ncv32s height; ///< Rectangle height.\r
+} NcvRect32s;\r
+\r
+\r
+typedef struct\r
+{\r
+ Ncv32u x; ///< x-coordinate of upper left corner.\r
+ Ncv32u y; ///< y-coordinate of upper left corner.\r
+ Ncv32u width; ///< Rectangle width.\r
+ Ncv32u height; ///< Rectangle height.\r
+} NcvRect32u;\r
+\r
+\r
+typedef struct \r
+{\r
+ Ncv32s width; ///< Rectangle width.\r
+ Ncv32s height; ///< Rectangle height.\r
+} NcvSize32s;\r
+\r
+\r
+typedef struct \r
+{\r
+ Ncv32u width; ///< Rectangle width.\r
+ Ncv32u height; ///< Rectangle height.\r
+} NcvSize32u;\r
+\r
+\r
+NPPST_CT_ASSERT(sizeof(NcvBool) <= 4);\r
+NPPST_CT_ASSERT(sizeof(Ncv64s) == 8);\r
+NPPST_CT_ASSERT(sizeof(Ncv64u) == 8);\r
+NPPST_CT_ASSERT(sizeof(Ncv32s) == 4);\r
+NPPST_CT_ASSERT(sizeof(Ncv32u) == 4);\r
+NPPST_CT_ASSERT(sizeof(Ncv16s) == 2);\r
+NPPST_CT_ASSERT(sizeof(Ncv16u) == 2);\r
+NPPST_CT_ASSERT(sizeof(Ncv8s) == 1);\r
+NPPST_CT_ASSERT(sizeof(Ncv8u) == 1);\r
+NPPST_CT_ASSERT(sizeof(Ncv32f) == 4);\r
+NPPST_CT_ASSERT(sizeof(Ncv64f) == 8);\r
+NPPST_CT_ASSERT(sizeof(NcvRect8u) == sizeof(Ncv32u));\r
+NPPST_CT_ASSERT(sizeof(NcvRect32s) == 4 * sizeof(Ncv32s));\r
+NPPST_CT_ASSERT(sizeof(NcvRect32u) == 4 * sizeof(Ncv32u));\r
+NPPST_CT_ASSERT(sizeof(NcvSize32u) == 2 * sizeof(Ncv32u));\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Persistent constants\r
+//\r
+//==============================================================================\r
+\r
+\r
+const Ncv32u K_WARP_SIZE = 32;\r
+const Ncv32u K_LOG2_WARP_SIZE = 5;\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Error handling\r
+//\r
+//==============================================================================\r
+\r
+\r
+#define NCV_CT_PREP_STRINGIZE_AUX(x) #x\r
+#define NCV_CT_PREP_STRINGIZE(x) NCV_CT_PREP_STRINGIZE_AUX(x)\r
+\r
+\r
+void ncvDebugOutput(const char *msg, ...);\r
+\r
+\r
+typedef void NCVDebugOutputHandler(const char* msg);\r
+\r
+\r
+void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);\r
+\r
+\r
+#define ncvAssertPrintCheck(pred, msg) \\r
+ ((pred) ? true : (ncvDebugOutput("\n%s\n", \\r
+ "NCV Assertion Failed: " msg ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__) \\r
+ ), false))\r
+\r
+\r
+#define ncvAssertPrintReturn(pred, msg, err) \\r
+ if (ncvAssertPrintCheck(pred, msg)) ; else return err\r
+\r
+\r
+#define ncvAssertReturn(pred, err) \\r
+ do \\r
+ { \\r
+ if (!(pred)) \\r
+ { \\r
+ ncvDebugOutput("\n%s%d%s\n", "NCV Assertion Failed: retcode=", (int)err, ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \\r
+ return err; \\r
+ } \\r
+ } while (0)\r
+\r
+\r
+#define ncvAssertReturnNcvStat(ncvOp) \\r
+ do \\r
+ { \\r
+ NCVStatus _ncvStat = ncvOp; \\r
+ if (NCV_SUCCESS != _ncvStat) \\r
+ { \\r
+ ncvDebugOutput("\n%s%d%s\n", "NCV Assertion Failed: NcvStat=", (int)_ncvStat, ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \\r
+ return _ncvStat; \\r
+ } \\r
+ } while (0)\r
+\r
+\r
+#define ncvAssertCUDAReturn(cudacall, errCode) \\r
+ do \\r
+ { \\r
+ cudaError_t resCall = cudacall; \\r
+ cudaError_t resGLE = cudaGetLastError(); \\r
+ if (cudaSuccess != resCall || cudaSuccess != resGLE) \\r
+ { \\r
+ ncvDebugOutput("\n%s%d%s\n", "NCV CUDA Assertion Failed: cudaError_t=", (int)(resCall | resGLE), ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \\r
+ return errCode; \\r
+ } \\r
+ } while (0)\r
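A short illustration of how these assertion macros compose inside a function returning NCVStatus (the function and its arguments here are illustrative only, not part of the API):

NCVStatus copyToDevice(void *d_dst, const void *h_src, size_t sz)
{
    ncvAssertReturn(d_dst != NULL && h_src != NULL, NCV_NULL_PTR);
    ncvAssertCUDAReturn(cudaMemcpy(d_dst, h_src, sz, cudaMemcpyHostToDevice),
                        NCV_CUDA_ERROR);    //checks both the call result and cudaGetLastError()
    return NCV_SUCCESS;
}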
+\r
+\r
+/**\r
+* Return-codes for status notification, errors and warnings\r
+*/\r
+enum NCVStatus\r
+{\r
+ NCV_SUCCESS,\r
+\r
+ NCV_CUDA_ERROR,\r
+ NCV_NPP_ERROR,\r
+ NCV_FILE_ERROR,\r
+\r
+ NCV_NULL_PTR,\r
+ NCV_INCONSISTENT_INPUT,\r
+ NCV_TEXTURE_BIND_ERROR,\r
+ NCV_DIMENSIONS_INVALID,\r
+\r
+ NCV_INVALID_ROI,\r
+ NCV_INVALID_STEP,\r
+ NCV_INVALID_SCALE,\r
+\r
+ NCV_ALLOCATOR_NOT_INITIALIZED,\r
+ NCV_ALLOCATOR_BAD_ALLOC,\r
+ NCV_ALLOCATOR_BAD_DEALLOC,\r
+ NCV_ALLOCATOR_INSUFFICIENT_CAPACITY,\r
+ NCV_ALLOCATOR_DEALLOC_ORDER,\r
+ NCV_ALLOCATOR_BAD_REUSE,\r
+\r
+ NCV_MEM_COPY_ERROR,\r
+ NCV_MEM_RESIDENCE_ERROR,\r
+ NCV_MEM_INSUFFICIENT_CAPACITY,\r
+\r
+ NCV_HAAR_INVALID_PIXEL_STEP,\r
+ NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER,\r
+ NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE,\r
+ NCV_HAAR_TOO_LARGE_FEATURES,\r
+ NCV_HAAR_XML_LOADING_EXCEPTION,\r
+\r
+ NCV_NOIMPL_HAAR_TILTED_FEATURES,\r
+\r
+ NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW,\r
+};\r
+\r
+\r
+#define NCV_SET_SKIP_COND(x) \\r
+ bool __ncv_skip_cond = x\r
+\r
+\r
+#define NCV_RESET_SKIP_COND(x) \\r
+ __ncv_skip_cond = x\r
+\r
+\r
+#define NCV_SKIP_COND_BEGIN \\r
+ if (!__ncv_skip_cond) {\r
+\r
+\r
+#define NCV_SKIP_COND_END \\r
+ }\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Timer\r
+//\r
+//==============================================================================\r
+\r
+\r
+typedef struct _NcvTimer *NcvTimer;\r
+\r
+NcvTimer ncvStartTimer(void);\r
+\r
+double ncvEndQueryTimerUs(NcvTimer t);\r
+\r
+double ncvEndQueryTimerMs(NcvTimer t);\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Memory management classes template compound types\r
+//\r
+//==============================================================================\r
+\r
+\r
+/**\r
+* Alignment of GPU memory chunks in bytes\r
+*/\r
+NCVStatus GPUAlignmentValue(Ncv32u &alignment);\r
+\r
+\r
+/**\r
+* Calculates the aligned top bound value\r
+*/\r
+Ncv32u alignUp(Ncv32u what, Ncv32u alignment);\r
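For instance, alignUp rounds up to the next multiple of a power-of-two alignment by masking: alignUp(10, 8) computes (10 + 7) & ~7 = 16, while an already-aligned value is returned unchanged, alignUp(16, 8) = 16.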
+\r
+\r
+/**\r
+* NCVMemoryType\r
+*/\r
+enum NCVMemoryType\r
+{\r
+ NCVMemoryTypeNone,\r
+ NCVMemoryTypeHostPageable,\r
+ NCVMemoryTypeHostPinned,\r
+ NCVMemoryTypeDevice\r
+};\r
+\r
+\r
+/**\r
+* NCVMemPtr\r
+*/\r
+struct NCVMemPtr\r
+{\r
+ void *ptr;\r
+ NCVMemoryType memtype;\r
+ void clear();\r
+};\r
+\r
+\r
+/**\r
+* NCVMemSegment\r
+*/\r
+struct NCVMemSegment\r
+{\r
+ NCVMemPtr begin;\r
+ size_t size;\r
+ void clear();\r
+};\r
+\r
+\r
+/**\r
+* INCVMemAllocator (Interface)\r
+*/\r
+class INCVMemAllocator\r
+{\r
+public:\r
+ virtual ~INCVMemAllocator() = 0;\r
+\r
+ virtual NCVStatus alloc(NCVMemSegment &seg, size_t size) = 0;\r
+ virtual NCVStatus dealloc(NCVMemSegment &seg) = 0;\r
+\r
+ virtual NcvBool isInitialized(void) const = 0;\r
+ virtual NcvBool isCounting(void) const = 0;\r
+ \r
+ virtual NCVMemoryType memType(void) const = 0;\r
+ virtual Ncv32u alignment(void) const = 0;\r
+ virtual size_t maxSize(void) const = 0;\r
+};\r
+\r
+inline INCVMemAllocator::~INCVMemAllocator() {}\r
+\r
+\r
+/**\r
+* NCVMemStackAllocator\r
+*/\r
+class NCVMemStackAllocator : public INCVMemAllocator\r
+{\r
+ NCVMemStackAllocator();\r
+ NCVMemStackAllocator(const NCVMemStackAllocator &);\r
+\r
+public:\r
+\r
+ explicit NCVMemStackAllocator(Ncv32u alignment);\r
+ NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment);\r
+ virtual ~NCVMemStackAllocator();\r
+\r
+ virtual NCVStatus alloc(NCVMemSegment &seg, size_t size);\r
+ virtual NCVStatus dealloc(NCVMemSegment &seg);\r
+\r
+ virtual NcvBool isInitialized(void) const;\r
+ virtual NcvBool isCounting(void) const;\r
+\r
+ virtual NCVMemoryType memType(void) const;\r
+ virtual Ncv32u alignment(void) const;\r
+ virtual size_t maxSize(void) const;\r
+\r
+private:\r
+\r
+ NCVMemoryType _memType;\r
+ Ncv32u _alignment;\r
+ Ncv8u *allocBegin;\r
+ Ncv8u *begin;\r
+ Ncv8u *end;\r
+ size_t currentSize;\r
+ size_t _maxSize;\r
+};\r
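NCVMemStackAllocator enforces stack (LIFO) discipline: dealloc checks that the segment being released is the most recently allocated one and returns NCV_ALLOCATOR_DEALLOC_ORDER otherwise. A minimal sketch, with stackAlloc standing for any constructed NCVMemStackAllocator:

NCVMemSegment segA, segB;
stackAlloc.alloc(segA, 1024);
stackAlloc.alloc(segB, 2048);
stackAlloc.dealloc(segB);    //top of the stack must go first
stackAlloc.dealloc(segA);    //reversing these two calls would fail the order check

In practice this ordering falls out naturally from C++ scoping when segments are owned by NCVVectorAlloc/NCVMatrixAlloc objects, since destructors run in reverse construction order.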
+\r
+\r
+/**\r
+* NCVMemNativeAllocator\r
+*/\r
+class NCVMemNativeAllocator : public INCVMemAllocator\r
+{\r
+public:\r
+\r
+ NCVMemNativeAllocator(NCVMemoryType memT);\r
+ virtual ~NCVMemNativeAllocator();\r
+\r
+ virtual NCVStatus alloc(NCVMemSegment &seg, size_t size);\r
+ virtual NCVStatus dealloc(NCVMemSegment &seg);\r
+\r
+ virtual NcvBool isInitialized(void) const;\r
+ virtual NcvBool isCounting(void) const;\r
+\r
+ virtual NCVMemoryType memType(void) const;\r
+ virtual Ncv32u alignment(void) const;\r
+ virtual size_t maxSize(void) const;\r
+\r
+private:\r
+\r
+ NCVMemNativeAllocator();\r
+ NCVMemNativeAllocator(const NCVMemNativeAllocator &);\r
+\r
+ NCVMemoryType _memType;\r
+ Ncv32u _alignment;\r
+ size_t currentSize;\r
+ size_t _maxSize;\r
+};\r
+\r
+\r
+/**\r
+* Copy dispatcher\r
+*/\r
+NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType,\r
+ const void *src, NCVMemoryType srcType,\r
+ size_t sz, cudaStream_t cuStream);\r
+\r
+\r
+/**\r
+* NCVVector (1D)\r
+*/\r
+template <class T>\r
+class NCVVector\r
+{\r
+ NCVVector(const NCVVector &);\r
+\r
+public:\r
+\r
+ NCVVector()\r
+ {\r
+ clear();\r
+ }\r
+\r
+ virtual ~NCVVector() {}\r
+\r
+ void clear()\r
+ {\r
+ _ptr = NULL;\r
+ _length = 0;\r
+ _memtype = NCVMemoryTypeNone;\r
+ }\r
+\r
+ NCVStatus copySolid(NCVVector<T> &dst, cudaStream_t cuStream, size_t howMuch=0)\r
+ {\r
+ if (howMuch == 0)\r
+ {\r
+ ncvAssertReturn(dst._length == this->_length, NCV_MEM_COPY_ERROR);\r
+ howMuch = this->_length * sizeof(T);\r
+ }\r
+ else\r
+ {\r
+ ncvAssertReturn(dst._length * sizeof(T) >= howMuch && \r
+ this->_length * sizeof(T) >= howMuch &&\r
+ howMuch > 0, NCV_MEM_COPY_ERROR);\r
+ }\r
+ ncvAssertReturn((this->_ptr != NULL || this->_memtype == NCVMemoryTypeNone) && \r
+ (dst._ptr != NULL || dst._memtype == NCVMemoryTypeNone), NCV_NULL_PTR);\r
+\r
+ NCVStatus ncvStat = NCV_SUCCESS;\r
+ if (this->_memtype != NCVMemoryTypeNone)\r
+ {\r
+ ncvStat = memSegCopyHelper(dst._ptr, dst._memtype,\r
+ this->_ptr, this->_memtype,\r
+ howMuch, cuStream);\r
+ }\r
+\r
+ return ncvStat;\r
+ }\r
+\r
+ T *ptr() const {return this->_ptr;}\r
+ size_t length() const {return this->_length;}\r
+ NCVMemoryType memType() const {return this->_memtype;}\r
+\r
+protected:\r
+\r
+ T *_ptr;\r
+ size_t _length;\r
+ NCVMemoryType _memtype;\r
+};\r
+\r
+\r
+/**\r
+* NCVVectorAlloc\r
+*/\r
+template <class T>\r
+class NCVVectorAlloc : public NCVVector<T>\r
+{\r
+ NCVVectorAlloc();\r
+ NCVVectorAlloc(const NCVVectorAlloc &);\r
+\r
+public:\r
+\r
+ NCVVectorAlloc(INCVMemAllocator &allocator, Ncv32u length)\r
+ :\r
+ allocator(allocator)\r
+ {\r
+ NCVStatus ncvStat;\r
+\r
+ this->clear();\r
+ this->allocatedMem.clear();\r
+\r
+ ncvStat = allocator.alloc(this->allocatedMem, length * sizeof(T));\r
+ ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "NCVVectorAlloc ctor:: alloc failed", );\r
+\r
+ this->_ptr = (T *)this->allocatedMem.begin.ptr;\r
+ this->_length = length;\r
+ this->_memtype = this->allocatedMem.begin.memtype;\r
+ }\r
+\r
+\r
+ ~NCVVectorAlloc()\r
+ {\r
+ NCVStatus ncvStat;\r
+\r
+ ncvStat = allocator.dealloc(this->allocatedMem);\r
+ ncvAssertPrintCheck(ncvStat == NCV_SUCCESS, "NCVVectorAlloc dtor:: dealloc failed");\r
+\r
+ this->clear();\r
+ }\r
+\r
+\r
+ NcvBool isMemAllocated() const\r
+ {\r
+ return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());\r
+ }\r
+\r
+\r
+ Ncv32u getAllocatorsAlignment() const\r
+ {\r
+ return allocator.alignment();\r
+ }\r
+\r
+\r
+ NCVMemSegment getSegment() const\r
+ {\r
+ return allocatedMem;\r
+ }\r
+\r
+private:\r
+\r
+ INCVMemAllocator &allocator;\r
+ NCVMemSegment allocatedMem;\r
+};\r
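A usage sketch in the idiom of the sample above, assuming cpuAllocator (pinned) and gpuAllocator (device) have already been constructed:

NCVVectorAlloc<Ncv32u> h_vec(cpuAllocator, 1000);
ncvAssertPrintReturn(h_vec.isMemAllocated(), "host alloc failed", -1);
NCVVectorAlloc<Ncv32u> d_vec(gpuAllocator, 1000);
ncvAssertPrintReturn(d_vec.isMemAllocated(), "device alloc failed", -1);
//... fill h_vec.ptr()[0 .. 999] on the host ...
NCVStatus ncvStat = h_vec.copySolid(d_vec, 0);    //stream 0: synchronous copy, dispatched via memSegCopyHelper
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "copy failed", -1);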
+\r
+\r
+/**\r
+* NCVVectorReuse\r
+*/\r
+template <class T>\r
+class NCVVectorReuse : public NCVVector<T>\r
+{\r
+ NCVVectorReuse();\r
+ NCVVectorReuse(const NCVVectorReuse &);\r
+\r
+public:\r
+\r
+ explicit NCVVectorReuse(const NCVMemSegment &memSegment)\r
+ {\r
+ this->bReused = false;\r
+ this->clear();\r
+\r
+ this->_length = memSegment.size / sizeof(T);\r
+ this->_ptr = (T *)memSegment.begin.ptr;\r
+ this->_memtype = memSegment.begin.memtype;\r
+\r
+ this->bReused = true;\r
+ }\r
+\r
+\r
+ NCVVectorReuse(const NCVMemSegment &memSegment, Ncv32u length)\r
+ {\r
+ this->bReused = false;\r
+ this->clear();\r
+\r
+ ncvAssertPrintReturn(length * sizeof(T) <= memSegment.size, \\r
+ "NCVVectorReuse ctor:: memory binding failed due to size mismatch", );\r
+\r
+ this->_length = length;\r
+ this->_ptr = (T *)memSegment.begin.ptr;\r
+ this->_memtype = memSegment.begin.memtype;\r
+\r
+ this->bReused = true;\r
+ }\r
+\r
+\r
+ NcvBool isMemReused() const\r
+ {\r
+ return this->bReused;\r
+ }\r
+\r
+private:\r
+\r
+ NcvBool bReused;\r
+};\r
+\r
+\r
+/**\r
+* NCVMatrix (2D)\r
+*/\r
+template <class T>\r
+class NCVMatrix\r
+{\r
+ NCVMatrix(const NCVMatrix &);\r
+\r
+public:\r
+\r
+ NCVMatrix()\r
+ {\r
+ clear();\r
+ }\r
+\r
+ virtual ~NCVMatrix() {}\r
+\r
+\r
+ void clear()\r
+ {\r
+ _ptr = NULL;\r
+ _pitch = 0;\r
+ _width = 0;\r
+ _height = 0;\r
+ _memtype = NCVMemoryTypeNone;\r
+ }\r
+\r
+\r
+ Ncv32u stride() const\r
+ {\r
+ return _pitch / sizeof(T);\r
+ }\r
+\r
+\r
+ NCVStatus copySolid(NCVMatrix<T> &dst, cudaStream_t cuStream, size_t howMuch=0)\r
+ {\r
+ if (howMuch == 0)\r
+ {\r
+ ncvAssertReturn(dst._pitch == this->_pitch &&\r
+ dst._height == this->_height, NCV_MEM_COPY_ERROR);\r
+ howMuch = this->_pitch * this->_height;\r
+ }\r
+ else\r
+ {\r
+ ncvAssertReturn(dst._pitch * dst._height >= howMuch && \r
+ this->_pitch * this->_height >= howMuch &&\r
+ howMuch > 0, NCV_MEM_COPY_ERROR);\r
+ }\r
+ ncvAssertReturn((this->_ptr != NULL || this->_memtype == NCVMemoryTypeNone) && \r
+ (dst._ptr != NULL || dst._memtype == NCVMemoryTypeNone), NCV_NULL_PTR);\r
+\r
+ NCVStatus ncvStat = NCV_SUCCESS;\r
+ if (this->_memtype != NCVMemoryTypeNone)\r
+ {\r
+ ncvStat = memSegCopyHelper(dst._ptr, dst._memtype, \r
+ this->_ptr, this->_memtype, \r
+ howMuch, cuStream);\r
+ }\r
+\r
+ return ncvStat;\r
+ }\r
+\r
+ T *ptr() const {return this->_ptr;}\r
+ Ncv32u width() const {return this->_width;}\r
+ Ncv32u height() const {return this->_height;}\r
+ Ncv32u pitch() const {return this->_pitch;}\r
+ NCVMemoryType memType() const {return this->_memtype;}\r
+\r
+protected:\r
+\r
+ T *_ptr;\r
+ Ncv32u _width;\r
+ Ncv32u _height;\r
+ Ncv32u _pitch;\r
+ NCVMemoryType _memtype;\r
+};\r
+\r
+\r
+/**\r
+* NCVMatrixAlloc\r
+*/\r
+template <class T>\r
+class NCVMatrixAlloc : public NCVMatrix<T>\r
+{\r
+ NCVMatrixAlloc();\r
+ NCVMatrixAlloc(const NCVMatrixAlloc &);\r
+\r
+public:\r
+\r
+ NCVMatrixAlloc(INCVMemAllocator &allocator, Ncv32u width, Ncv32u height, Ncv32u pitch=0)\r
+ :\r
+ allocator(allocator)\r
+ {\r
+ NCVStatus ncvStat;\r
+\r
+ this->clear();\r
+ this->allocatedMem.clear();\r
+\r
+ Ncv32u widthBytes = width * sizeof(T);\r
+ Ncv32u pitchBytes = alignUp(widthBytes, allocator.alignment());\r
+\r
+ if (pitch != 0)\r
+ {\r
+ ncvAssertPrintReturn(pitch >= pitchBytes &&\r
+ (pitch & (allocator.alignment() - 1)) == 0,\r
+ "NCVMatrixAlloc ctor:: incorrect pitch passed", );\r
+ pitchBytes = pitch;\r
+ }\r
+\r
+ Ncv32u requiredAllocSize = pitchBytes * height;\r
+\r
+ ncvStat = allocator.alloc(this->allocatedMem, requiredAllocSize);\r
+ ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "NCVMatrixAlloc ctor:: alloc failed", );\r
+\r
+ this->_ptr = (T *)this->allocatedMem.begin.ptr;\r
+ this->_width = width;\r
+ this->_height = height;\r
+ this->_pitch = pitchBytes;\r
+ this->_memtype = this->allocatedMem.begin.memtype;\r
+ }\r
+\r
+ ~NCVMatrixAlloc()\r
+ {\r
+ NCVStatus ncvStat;\r
+\r
+ ncvStat = allocator.dealloc(this->allocatedMem);\r
+ ncvAssertPrintCheck(ncvStat == NCV_SUCCESS, "NCVMatrixAlloc dtor:: dealloc failed");\r
+\r
+ this->clear();\r
+ }\r
+\r
+\r
+ NcvBool isMemAllocated() const\r
+ {\r
+ return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());\r
+ }\r
+\r
+\r
+ Ncv32u getAllocatorsAlignment() const\r
+ {\r
+ return allocator.alignment();\r
+ }\r
+\r
+\r
+ NCVMemSegment getSegment() const\r
+ {\r
+ return allocatedMem;\r
+ }\r
+\r
+private:\r
+\r
+ INCVMemAllocator &allocator;\r
+ NCVMemSegment allocatedMem;\r
+};\r
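A worked example of the pitch logic: for a 100-column NCVMatrixAlloc<Ncv32f> with a 512-byte allocator alignment, widthBytes = 100 * 4 = 400, pitchBytes = alignUp(400, 512) = 512, and stride() = 512 / sizeof(Ncv32f) = 128 elements per row; row-indexing code must therefore step by stride() (or pitch() in bytes), never by width().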
+\r
+\r
+/**\r
+* NCVMatrixReuse\r
+*/\r
+template <class T>\r
+class NCVMatrixReuse : public NCVMatrix<T>\r
+{\r
+ NCVMatrixReuse();\r
+ NCVMatrixReuse(const NCVMatrixReuse &);\r
+\r
+public:\r
+\r
+ NCVMatrixReuse(const NCVMemSegment &memSegment, Ncv32u alignment, Ncv32u width, Ncv32u height, Ncv32u pitch=0, NcvBool bSkipPitchCheck=false)\r
+ {\r
+ this->bReused = false;\r
+ this->clear();\r
+\r
+ Ncv32u widthBytes = width * sizeof(T);\r
+ Ncv32u pitchBytes = alignUp(widthBytes, alignment);\r
+\r
+ if (pitch != 0)\r
+ {\r
+ if (!bSkipPitchCheck)\r
+ {\r
+ ncvAssertPrintReturn(pitch >= pitchBytes &&\r
+ (pitch & (alignment - 1)) == 0,\r
+ "NCVMatrixReuse ctor:: incorrect pitch passed", );\r
+ }\r
+ else\r
+ {\r
+ ncvAssertPrintReturn(pitch >= widthBytes, "NCVMatrixReuse ctor:: incorrect pitch passed", );\r
+ }\r
+ pitchBytes = pitch;\r
+ }\r
+\r
+ ncvAssertPrintReturn(pitchBytes * height <= memSegment.size, \\r
+ "NCVMatrixReuse ctor:: memory binding failed due to size mismatch", );\r
+\r
+ this->_width = width;\r
+ this->_height = height;\r
+ this->_pitch = pitchBytes;\r
+ this->_ptr = (T *)memSegment.begin.ptr;\r
+ this->_memtype = memSegment.begin.memtype;\r
+\r
+ this->bReused = true;\r
+ }\r
+\r
+\r
+ NcvBool isMemReused() const\r
+ {\r
+ return this->bReused;\r
+ }\r
+\r
+private:\r
+\r
+ NcvBool bReused;\r
+};\r
+\r
+#endif // _ncv_hpp_\r
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. \r
+// \r
+// By downloading, copying, installing or using the software you agree to this license.\r
+// If you do not agree to this license, do not download, install,\r
+// copy or use the software.\r
+//\r
+//\r
+// License Agreement\r
+// For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+// * Redistribution's of source code must retain the above copyright notice,\r
+// this list of conditions and the following disclaimer.\r
+//\r
+// * Redistribution's in binary form must reproduce the above copyright notice,\r
+// this list of conditions and the following disclaimer in the documentation\r
+// and/or other materials provided with the distribution.\r
+//\r
+// * The name of the copyright holders may not be used to endorse or promote products\r
+// derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// NVIDIA CUDA implementation of Viola-Jones Object Detection Framework\r
+//\r
+// The algorithm and code are explained in the upcoming GPU Computing Gems\r
+// chapter in detail:\r
+//\r
+// Anton Obukhov, "Haar Classifiers for Object Detection with CUDA"\r
+// PDF URL placeholder\r
+// email: aobukhov@nvidia.com, devsupport@nvidia.com\r
+//\r
+// Credits for help with the code to:\r
+// Alexey Mendelenko, Cyril Crassin, and Mikhail Smirnov.\r
+//\r
+////////////////////////////////////////////////////////////////////////////////\r
+\r
+#include <algorithm>\r
+\r
+#include "npp.h"\r
+#include "NCV.hpp"\r
+#include "NCVRuntimeTemplates.hpp"\r
+#include "NCVHaarObjectDetection.hpp"\r
+\r
+void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights);\r
+\r
+\r
+//==============================================================================\r
+//\r
+// BlockScan file\r
+//\r
+//==============================================================================\r
+\r
+\r
+//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()\r
+//assuming size <= WARP_SIZE and size is power of 2\r
+template <class T>\r
+inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)\r
+{\r
+ Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));\r
+ s_Data[pos] = 0;\r
+ pos += K_WARP_SIZE;\r
+ s_Data[pos] = idata;\r
+\r
+ for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)\r
+ {\r
+ s_Data[pos] += s_Data[pos - offset];\r
+ }\r
+\r
+ return s_Data[pos];\r
+}\r
+\r
+\r
+template <class T>\r
+inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)\r
+{\r
+ return warpScanInclusive(idata, s_Data) - idata;\r
+}\r
+\r
+\r
+template <class T, Ncv32u tiNumScanThreads>\r
+inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)\r
+{\r
+ if (tiNumScanThreads > K_WARP_SIZE)\r
+ {\r
+ //Bottom-level inclusive warp scan\r
+ T warpResult = warpScanInclusive(idata, s_Data);\r
+\r
+ //Save top elements of each warp for exclusive warp scan\r
+ //sync to wait for warp scans to complete (because s_Data is being overwritten)\r
+ __syncthreads();\r
+ if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )\r
+ {\r
+ s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;\r
+ }\r
+\r
+ //wait for warp scans to complete\r
+ __syncthreads();\r
+\r
+ if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )\r
+ {\r
+ //grab top warp elements\r
+ T val = s_Data[threadIdx.x];\r
+ //calculate exclusive scan and write back to shared memory\r
+ s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);\r
+ }\r
+\r
+ //return updated warp scans with exclusive scan results\r
+ __syncthreads();\r
+ return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];\r
+ }\r
+ else\r
+ {\r
+ return warpScanInclusive(idata, s_Data);\r
+ }\r
+}\r
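As a concrete check of these primitives: for a toy 4-thread input [3, 1, 4, 1], warpScanInclusive produces the running totals [3, 4, 8, 9], and warpScanExclusive shifts them to [0, 3, 4, 8] by subtracting each thread's own input. Zeroing s_Data below pos is what makes the unrolled loop branch-free: reads at pos - offset that would otherwise fall out of range land on the zeroed half instead of requiring a bounds check.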
+\r
+\r
+//==============================================================================\r
+//\r
+// HaarClassifierCascade file\r
+//\r
+//==============================================================================\r
+\r
+\r
+const Ncv32u MAX_GRID_DIM = 65535;\r
+\r
+\r
+const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64;\r
+\r
+\r
+#define NUM_THREADS_CLASSIFIERPARALLEL_LOG2 6\r
+#define NUM_THREADS_CLASSIFIERPARALLEL (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2)\r
+\r
+\r
+/** \internal\r
+* Haar features solid array.\r
+*/\r
+texture<uint2, 1, cudaReadModeElementType> texHaarFeatures;\r
+\r
+\r
+/** \internal\r
+* Haar classifiers flattened trees container.\r
+* Two parts: the first contains root nodes, the second contains nodes referred to by the root nodes.\r
+* Drawback: breaks tree locality (might cause more cache misses)\r
+* Advantage: No need to introduce additional 32-bit field to index root nodes offsets\r
+*/\r
+texture<uint4, 1, cudaReadModeElementType> texHaarClassifierNodes;\r
+\r
+\r
+texture<Ncv32u, 1, cudaReadModeElementType> texIImage;\r
+\r
+\r
+__device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)\r
+{\r
+ return d_Stages[iStage];\r
+}\r
+\r
+\r
+template <NcvBool tbCacheTextureCascade>\r
+__device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes)\r
+{\r
+ HaarClassifierNode128 tmpNode;\r
+ if (tbCacheTextureCascade)\r
+ {\r
+ tmpNode._ui4 = tex1Dfetch(texHaarClassifierNodes, iNode);\r
+ }\r
+ else\r
+ {\r
+ tmpNode = d_ClassifierNodes[iNode];\r
+ }\r
+ return tmpNode;\r
+}\r
+\r
+\r
+template <NcvBool tbCacheTextureCascade>\r
+__device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features, \r
+ Ncv32f *weight,\r
+ Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)\r
+{\r
+ HaarFeature64 feature;\r
+ if (tbCacheTextureCascade)\r
+ {\r
+ feature._ui2 = tex1Dfetch(texHaarFeatures, iFeature);\r
+ }\r
+ else\r
+ {\r
+ feature = d_Features[iFeature];\r
+ }\r
+ feature.getRect(rectX, rectY, rectWidth, rectHeight);\r
+ *weight = feature.getWeight();\r
+}\r
+\r
+\r
+template <NcvBool tbCacheTextureIImg>\r
+__device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)\r
+{\r
+ if (tbCacheTextureIImg)\r
+ {\r
+ return tex1Dfetch(texIImage, x);\r
+ }\r
+ else\r
+ {\r
+ return d_IImg[x];\r
+ }\r
+}\r
+\r
+\r
+__device__ Ncv32f reduceSpecialization(Ncv32f partialSum)\r
+{\r
+ __shared__ volatile Ncv32f reductor[NUM_THREADS_CLASSIFIERPARALLEL];\r
+ reductor[threadIdx.x] = partialSum;\r
+ __syncthreads();\r
+\r
+#if defined CPU_FP_COMPLIANCE\r
+ if (!threadIdx.x)\r
+ {\r
+ Ncv32f sum = 0.0f;\r
+ for (int i=0; i<NUM_THREADS_CLASSIFIERPARALLEL; i++)\r
+ {\r
+ sum += reductor[i];\r
+ }\r
+ reductor[0] = sum;\r
+ }\r
+#else\r
+\r
+#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 8\r
+ if (threadIdx.x < 128)\r
+ {\r
+ reductor[threadIdx.x] += reductor[threadIdx.x + 128]; \r
+ }\r
+ __syncthreads();\r
+#endif\r
+#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 7\r
+ if (threadIdx.x < 64)\r
+ {\r
+ reductor[threadIdx.x] += reductor[threadIdx.x + 64]; \r
+ }\r
+ __syncthreads();\r
+#endif\r
+\r
+ if (threadIdx.x < 32)\r
+ {\r
+#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 6\r
+ reductor[threadIdx.x] += reductor[threadIdx.x + 32];\r
+#endif\r
+#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 5\r
+ reductor[threadIdx.x] += reductor[threadIdx.x + 16];\r
+#endif\r
+ reductor[threadIdx.x] += reductor[threadIdx.x + 8];\r
+ reductor[threadIdx.x] += reductor[threadIdx.x + 4];\r
+ reductor[threadIdx.x] += reductor[threadIdx.x + 2];\r
+ reductor[threadIdx.x] += reductor[threadIdx.x + 1];\r
+ }\r
+#endif\r
+\r
+ __syncthreads();\r
+\r
+ return reductor[0];\r
+}\r
+\r
+\r
+__device__ Ncv32u d_outMaskPosition;\r
+\r
+\r
+__inline __device__ void compactBlockWriteOutAnchorParallel(NcvBool threadPassFlag,\r
+ Ncv32u threadElem,\r
+ Ncv32u *vectorOut)\r
+{\r
+#if __CUDA_ARCH__ >= 110\r
+ Ncv32u passMaskElem = threadPassFlag ? 1 : 0;\r
+ __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2];\r
+ Ncv32u incScan = blockScanInclusive<Ncv32u, NUM_THREADS_ANCHORSPARALLEL>(passMaskElem, shmem);\r
+ __syncthreads();\r
+ Ncv32u excScan = incScan - passMaskElem;\r
+\r
+ __shared__ Ncv32u numPassed;\r
+ __shared__ Ncv32u outMaskOffset;\r
+ if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)\r
+ {\r
+ numPassed = incScan;\r
+ outMaskOffset = atomicAdd(&d_outMaskPosition, incScan);\r
+ }\r
+ __syncthreads();\r
+\r
+ if (threadPassFlag)\r
+ {\r
+ shmem[excScan] = threadElem;\r
+ }\r
+ __syncthreads();\r
+\r
+ if (threadIdx.x < numPassed)\r
+ {\r
+ vectorOut[outMaskOffset + threadIdx.x] = shmem[threadIdx.x];\r
+ }\r
+#endif\r
+}\r
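To trace the compaction once: with per-thread pass flags [1, 0, 1, 1], the inclusive scan gives [1, 1, 2, 3], so the last thread reserves 3 output slots with a single atomicAdd on d_outMaskPosition, and the passing threads write their elements at exclusive-scan offsets 0, 1 and 2 inside that reservation. The surviving anchors thus end up densely packed in vectorOut with no host round-trip.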
+\r
+\r
+template <NcvBool tbInitMaskPositively,\r
+ NcvBool tbCacheTextureIImg,\r
+ NcvBool tbCacheTextureCascade,\r
+ NcvBool tbReadPixelIndexFromVector,\r
+ NcvBool tbDoAtomicCompaction>\r
+__global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride,\r
+ Ncv32f *d_weights, Ncv32u weightsStride,\r
+ HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,\r
+ Ncv32u *d_inMask, Ncv32u *d_outMask,\r
+ Ncv32u mask1Dlen, Ncv32u mask2Dstride,\r
+ NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)\r
+{\r
+ Ncv32u y_offs;\r
+ Ncv32u x_offs;\r
+ Ncv32u maskOffset;\r
+ Ncv32u outMaskVal;\r
+\r
+ NcvBool bInactiveThread = false;\r
+\r
+ if (tbReadPixelIndexFromVector)\r
+ {\r
+ maskOffset = (MAX_GRID_DIM * blockIdx.y + blockIdx.x) * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;\r
+\r
+ if (maskOffset >= mask1Dlen)\r
+ {\r
+ if (tbDoAtomicCompaction) bInactiveThread = true; else return;\r
+ }\r
+\r
+        if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))\r
+ {\r
+ outMaskVal = d_inMask[maskOffset];\r
+ y_offs = outMaskVal >> 16;\r
+ x_offs = outMaskVal & 0xFFFF;\r
+ }\r
+ }\r
+ else\r
+ {\r
+ y_offs = blockIdx.y;\r
+ x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;\r
+\r
+ if (x_offs >= mask2Dstride)\r
+ {\r
+ if (tbDoAtomicCompaction) bInactiveThread = true; else return;\r
+ }\r
+\r
+        if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))\r
+ {\r
+ maskOffset = y_offs * mask2Dstride + x_offs;\r
+\r
+ if ((x_offs >= anchorsRoi.width) ||\r
+ (!tbInitMaskPositively &&\r
+ d_inMask != d_outMask &&\r
+ d_inMask[maskOffset] == OBJDET_MASK_ELEMENT_INVALID_32U))\r
+ {\r
+ if (tbDoAtomicCompaction) \r
+ {\r
+ bInactiveThread = true; \r
+ }\r
+ else\r
+ {\r
+ d_outMask[maskOffset] = OBJDET_MASK_ELEMENT_INVALID_32U;\r
+ return;\r
+ }\r
+ }\r
+\r
+ outMaskVal = (y_offs << 16) | x_offs;\r
+ }\r
+ }\r
+\r
+ NcvBool bPass = true;\r
+\r
+    if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))\r
+ {\r
+ Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs];\r
+\r
+ for (Ncv32u iStage = startStageInc; iStage<endStageExc; iStage++)\r
+ {\r
+ Ncv32f curStageSum = 0.0f;\r
+\r
+ HaarStage64 curStage = getStage(iStage, d_Stages);\r
+ Ncv32u numRootNodesInStage = curStage.getNumClassifierRootNodes();\r
+ Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset();\r
+ Ncv32f stageThreshold = curStage.getStageThreshold();\r
+\r
+ while (numRootNodesInStage--)\r
+ {\r
+ NcvBool bMoreNodesToTraverse = true;\r
+ Ncv32u iNode = curRootNodeOffset;\r
+\r
+ while (bMoreNodesToTraverse)\r
+ {\r
+ HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);\r
+ HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();\r
+ Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();\r
+ Ncv32u iFeature = featuresDesc.getFeaturesOffset();\r
+\r
+ Ncv32f curNodeVal = 0.0f;\r
+\r
+ for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)\r
+ {\r
+ Ncv32f rectWeight;\r
+ Ncv32u rectX, rectY, rectWidth, rectHeight;\r
+ getFeature<tbCacheTextureCascade>\r
+ (iFeature + iRect, d_Features,\r
+ &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);\r
+\r
+ Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);\r
+ Ncv32u iioffsTR = iioffsTL + rectWidth;\r
+ Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;\r
+ Ncv32u iioffsBR = iioffsBL + rectWidth;\r
+\r
+ Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -\r
+ getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +\r
+ getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -\r
+ getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);\r
+\r
+#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY\r
+ curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);\r
+#else\r
+ curNodeVal += (Ncv32f)rectSum * rectWeight;\r
+#endif\r
+ }\r
+\r
+ HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();\r
+ HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();\r
+ Ncv32f nodeThreshold = curNode.getThreshold();\r
+ HaarClassifierNodeDescriptor32 nextNodeDescriptor;\r
+ nextNodeDescriptor = (curNodeVal < scaleArea * pixelStdDev * nodeThreshold) ? nodeLeft : nodeRight;\r
+\r
+ if (nextNodeDescriptor.isLeaf())\r
+ {\r
+ Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();\r
+ curStageSum += tmpLeafValue;\r
+ bMoreNodesToTraverse = false;\r
+ }\r
+ else\r
+ {\r
+ iNode = nextNodeDescriptor.getNextNodeOffset();\r
+ }\r
+ }\r
+\r
+ __syncthreads();\r
+ curRootNodeOffset++;\r
+ }\r
+\r
+ if (curStageSum < stageThreshold)\r
+ {\r
+ bPass = false;\r
+ outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;\r
+ break;\r
+ }\r
+ }\r
+ }\r
+\r
+ __syncthreads();\r
+\r
+ if (!tbDoAtomicCompaction)\r
+ {\r
+ if (!tbReadPixelIndexFromVector ||\r
+ (tbReadPixelIndexFromVector && (!bPass || d_inMask != d_outMask)))\r
+ {\r
+ d_outMask[maskOffset] = outMaskVal;\r
+ }\r
+ }\r
+ else\r
+ {\r
+ compactBlockWriteOutAnchorParallel(bPass && !bInactiveThread,\r
+ outMaskVal,\r
+ d_outMask);\r
+ }\r
+}\r
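Note the mask encoding used throughout: an anchor's coordinates are packed into a single 32-bit element as (y_offs << 16) | x_offs, which is why anchor positions are bounded by 65535 in each dimension. For example, the anchor (x = 5, y = 3) is stored as 0x00030005; the classifier-parallel kernel below unpacks it again with y_offs = outMaskVal >> 16 and x_offs = outMaskVal & 0xFFFF.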
+\r
+\r
+template <NcvBool tbCacheTextureIImg,\r
+ NcvBool tbCacheTextureCascade,\r
+ NcvBool tbDoAtomicCompaction>\r
+__global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride,\r
+ Ncv32f *d_weights, Ncv32u weightsStride,\r
+ HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,\r
+ Ncv32u *d_inMask, Ncv32u *d_outMask,\r
+ Ncv32u mask1Dlen, Ncv32u mask2Dstride,\r
+ NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)\r
+{\r
+ Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x;\r
+\r
+ if (maskOffset >= mask1Dlen)\r
+ {\r
+ return;\r
+ }\r
+\r
+ Ncv32u outMaskVal = d_inMask[maskOffset];\r
+ Ncv32u y_offs = outMaskVal >> 16;\r
+ Ncv32u x_offs = outMaskVal & 0xFFFF;\r
+\r
+ Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs];\r
+ NcvBool bPass = true;\r
+\r
+ for (Ncv32u iStage = startStageInc; iStage<endStageExc; iStage++)\r
+ {\r
+ //this variable is subject to reduction\r
+ Ncv32f curStageSum = 0.0f;\r
+\r
+ HaarStage64 curStage = getStage(iStage, d_Stages);\r
+ Ncv32s numRootNodesInStage = curStage.getNumClassifierRootNodes();\r
+ Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset() + threadIdx.x;\r
+ Ncv32f stageThreshold = curStage.getStageThreshold();\r
+\r
+ Ncv32u numRootChunks = (numRootNodesInStage + NUM_THREADS_CLASSIFIERPARALLEL - 1) >> NUM_THREADS_CLASSIFIERPARALLEL_LOG2;\r
+\r
+ for (Ncv32u chunkId=0; chunkId<numRootChunks; chunkId++)\r
+ {\r
+ NcvBool bMoreNodesToTraverse = true;\r
+\r
+ if (chunkId * NUM_THREADS_CLASSIFIERPARALLEL + threadIdx.x < numRootNodesInStage)\r
+ {\r
+ Ncv32u iNode = curRootNodeOffset;\r
+\r
+ while (bMoreNodesToTraverse)\r
+ {\r
+ HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);\r
+ HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();\r
+ Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();\r
+ Ncv32u iFeature = featuresDesc.getFeaturesOffset();\r
+\r
+ Ncv32f curNodeVal = 0.0f;\r
+ //TODO: fetch into shmem if size suffices. Shmem can be shared with reduce\r
+ for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)\r
+ {\r
+ Ncv32f rectWeight;\r
+ Ncv32u rectX, rectY, rectWidth, rectHeight;\r
+ getFeature<tbCacheTextureCascade>\r
+ (iFeature + iRect, d_Features,\r
+ &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);\r
+\r
+ Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);\r
+ Ncv32u iioffsTR = iioffsTL + rectWidth;\r
+ Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;\r
+ Ncv32u iioffsBR = iioffsBL + rectWidth;\r
+\r
+ Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -\r
+ getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +\r
+ getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -\r
+ getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);\r
+\r
+#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY\r
+ curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);\r
+#else\r
+ curNodeVal += (Ncv32f)rectSum * rectWeight;\r
+#endif\r
+ }\r
+\r
+ HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();\r
+ HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();\r
+ Ncv32f nodeThreshold = curNode.getThreshold();\r
+ HaarClassifierNodeDescriptor32 nextNodeDescriptor;\r
+ nextNodeDescriptor = (curNodeVal < scaleArea * pixelStdDev * nodeThreshold) ? nodeLeft : nodeRight;\r
+\r
+ if (nextNodeDescriptor.isLeaf())\r
+ {\r
+ Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();\r
+ curStageSum += tmpLeafValue;\r
+ bMoreNodesToTraverse = false;\r
+ }\r
+ else\r
+ {\r
+ iNode = nextNodeDescriptor.getNextNodeOffset();\r
+ }\r
+ }\r
+ }\r
+ __syncthreads();\r
+\r
+ curRootNodeOffset += NUM_THREADS_CLASSIFIERPARALLEL;\r
+ }\r
+\r
+ Ncv32f finalStageSum = reduceSpecialization(curStageSum);\r
+\r
+ if (finalStageSum < stageThreshold)\r
+ {\r
+ bPass = false;\r
+ outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;\r
+ break;\r
+ }\r
+ }\r
+\r
+ if (!tbDoAtomicCompaction)\r
+ {\r
+ if (!bPass || d_inMask != d_outMask)\r
+ {\r
+ if (!threadIdx.x)\r
+ {\r
+ d_outMask[maskOffset] = outMaskVal;\r
+ }\r
+ }\r
+ }\r
+ else\r
+ {\r
+#if __CUDA_ARCH__ >= 110\r
+ if (bPass && !threadIdx.x)\r
+ {\r
+ Ncv32u outMaskOffset = atomicAdd(&d_outMaskPosition, 1);\r
+ d_outMask[outMaskOffset] = outMaskVal;\r
+ }\r
+#endif\r
+ }\r
+}\r
+\r
+\r
+template <NcvBool tbMaskByInmask,\r
+ NcvBool tbDoAtomicCompaction>\r
+__global__ void initializeMaskVector(Ncv32u *d_inMask, Ncv32u *d_outMask,\r
+ Ncv32u mask1Dlen, Ncv32u mask2Dstride,\r
+ NcvSize32u anchorsRoi, Ncv32u step)\r
+{\r
+ Ncv32u y_offs = blockIdx.y;\r
+ Ncv32u x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;\r
+ Ncv32u outMaskOffset = y_offs * gridDim.x * blockDim.x + x_offs;\r
+\r
+ Ncv32u y_offs_upsc = step * y_offs;\r
+ Ncv32u x_offs_upsc = step * x_offs;\r
+ Ncv32u inMaskOffset = y_offs_upsc * mask2Dstride + x_offs_upsc;\r
+\r
+ Ncv32u outElem = OBJDET_MASK_ELEMENT_INVALID_32U;\r
+\r
+ if (x_offs_upsc < anchorsRoi.width &&\r
+ (!tbMaskByInmask || d_inMask[inMaskOffset] != OBJDET_MASK_ELEMENT_INVALID_32U))\r
+ {\r
+ outElem = (y_offs_upsc << 16) | x_offs_upsc;\r
+ }\r
+\r
+ if (!tbDoAtomicCompaction)\r
+ {\r
+ d_outMask[outMaskOffset] = outElem;\r
+ }\r
+ else\r
+ {\r
+ compactBlockWriteOutAnchorParallel(outElem != OBJDET_MASK_ELEMENT_INVALID_32U,\r
+ outElem,\r
+ d_outMask);\r
+ }\r
+}\r
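+\r
+//Mask element layout used throughout the detector: a valid element packs its anchor\r
+//coordinates as (y << 16) | x (both must therefore fit in 16 bits), and\r
+//OBJDET_MASK_ELEMENT_INVALID_32U marks a dropped anchor. E.g. the anchor at\r
+//x=300, y=40 is stored as 0x0028012C.\r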
+\r
+\r
+struct applyHaarClassifierAnchorParallelFunctor\r
+{\r
+ dim3 gridConf, blockConf;\r
+ cudaStream_t cuStream;\r
+\r
+ //Kernel arguments are stored as members\r
+ Ncv32u *d_IImg;\r
+ Ncv32u IImgStride;\r
+ Ncv32f *d_weights;\r
+ Ncv32u weightsStride;\r
+ HaarFeature64 *d_Features;\r
+ HaarClassifierNode128 *d_ClassifierNodes;\r
+ HaarStage64 *d_Stages;\r
+ Ncv32u *d_inMask;\r
+ Ncv32u *d_outMask;\r
+ Ncv32u mask1Dlen;\r
+ Ncv32u mask2Dstride;\r
+ NcvSize32u anchorsRoi;\r
+ Ncv32u startStageInc;\r
+ Ncv32u endStageExc;\r
+ Ncv32f scaleArea;\r
+\r
+ //Arguments are passed through the constructor\r
+ applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,\r
+ Ncv32u *_d_IImg, Ncv32u _IImgStride,\r
+ Ncv32f *_d_weights, Ncv32u _weightsStride,\r
+ HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,\r
+ Ncv32u *_d_inMask, Ncv32u *_d_outMask,\r
+ Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,\r
+ NcvSize32u _anchorsRoi, Ncv32u _startStageInc,\r
+ Ncv32u _endStageExc, Ncv32f _scaleArea) :\r
+ gridConf(_gridConf),\r
+ blockConf(_blockConf),\r
+ cuStream(_cuStream),\r
+ d_IImg(_d_IImg),\r
+ IImgStride(_IImgStride),\r
+ d_weights(_d_weights),\r
+ weightsStride(_weightsStride),\r
+ d_Features(_d_Features),\r
+ d_ClassifierNodes(_d_ClassifierNodes),\r
+ d_Stages(_d_Stages),\r
+ d_inMask(_d_inMask),\r
+ d_outMask(_d_outMask),\r
+ mask1Dlen(_mask1Dlen),\r
+ mask2Dstride(_mask2Dstride),\r
+ anchorsRoi(_anchorsRoi),\r
+ startStageInc(_startStageInc),\r
+ endStageExc(_endStageExc),\r
+ scaleArea(_scaleArea)\r
+ {}\r
+\r
+ template<class TList>\r
+ void call(TList tl)\r
+ {\r
+ applyHaarClassifierAnchorParallel <\r
+ Loki::TL::TypeAt<TList, 0>::Result::value,\r
+ Loki::TL::TypeAt<TList, 1>::Result::value,\r
+ Loki::TL::TypeAt<TList, 2>::Result::value,\r
+ Loki::TL::TypeAt<TList, 3>::Result::value,\r
+ Loki::TL::TypeAt<TList, 4>::Result::value >\r
+ <<<gridConf, blockConf, 0, cuStream>>>\r
+ (d_IImg, IImgStride,\r
+ d_weights, weightsStride,\r
+ d_Features, d_ClassifierNodes, d_Stages,\r
+ d_inMask, d_outMask,\r
+ mask1Dlen, mask2Dstride,\r
+ anchorsRoi, startStageInc,\r
+ endStageExc, scaleArea);\r
+ }\r
+};\r
+\r
+\r
+void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,\r
+ NcvBool tbCacheTextureIImg,\r
+ NcvBool tbCacheTextureCascade,\r
+ NcvBool tbReadPixelIndexFromVector,\r
+ NcvBool tbDoAtomicCompaction,\r
+\r
+ dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,\r
+\r
+ Ncv32u *d_IImg, Ncv32u IImgStride,\r
+ Ncv32f *d_weights, Ncv32u weightsStride,\r
+ HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,\r
+ Ncv32u *d_inMask, Ncv32u *d_outMask,\r
+ Ncv32u mask1Dlen, Ncv32u mask2Dstride,\r
+ NcvSize32u anchorsRoi, Ncv32u startStageInc,\r
+ Ncv32u endStageExc, Ncv32f scaleArea)\r
+{\r
+ //Second parameter is the number of "dynamic" template parameters\r
+ NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>\r
+ ::call( applyHaarClassifierAnchorParallelFunctor(gridConf, blockConf, cuStream,\r
+ d_IImg, IImgStride,\r
+ d_weights, weightsStride,\r
+ d_Features, d_ClassifierNodes, d_Stages,\r
+ d_inMask, d_outMask,\r
+ mask1Dlen, mask2Dstride,\r
+ anchorsRoi, startStageInc,\r
+ endStageExc, scaleArea),\r
+ 0xC001C0DE, //this is a dummy int to satisfy the va_args C calling convention\r
+ tbInitMaskPositively,\r
+ tbCacheTextureIImg,\r
+ tbCacheTextureCascade,\r
+ tbReadPixelIndexFromVector,\r
+ tbDoAtomicCompaction);\r
+}\r
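+\r
+//How the DynTemplate dispatch above works (a sketch; see NCVRuntimeTemplateBool.hpp\r
+//for the authoritative implementation): KernelCaller recursively converts each\r
+//runtime NcvBool into a compile-time Loki::Int2Type entry of the typelist, so a call\r
+//with flags (true, false, true, false, true) ends up launching the one instantiation\r
+//of applyHaarClassifierAnchorParallel whose five template booleans match those flags.\r
+//This trades 2^5 = 32 kernel instantiations for branch-free device code.\r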
+\r
+\r
+struct applyHaarClassifierClassifierParallelFunctor\r
+{\r
+ dim3 gridConf, blockConf;\r
+ cudaStream_t cuStream;\r
+\r
+ //Kernel arguments are stored as members\r
+ Ncv32u *d_IImg;\r
+ Ncv32u IImgStride;\r
+ Ncv32f *d_weights;\r
+ Ncv32u weightsStride;\r
+ HaarFeature64 *d_Features;\r
+ HaarClassifierNode128 *d_ClassifierNodes;\r
+ HaarStage64 *d_Stages;\r
+ Ncv32u *d_inMask;\r
+ Ncv32u *d_outMask;\r
+ Ncv32u mask1Dlen;\r
+ Ncv32u mask2Dstride;\r
+ NcvSize32u anchorsRoi;\r
+ Ncv32u startStageInc;\r
+ Ncv32u endStageExc;\r
+ Ncv32f scaleArea;\r
+\r
+ //Arguments are passed through the constructor\r
+ applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,\r
+ Ncv32u *_d_IImg, Ncv32u _IImgStride,\r
+ Ncv32f *_d_weights, Ncv32u _weightsStride,\r
+ HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,\r
+ Ncv32u *_d_inMask, Ncv32u *_d_outMask,\r
+ Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,\r
+ NcvSize32u _anchorsRoi, Ncv32u _startStageInc,\r
+ Ncv32u _endStageExc, Ncv32f _scaleArea) :\r
+ gridConf(_gridConf),\r
+ blockConf(_blockConf),\r
+ cuStream(_cuStream),\r
+ d_IImg(_d_IImg),\r
+ IImgStride(_IImgStride),\r
+ d_weights(_d_weights),\r
+ weightsStride(_weightsStride),\r
+ d_Features(_d_Features),\r
+ d_ClassifierNodes(_d_ClassifierNodes),\r
+ d_Stages(_d_Stages),\r
+ d_inMask(_d_inMask),\r
+ d_outMask(_d_outMask),\r
+ mask1Dlen(_mask1Dlen),\r
+ mask2Dstride(_mask2Dstride),\r
+ anchorsRoi(_anchorsRoi),\r
+ startStageInc(_startStageInc),\r
+ endStageExc(_endStageExc),\r
+ scaleArea(_scaleArea)\r
+ {}\r
+\r
+ template<class TList>\r
+ void call(TList tl)\r
+ {\r
+ applyHaarClassifierClassifierParallel <\r
+ Loki::TL::TypeAt<TList, 0>::Result::value,\r
+ Loki::TL::TypeAt<TList, 1>::Result::value,\r
+ Loki::TL::TypeAt<TList, 2>::Result::value >\r
+ <<<gridConf, blockConf, 0, cuStream>>>\r
+ (d_IImg, IImgStride,\r
+ d_weights, weightsStride,\r
+ d_Features, d_ClassifierNodes, d_Stages,\r
+ d_inMask, d_outMask,\r
+ mask1Dlen, mask2Dstride,\r
+ anchorsRoi, startStageInc,\r
+ endStageExc, scaleArea);\r
+ }\r
+};\r
+\r
+\r
+void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg,\r
+ NcvBool tbCacheTextureCascade,\r
+ NcvBool tbDoAtomicCompaction,\r
+\r
+ dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,\r
+\r
+ Ncv32u *d_IImg, Ncv32u IImgStride,\r
+ Ncv32f *d_weights, Ncv32u weightsStride,\r
+ HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,\r
+ Ncv32u *d_inMask, Ncv32u *d_outMask,\r
+ Ncv32u mask1Dlen, Ncv32u mask2Dstride,\r
+ NcvSize32u anchorsRoi, Ncv32u startStageInc,\r
+ Ncv32u endStageExc, Ncv32f scaleArea)\r
+{\r
+ //Second parameter is the number of "dynamic" template parameters\r
+ NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>\r
+ ::call( applyHaarClassifierClassifierParallelFunctor(gridConf, blockConf, cuStream,\r
+ d_IImg, IImgStride,\r
+ d_weights, weightsStride,\r
+ d_Features, d_ClassifierNodes, d_Stages,\r
+ d_inMask, d_outMask,\r
+ mask1Dlen, mask2Dstride,\r
+ anchorsRoi, startStageInc,\r
+ endStageExc, scaleArea),\r
+ 0xC001C0DE, //this is a dummy int to satisfy the va_args C calling convention\r
+ tbCacheTextureIImg,\r
+ tbCacheTextureCascade,\r
+ tbDoAtomicCompaction);\r
+}\r
+\r
+\r
+struct initializeMaskVectorFunctor\r
+{\r
+ dim3 gridConf, blockConf;\r
+ cudaStream_t cuStream;\r
+\r
+ //Kernel arguments are stored as members\r
+ Ncv32u *d_inMask;\r
+ Ncv32u *d_outMask;\r
+ Ncv32u mask1Dlen;\r
+ Ncv32u mask2Dstride;\r
+ NcvSize32u anchorsRoi;\r
+ Ncv32u step;\r
+\r
+ //Arguments are passed through the constructor\r
+ initializeMaskVectorFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,\r
+ Ncv32u *_d_inMask, Ncv32u *_d_outMask,\r
+ Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,\r
+ NcvSize32u _anchorsRoi, Ncv32u _step) :\r
+ gridConf(_gridConf),\r
+ blockConf(_blockConf),\r
+ cuStream(_cuStream),\r
+ d_inMask(_d_inMask),\r
+ d_outMask(_d_outMask),\r
+ mask1Dlen(_mask1Dlen),\r
+ mask2Dstride(_mask2Dstride),\r
+ anchorsRoi(_anchorsRoi),\r
+ step(_step)\r
+ {}\r
+\r
+ template<class TList>\r
+ void call(TList tl)\r
+ {\r
+ initializeMaskVector <\r
+ Loki::TL::TypeAt<TList, 0>::Result::value,\r
+ Loki::TL::TypeAt<TList, 1>::Result::value >\r
+ <<<gridConf, blockConf, 0, cuStream>>>\r
+ (d_inMask, d_outMask,\r
+ mask1Dlen, mask2Dstride,\r
+ anchorsRoi, step);\r
+ }\r
+};\r
+\r
+\r
+void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask,\r
+ NcvBool tbDoAtomicCompaction,\r
+\r
+ dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,\r
+\r
+ Ncv32u *d_inMask, Ncv32u *d_outMask,\r
+ Ncv32u mask1Dlen, Ncv32u mask2Dstride,\r
+ NcvSize32u anchorsRoi, Ncv32u step)\r
+{\r
+ //Second parameter is the number of "dynamic" template parameters\r
+ NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, initializeMaskVectorFunctor>\r
+ ::call( initializeMaskVectorFunctor(gridConf, blockConf, cuStream,\r
+ d_inMask, d_outMask,\r
+ mask1Dlen, mask2Dstride,\r
+ anchorsRoi, step),\r
+ 0xC001C0DE, //this is a dummy int to satisfy the va_args C calling convention\r
+ tbMaskByInmask,\r
+ tbDoAtomicCompaction);\r
+}\r
+\r
+\r
+Ncv32u getStageNumWithNotLessThanNclassifiers(Ncv32u N, HaarClassifierCascadeDescriptor &haar,\r
+ NCVVector<HaarStage64> &h_HaarStages)\r
+{\r
+ Ncv32u i = 0;\r
+ for (; i<haar.NumStages; i++)\r
+ {\r
+ if (h_HaarStages.ptr()[i].getNumClassifierRootNodes() >= N)\r
+ {\r
+ break;\r
+ }\r
+ }\r
+ return i;\r
+}\r
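+\r
+//Example: for a cascade whose stages hold {3, 9, 18, 150, ...} classifier root nodes\r
+//and N == NUM_THREADS_CLASSIFIERPARALLEL, this returns the index of the first stage\r
+//with at least N root nodes; from that stage on the classifier-parallel kernel keeps\r
+//a whole block busy, while earlier stages run anchor-parallel.\r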
+\r
+\r
+template <class T>\r
+void swap(T &p1, T &p2)\r
+{\r
+ T tmp = p1;\r
+ p1 = p2;\r
+ p2 = tmp;\r
+}\r
+\r
+\r
+NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,\r
+ NCVMatrix<Ncv32f> &d_weights,\r
+ NCVMatrixAlloc<Ncv32u> &d_pixelMask,\r
+ Ncv32u &numDetections,\r
+ HaarClassifierCascadeDescriptor &haar,\r
+ NCVVector<HaarStage64> &h_HaarStages,\r
+ NCVVector<HaarStage64> &d_HaarStages,\r
+ NCVVector<HaarClassifierNode128> &d_HaarNodes,\r
+ NCVVector<HaarFeature64> &d_HaarFeatures,\r
+ NcvBool bMaskElements,\r
+ NcvSize32u anchorsRoi,\r
+ Ncv32u pixelStep,\r
+ Ncv32f scaleArea,\r
+ INCVMemAllocator &gpuAllocator,\r
+ INCVMemAllocator &cpuAllocator,\r
+ Ncv32u devPropMajor,\r
+ Ncv32u devPropMinor,\r
+ cudaStream_t cuStream)\r
+{\r
+ ncvAssertReturn(d_integralImage.memType() == d_weights.memType() &&\r
+ d_integralImage.memType() == d_pixelMask.memType() &&\r
+ d_integralImage.memType() == gpuAllocator.memType() &&\r
+ (d_integralImage.memType() == NCVMemoryTypeDevice ||\r
+ d_integralImage.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);\r
+ ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&\r
+ d_HaarStages.memType() == d_HaarFeatures.memType() &&\r
+ (d_HaarStages.memType() == NCVMemoryTypeDevice ||\r
+ d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);\r
+ ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);\r
+ ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);\r
+ ncvAssertReturn((d_integralImage.ptr() != NULL && d_weights.ptr() != NULL && d_pixelMask.ptr() != NULL &&\r
+ h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&\r
+ d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);\r
+ ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&\r
+ d_pixelMask.width() >= anchorsRoi.width && d_pixelMask.height() >= anchorsRoi.height &&\r
+ d_weights.width() >= anchorsRoi.width && d_weights.height() >= anchorsRoi.height &&\r
+ d_integralImage.width() >= anchorsRoi.width + haar.ClassifierSize.width &&\r
+ d_integralImage.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);\r
+ ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);\r
+ ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&\r
+ d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&\r
+ d_HaarFeatures.length() >= haar.NumFeatures &&\r
+ d_HaarStages.length() == h_HaarStages.length() &&\r
+ haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);\r
+ ncvAssertReturn(haar.bNeedsTiltedII == false || gpuAllocator.isCounting(), NCV_NOIMPL_HAAR_TILTED_FEATURES);\r
+ ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);\r
+\r
+ NCV_SET_SKIP_COND(gpuAllocator.isCounting());\r
+\r
+#if defined _SELF_TEST_\r
+\r
+ NCVStatus ncvStat;\r
+\r
+ NCVMatrixAlloc<Ncv32u> h_integralImage(cpuAllocator, d_integralImage.width(), d_integralImage.height(), d_integralImage.pitch());\r
+ ncvAssertReturn(h_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+ NCVMatrixAlloc<Ncv32f> h_weights(cpuAllocator, d_weights.width(), d_weights.height(), d_weights.pitch());\r
+ ncvAssertReturn(h_weights.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+ NCVMatrixAlloc<Ncv32u> h_pixelMask(cpuAllocator, d_pixelMask.width(), d_pixelMask.height(), d_pixelMask.pitch());\r
+ ncvAssertReturn(h_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+ NCVVectorAlloc<HaarClassifierNode128> h_HaarNodes(cpuAllocator, d_HaarNodes.length());\r
+ ncvAssertReturn(h_HaarNodes.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+ NCVVectorAlloc<HaarFeature64> h_HaarFeatures(cpuAllocator, d_HaarFeatures.length());\r
+ ncvAssertReturn(h_HaarFeatures.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+ NCVMatrixAlloc<Ncv32u> h_pixelMask_d(cpuAllocator, d_pixelMask.width(), d_pixelMask.height(), d_pixelMask.pitch());\r
+ ncvAssertReturn(h_pixelMask_d.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+ NCV_SKIP_COND_BEGIN\r
+\r
+ ncvStat = d_pixelMask.copySolid(h_pixelMask, 0);\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ ncvStat = d_integralImage.copySolid(h_integralImage, 0);\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ ncvStat = d_weights.copySolid(h_weights, 0);\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ ncvStat = d_HaarNodes.copySolid(h_HaarNodes, 0);\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ ncvStat = d_HaarFeatures.copySolid(h_HaarFeatures, 0);\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);\r
+\r
+ for (Ncv32u i=0; i<(Ncv32u)anchorsRoi.height; i++)\r
+ {\r
+ for (Ncv32u j=0; j<d_pixelMask.stride(); j++)\r
+ {\r
+ if ((i%pixelStep==0) && (j%pixelStep==0) && (j<(Ncv32u)anchorsRoi.width))\r
+ {\r
+ if (!bMaskElements || h_pixelMask.ptr()[i*d_pixelMask.stride()+j] != OBJDET_MASK_ELEMENT_INVALID_32U)\r
+ {\r
+ h_pixelMask.ptr()[i*d_pixelMask.stride()+j] = (i << 16) | j;\r
+ }\r
+ }\r
+ else\r
+ {\r
+ h_pixelMask.ptr()[i*d_pixelMask.stride()+j] = OBJDET_MASK_ELEMENT_INVALID_32U;\r
+ }\r
+ }\r
+ }\r
+\r
+ NCV_SKIP_COND_END\r
+\r
+#endif\r
+\r
+ NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment(), anchorsRoi.height * d_pixelMask.stride());\r
+ ncvAssertReturn(d_vecPixelMask.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);\r
+\r
+ NCVVectorAlloc<Ncv32u> d_vecPixelMaskTmp(gpuAllocator, d_vecPixelMask.length());\r
+ ncvAssertReturn(d_vecPixelMaskTmp.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+ NCVVectorAlloc<Ncv32u> hp_pool32u(cpuAllocator, 2);\r
+ ncvAssertReturn(hp_pool32u.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+ Ncv32u *hp_zero = &hp_pool32u.ptr()[0];\r
+ Ncv32u *hp_numDet = &hp_pool32u.ptr()[1];\r
+\r
+ NCV_SKIP_COND_BEGIN\r
+ *hp_zero = 0;\r
+ *hp_numDet = 0;\r
+ NCV_SKIP_COND_END\r
+\r
+ Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *\r
+ (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));\r
+\r
+ NcvBool bTexCacheCascade = devPropMajor < 2;\r
+ NcvBool bTexCacheIImg = true; //this works better even on Fermi so far\r
+ NcvBool bDoAtomicCompaction = devPropMajor >= 2 || (devPropMajor == 1 && devPropMinor >= 3);\r
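+ //Atomic compaction relies on global-memory atomicAdd, available from SM 1.1\r
+ //(hence the __CUDA_ARCH__ >= 110 guard in the kernels); it is enabled only from\r
+ //SM 1.3 up, presumably because that is where it becomes a clear performance win.\r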
+\r
+ NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;\r
+ NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;\r
+\r
+ Ncv32u szNppCompactTmpBuf;\r
+ ncvAssertReturn(NPP_SUCCESS == nppsStCompactGetSize_32u(d_vecPixelMask.length(), &szNppCompactTmpBuf), NCV_NPP_ERROR);\r
+ if (bDoAtomicCompaction)\r
+ {\r
+ szNppCompactTmpBuf = 0;\r
+ }\r
+ NCVVectorAlloc<Ncv8u> d_tmpBufCompact(gpuAllocator, szNppCompactTmpBuf);\r
+ ncvAssertReturn(bDoAtomicCompaction || d_tmpBufCompact.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+ NCV_SKIP_COND_BEGIN\r
+\r
+ if (bTexCacheIImg)\r
+ {\r
+ cudaChannelFormatDesc cfdTexIImage;\r
+ cfdTexIImage = cudaCreateChannelDesc<Ncv32u>();\r
+\r
+ size_t alignmentOffset;\r
+ ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, d_integralImage.ptr(), cfdTexIImage,\r
+ (anchorsRoi.height + haar.ClassifierSize.height) * d_integralImage.pitch()), NCV_CUDA_ERROR);\r
+ ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);\r
+ }\r
+\r
+ if (bTexCacheCascade)\r
+ {\r
+ cudaChannelFormatDesc cfdTexHaarFeatures;\r
+ cudaChannelFormatDesc cfdTexHaarClassifierNodes;\r
+ cfdTexHaarFeatures = cudaCreateChannelDesc<uint2>();\r
+ cfdTexHaarClassifierNodes = cudaCreateChannelDesc<uint4>();\r
+\r
+ size_t alignmentOffset;\r
+ ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarFeatures,\r
+ d_HaarFeatures.ptr(), cfdTexHaarFeatures,sizeof(HaarFeature64) * haar.NumFeatures), NCV_CUDA_ERROR);\r
+ ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);\r
+ ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarClassifierNodes,\r
+ d_HaarNodes.ptr(), cfdTexHaarClassifierNodes, sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes), NCV_CUDA_ERROR);\r
+ ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);\r
+ }\r
+\r
+ Ncv32u stageStartAnchorParallel = 0;\r
+ Ncv32u stageMiddleSwitch = getStageNumWithNotLessThanNclassifiers(NUM_THREADS_CLASSIFIERPARALLEL,\r
+ haar, h_HaarStages);\r
+ Ncv32u stageEndClassifierParallel = haar.NumStages;\r
+ if (stageMiddleSwitch == 0)\r
+ {\r
+ stageMiddleSwitch = 1;\r
+ }\r
+\r
+ //create stages subdivision for pixel-parallel processing\r
+ const Ncv32u compactEveryNstage = bDoAtomicCompaction ? 7 : 1;\r
+ Ncv32u curStop = stageStartAnchorParallel;\r
+ std::vector<Ncv32u> pixParallelStageStops;\r
+ while (curStop < stageMiddleSwitch)\r
+ {\r
+ pixParallelStageStops.push_back(curStop);\r
+ curStop += compactEveryNstage;\r
+ }\r
+ if (curStop > compactEveryNstage && curStop - stageMiddleSwitch > compactEveryNstage / 2)\r
+ {\r
+ //rebalance: move the last stop to the midpoint between the previous stop\r
+ //and stageMiddleSwitch, so the final interval isn't degenerately short\r
+ pixParallelStageStops[pixParallelStageStops.size()-1] =\r
+ (stageMiddleSwitch + (curStop - 2 * compactEveryNstage)) / 2;\r
+ }\r
+ pixParallelStageStops.push_back(stageMiddleSwitch);\r
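+ //Worked example (with the rebalanced last stop above): stageMiddleSwitch == 15 and\r
+ //compactEveryNstage == 7 first yield stops {0, 7, 14}; the final interval 14..15\r
+ //would cover a single stage, so the last stop moves to (15 + 7) / 2 == 11 and the\r
+ //resulting sequence is {0, 7, 11, 15}.\r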
+ Ncv32u pixParallelStageStopsIndex = 0;\r
+\r
+ if (pixelStep != 1 || bMaskElements)\r
+ {\r
+ if (bDoAtomicCompaction)\r
+ {\r
+ ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),\r
+ 0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ }\r
+\r
+ dim3 gridInit((((anchorsRoi.width + pixelStep - 1) / pixelStep + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),\r
+ (anchorsRoi.height + pixelStep - 1) / pixelStep);\r
+ dim3 blockInit(NUM_THREADS_ANCHORSPARALLEL);\r
+\r
+ if (gridInit.x == 0 || gridInit.y == 0)\r
+ {\r
+ numDetections = 0;\r
+ return NCV_SUCCESS;\r
+ }\r
+\r
+ initializeMaskVectorDynTemplate(bMaskElements,\r
+ bDoAtomicCompaction,\r
+ gridInit, blockInit, cuStream,\r
+ d_ptrNowData->ptr(),\r
+ d_ptrNowTmp->ptr(),\r
+ d_vecPixelMask.length(), d_pixelMask.stride(),\r
+ anchorsRoi, pixelStep);\r
+ ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);\r
+\r
+ if (bDoAtomicCompaction)\r
+ {\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),\r
+ 0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ swap(d_ptrNowData, d_ptrNowTmp);\r
+ }\r
+ else\r
+ {\r
+ NppStStatus nppSt;\r
+ nppSt = nppsStCompact_32u(d_ptrNowTmp->ptr(), d_vecPixelMask.length(),\r
+ d_ptrNowData->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,\r
+ d_tmpBufCompact.ptr(), szNppCompactTmpBuf);\r
+ ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR);\r
+ }\r
+ numDetections = *hp_numDet;\r
+ }\r
+ else\r
+ {\r
+ //\r
+ // 1. Run the first classifier kernel: pixel-parallel over the raw anchor grid, for the first few stages\r
+ //\r
+\r
+ if (bDoAtomicCompaction)\r
+ {\r
+ ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),\r
+ 0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ }\r
+\r
+ dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),\r
+ anchorsRoi.height);\r
+ dim3 block1(NUM_THREADS_ANCHORSPARALLEL);\r
+ applyHaarClassifierAnchorParallelDynTemplate(\r
+ true, //tbInitMaskPositively\r
+ bTexCacheIImg, //tbCacheTextureIImg\r
+ bTexCacheCascade, //tbCacheTextureCascade\r
+ pixParallelStageStops[pixParallelStageStopsIndex] != 0,//tbReadPixelIndexFromVector\r
+ bDoAtomicCompaction, //tbDoAtomicCompaction\r
+ grid1,\r
+ block1,\r
+ cuStream,\r
+ d_integralImage.ptr(), d_integralImage.stride(),\r
+ d_weights.ptr(), d_weights.stride(),\r
+ d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),\r
+ d_ptrNowData->ptr(),\r
+ bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),\r
+ 0,\r
+ d_pixelMask.stride(),\r
+ anchorsRoi,\r
+ pixParallelStageStops[pixParallelStageStopsIndex],\r
+ pixParallelStageStops[pixParallelStageStopsIndex+1],\r
+ scaleAreaPixels);\r
+ ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);\r
+\r
+ if (bDoAtomicCompaction)\r
+ {\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),\r
+ 0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ }\r
+ else\r
+ {\r
+ NppStStatus nppSt;\r
+ nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), d_vecPixelMask.length(),\r
+ d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,\r
+ d_tmpBufCompact.ptr(), szNppCompactTmpBuf);\r
+ ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR);\r
+ }\r
+\r
+ swap(d_ptrNowData, d_ptrNowTmp);\r
+ numDetections = *hp_numDet;\r
+\r
+ pixParallelStageStopsIndex++;\r
+ }\r
+\r
+ //\r
+ // 2. Run pixel-parallel stages\r
+ //\r
+\r
+ for (; pixParallelStageStopsIndex < pixParallelStageStops.size()-1; pixParallelStageStopsIndex++)\r
+ {\r
+ if (numDetections == 0)\r
+ {\r
+ break;\r
+ }\r
+\r
+ if (bDoAtomicCompaction)\r
+ {\r
+ ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),\r
+ 0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ }\r
+\r
+ dim3 grid2((numDetections + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL);\r
+ if (numDetections > MAX_GRID_DIM)\r
+ {\r
+ grid2.x = MAX_GRID_DIM;\r
+ grid2.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;\r
+ }\r
+ dim3 block2(NUM_THREADS_ANCHORSPARALLEL);\r
+\r
+ applyHaarClassifierAnchorParallelDynTemplate(\r
+ false, //tbInitMaskPositively\r
+ bTexCacheIImg, //tbCacheTextureIImg\r
+ bTexCacheCascade, //tbCacheTextureCascade\r
+ pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements,//tbReadPixelIndexFromVector\r
+ bDoAtomicCompaction, //tbDoAtomicCompaction\r
+ grid2,\r
+ block2,\r
+ cuStream,\r
+ d_integralImage.ptr(), d_integralImage.stride(),\r
+ d_weights.ptr(), d_weights.stride(),\r
+ d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),\r
+ d_ptrNowData->ptr(),\r
+ bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),\r
+ numDetections,\r
+ d_pixelMask.stride(),\r
+ anchorsRoi,\r
+ pixParallelStageStops[pixParallelStageStopsIndex],\r
+ pixParallelStageStops[pixParallelStageStopsIndex+1],\r
+ scaleAreaPixels);\r
+ ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);\r
+\r
+ if (bDoAtomicCompaction)\r
+ {\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),\r
+ 0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ }\r
+ else\r
+ {\r
+ NppStStatus nppSt;\r
+ nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,\r
+ d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,\r
+ d_tmpBufCompact.ptr(), szNppCompactTmpBuf);\r
+ ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR);\r
+ }\r
+\r
+ swap(d_ptrNowData, d_ptrNowTmp);\r
+ numDetections = *hp_numDet;\r
+ }\r
+\r
+ //\r
+ // 3. Run all remaining stages in one classifier-parallel kernel\r
+ //\r
+\r
+ if (numDetections > 0 && stageMiddleSwitch < stageEndClassifierParallel)\r
+ {\r
+ if (bDoAtomicCompaction)\r
+ {\r
+ ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),\r
+ 0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ }\r
+\r
+ dim3 grid3(numDetections);\r
+ if (numDetections > MAX_GRID_DIM)\r
+ {\r
+ grid3.x = MAX_GRID_DIM;\r
+ grid3.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;\r
+ }\r
+ dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL);\r
+\r
+ applyHaarClassifierClassifierParallelDynTemplate(\r
+ bTexCacheIImg, //tbCacheTextureIImg\r
+ bTexCacheCascade, //tbCacheTextureCascade\r
+ bDoAtomicCompaction, //tbDoAtomicCompaction\r
+ grid3,\r
+ block3,\r
+ cuStream,\r
+ d_integralImage.ptr(), d_integralImage.stride(),\r
+ d_weights.ptr(), d_weights.stride(),\r
+ d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),\r
+ d_ptrNowData->ptr(),\r
+ bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),\r
+ numDetections,\r
+ d_pixelMask.stride(),\r
+ anchorsRoi,\r
+ stageMiddleSwitch,\r
+ stageEndClassifierParallel,\r
+ scaleAreaPixels);\r
+ ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);\r
+\r
+ if (bDoAtomicCompaction)\r
+ {\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),\r
+ 0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ }\r
+ else\r
+ {\r
+ NppStStatus nppSt;\r
+ nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,\r
+ d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,\r
+ d_tmpBufCompact.ptr(), szNppCompactTmpBuf);\r
+ ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR);\r
+ }\r
+\r
+ swap(d_ptrNowData, d_ptrNowTmp);\r
+ numDetections = *hp_numDet;\r
+ }\r
+\r
+ if (d_ptrNowData != &d_vecPixelMask)\r
+ {\r
+ d_vecPixelMaskTmp.copySolid(d_vecPixelMask, cuStream);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ }\r
+\r
+#if defined _SELF_TEST_\r
+\r
+ ncvStat = d_pixelMask.copySolid(h_pixelMask_d, 0);\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+\r
+ if (bDoAtomicCompaction)\r
+ {\r
+ std::sort(h_pixelMask_d.ptr(), h_pixelMask_d.ptr() + numDetections);\r
+ }\r
+\r
+ Ncv32u fpu_oldcw, fpu_cw;\r
+ _controlfp_s(&fpu_cw, 0, 0);\r
+ fpu_oldcw = fpu_cw;\r
+ _controlfp_s(&fpu_cw, _PC_24, _MCW_PC);\r
+ Ncv32u numDetGold;\r
+ ncvStat = ncvApplyHaarClassifierCascade_host(h_integralImage, h_weights, h_pixelMask, numDetGold, haar,\r
+ h_HaarStages, h_HaarNodes, h_HaarFeatures,\r
+ bMaskElements, anchorsRoi, pixelStep, scaleArea);\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ _controlfp_s(&fpu_cw, fpu_oldcw, _MCW_PC);\r
+\r
+ bool bPass = true;\r
+\r
+ if (numDetGold != numDetections)\r
+ {\r
+ printf("NCVHaarClassifierCascade::applyHaarClassifierCascade numdetections don't match: cpu=%d, gpu=%d\n", numDetGold, numDetections);\r
+ bPass = false;\r
+ }\r
+ else\r
+ {\r
+ for (Ncv32u i=0; i<std::max(numDetGold, numDetections) && bPass; i++)\r
+ {\r
+ if (h_pixelMask.ptr()[i] != h_pixelMask_d.ptr()[i])\r
+ {\r
+ printf("NCVHaarClassifierCascade::applyHaarClassifierCascade self test failed: i=%d, cpu=%d, gpu=%d\n", i, h_pixelMask.ptr()[i], h_pixelMask_d.ptr()[i]);\r
+ bPass = false;\r
+ }\r
+ }\r
+ }\r
+\r
+ printf("NCVHaarClassifierCascade::applyHaarClassifierCascade %s\n", bPass?"PASSED":"FAILED");\r
+#endif\r
+\r
+ NCV_SKIP_COND_END\r
+\r
+ return NCV_SUCCESS;\r
+}\r
+\r
+\r
+//==============================================================================\r
+//\r
+// HypothesesOperations file\r
+//\r
+//==============================================================================\r
+\r
+\r
+const Ncv32u NUM_GROW_THREADS = 128;\r
+\r
+\r
+__device__ __host__ NcvRect32u pixelToRect(Ncv32u pixel, Ncv32u width, Ncv32u height, Ncv32f scale)\r
+{\r
+ NcvRect32u res;\r
+ res.x = (Ncv32u)(scale * (pixel & 0xFFFF));\r
+ res.y = (Ncv32u)(scale * (pixel >> 16));\r
+ res.width = (Ncv32u)(scale * width);\r
+ res.height = (Ncv32u)(scale * height);\r
+ return res;\r
+}\r
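+\r
+//Example: pixel == 0x0028012C (y=40, x=300) with a 20x20 classifier window and\r
+//curScale == 2.0f maps back to the rectangle {x=600, y=80, width=40, height=40}\r
+//in original image coordinates.\r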
+\r
+\r
+__global__ void growDetectionsKernel(Ncv32u *pixelMask, Ncv32u numElements,\r
+ NcvRect32u *hypotheses,\r
+ Ncv32u rectWidth, Ncv32u rectHeight, Ncv32f curScale)\r
+{\r
+ Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;\r
+ Ncv32u elemAddr = blockId * NUM_GROW_THREADS + threadIdx.x;\r
+ if (elemAddr >= numElements)\r
+ {\r
+ return;\r
+ }\r
+ hypotheses[elemAddr] = pixelToRect(pixelMask[elemAddr], rectWidth, rectHeight, curScale);\r
+}\r
+\r
+\r
+NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,\r
+ Ncv32u numPixelMaskDetections,\r
+ NCVVector<NcvRect32u> &hypotheses,\r
+ Ncv32u &totalDetections,\r
+ Ncv32u totalMaxDetections,\r
+ Ncv32u rectWidth,\r
+ Ncv32u rectHeight,\r
+ Ncv32f curScale,\r
+ cudaStream_t cuStream)\r
+{\r
+ ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);\r
+ ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&\r
+ pixelMask.memType() == NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);\r
+ ncvAssertReturn(rectWidth > 0 && rectHeight > 0, NCV_INVALID_ROI);\r
+ ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);\r
+ ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&\r
+ numPixelMaskDetections <= pixelMask.length() &&\r
+ totalDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);\r
+\r
+ NCVStatus ncvStat = NCV_SUCCESS;\r
+ Ncv32u numDetsToCopy = numPixelMaskDetections;\r
+\r
+ if (numDetsToCopy == 0)\r
+ {\r
+ return ncvStat;\r
+ }\r
+\r
+ if (totalDetections + numPixelMaskDetections > totalMaxDetections)\r
+ {\r
+ ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;\r
+ numDetsToCopy = totalMaxDetections - totalDetections;\r
+ }\r
+\r
+ dim3 block(NUM_GROW_THREADS);\r
+ dim3 grid((numDetsToCopy + NUM_GROW_THREADS - 1) / NUM_GROW_THREADS);\r
+ if (grid.x > 65535)\r
+ {\r
+ grid.y = (grid.x + 65534) / 65535;\r
+ grid.x = 65535;\r
+ }\r
+ growDetectionsKernel<<<grid, block, 0, cuStream>>>(pixelMask.ptr(), numDetsToCopy,\r
+ hypotheses.ptr() + totalDetections,\r
+ rectWidth, rectHeight, curScale);\r
+ ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);\r
+\r
+ totalDetections += numDetsToCopy;\r
+ return ncvStat;\r
+}\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Visualize file\r
+//\r
+//==============================================================================\r
+\r
+\r
+const Ncv32u NUMTHREADS_DRAWRECTS = 32;\r
+const Ncv32u NUMTHREADS_DRAWRECTS_LOG2 = 5;\r
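+\r
+//The two constants must stay in sync since the kernel computes chunk counts via the\r
+//shift; the compile-time guard below makes the invariant explicit (assuming the\r
+//NCV_CT_ASSERT macro from NCV.hpp is visible in this translation unit).\r
+NCV_CT_ASSERT(NUMTHREADS_DRAWRECTS == 1 << NUMTHREADS_DRAWRECTS_LOG2);\r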
+\r
+\r
+template <class T>\r
+__global__ void drawRects(T *d_dst,\r
+ Ncv32u dstStride,\r
+ Ncv32u dstWidth,\r
+ Ncv32u dstHeight,\r
+ NcvRect32u *d_rects,\r
+ Ncv32u numRects,\r
+ T color)\r
+{\r
+ Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;\r
+ if (blockId >= numRects * 4)\r
+ {\r
+ return;\r
+ }\r
+\r
+ NcvRect32u curRect = d_rects[blockId >> 2];\r
+ NcvBool bVertical = blockId & 0x1;\r
+ NcvBool bTopLeft = blockId & 0x2;\r
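+ //Each rectangle is served by 4 blocks: bit 0 of blockId selects a vertical vs\r
+ //horizontal edge, bit 1 selects the left/top vs right/bottom one, and\r
+ //blockId >> 2 indexes the rectangle itself.\r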
+\r
+ Ncv32u pt0x, pt0y;\r
+ if (bVertical)\r
+ {\r
+ Ncv32u numChunks = (curRect.height + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;\r
+\r
+ pt0x = bTopLeft ? curRect.x : curRect.x + curRect.width - 1;\r
+ pt0y = curRect.y;\r
+\r
+ if (pt0x < dstWidth)\r
+ {\r
+ for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)\r
+ {\r
+ Ncv32u ptY = pt0y + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;\r
+ if (ptY < pt0y + curRect.height && ptY < dstHeight)\r
+ {\r
+ d_dst[ptY * dstStride + pt0x] = color;\r
+ }\r
+ }\r
+ }\r
+ }\r
+ else\r
+ {\r
+ Ncv32u numChunks = (curRect.width + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;\r
+\r
+ pt0x = curRect.x;\r
+ pt0y = bTopLeft ? curRect.y : curRect.y + curRect.height - 1;\r
+\r
+ if (pt0y < dstHeight)\r
+ {\r
+ for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)\r
+ {\r
+ Ncv32u ptX = pt0x + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;\r
+ if (ptX < pt0x + curRect.width && ptX < dstWidth)\r
+ {\r
+ d_dst[pt0y * dstStride + ptX] = color;\r
+ }\r
+ }\r
+ }\r
+ }\r
+}\r
+\r
+\r
+template <class T>\r
+static NCVStatus drawRectsWrapperDevice(T *d_dst,\r
+ Ncv32u dstStride,\r
+ Ncv32u dstWidth,\r
+ Ncv32u dstHeight,\r
+ NcvRect32u *d_rects,\r
+ Ncv32u numRects,\r
+ T color,\r
+ cudaStream_t cuStream)\r
+{\r
+ ncvAssertReturn(d_dst != NULL && d_rects != NULL, NCV_NULL_PTR);\r
+ ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);\r
+ ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);\r
+ ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);\r
+\r
+ if (numRects == 0)\r
+ {\r
+ return NCV_SUCCESS;\r
+ }\r
+\r
+#if defined _SELF_TEST_\r
+ T *h_dst;\r
+ ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);\r
+ NcvRect32u *h_rects;\r
+ ncvAssertCUDAReturn(cudaMallocHost(&h_rects, numRects * sizeof(NcvRect32u)), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaMemcpy(h_rects, d_rects, numRects * sizeof(NcvRect32u), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);\r
+ ncvAssertReturnNcvStat(drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color));\r
+#endif\r
+\r
+ dim3 grid(numRects * 4);\r
+ dim3 block(NUMTHREADS_DRAWRECTS);\r
+ if (grid.x > 65535)\r
+ {\r
+ grid.y = (grid.x + 65534) / 65535;\r
+ grid.x = 65535;\r
+ }\r
+\r
+ drawRects<T><<<grid, block>>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color);\r
+\r
+ ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);\r
+\r
+#if defined _SELF_TEST_\r
+ T *h_dst_after;\r
+ ncvAssertCUDAReturn(cudaMallocHost(&h_dst_after, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaMemcpy(h_dst_after, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);\r
+ bool bPass = true;\r
+ for (Ncv32u i=0; i<dstHeight && bPass; i++)\r
+ {\r
+ for (Ncv32u j=0; j<dstWidth && bPass; j++)\r
+ {\r
+ if (h_dst[i*dstStride+j] != h_dst_after[i*dstStride+j])\r
+ {\r
+ printf("::drawRectsWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, h_dst[i*dstStride+j], h_dst_after[i*dstStride+j]);\r
+ bPass = false;\r
+ }\r
+ }\r
+ }\r
+ ncvAssertCUDAReturn(cudaFreeHost(h_dst_after), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaFreeHost(h_dst), NCV_CUDA_ERROR);\r
+ ncvAssertCUDAReturn(cudaFreeHost(h_rects), NCV_CUDA_ERROR);\r
+ printf("::drawRectsWrapperDevice %s\n", bPass?"PASSED":"FAILED");\r
+#endif\r
+\r
+ return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,\r
+ Ncv32u dstStride,\r
+ Ncv32u dstWidth,\r
+ Ncv32u dstHeight,\r
+ NcvRect32u *d_rects,\r
+ Ncv32u numRects,\r
+ Ncv8u color,\r
+ cudaStream_t cuStream)\r
+{\r
+ return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);\r
+}\r
+\r
+\r
+NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,\r
+ Ncv32u dstStride,\r
+ Ncv32u dstWidth,\r
+ Ncv32u dstHeight,\r
+ NcvRect32u *d_rects,\r
+ Ncv32u numRects,\r
+ Ncv32u color,\r
+ cudaStream_t cuStream)\r
+{\r
+ return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);\r
+}\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Pipeline file\r
+//\r
+//==============================================================================\r
+\r
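+//Overview of the multiscale pipeline implemented below:\r
+//  1) build plain and squared integral images of the 8u source (NPP_staging);\r
+//  2) per scale: downsample both integral images nearest-neighbor, compute the\r
+//     per-anchor stddev weights (nppiStRectStdDev_32f_C1R), run the cascade, and\r
+//     grow the surviving pixel-mask entries into rectangle hypotheses;\r
+//  3) group the accumulated hypotheses on the host (ncvFilterHypotheses_host) and\r
+//     optionally visualize the detections in place.\r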
+\r
+NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,\r
+ NcvSize32u srcRoi,\r
+ NCVVector<NcvRect32u> &d_dstRects,\r
+ Ncv32u &dstNumRects,\r
+\r
+ HaarClassifierCascadeDescriptor &haar,\r
+ NCVVector<HaarStage64> &h_HaarStages,\r
+ NCVVector<HaarStage64> &d_HaarStages,\r
+ NCVVector<HaarClassifierNode128> &d_HaarNodes,\r
+ NCVVector<HaarFeature64> &d_HaarFeatures,\r
+\r
+ NcvSize32u minObjSize,\r
+ Ncv32u minNeighbors, //default 4\r
+ Ncv32f scaleStep, //default 1.2f\r
+ Ncv32u pixelStep, //default 1\r
+ Ncv32u flags, //default NCVPipeObjDet_Default\r
+\r
+ INCVMemAllocator &gpuAllocator,\r
+ INCVMemAllocator &cpuAllocator,\r
+ Ncv32u devPropMajor,\r
+ Ncv32u devPropMinor,\r
+ cudaStream_t cuStream)\r
+{\r
+ ncvAssertReturn(d_srcImg.memType() == d_dstRects.memType() &&\r
+ d_srcImg.memType() == gpuAllocator.memType() &&\r
+ (d_srcImg.memType() == NCVMemoryTypeDevice ||\r
+ d_srcImg.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);\r
+ ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&\r
+ d_HaarStages.memType() == d_HaarFeatures.memType() &&\r
+ (d_HaarStages.memType() == NCVMemoryTypeDevice ||\r
+ d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);\r
+ ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);\r
+ ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);\r
+ ncvAssertReturn((d_srcImg.ptr() != NULL && d_dstRects.ptr() != NULL &&\r
+ h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&\r
+ d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);\r
+ ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0 &&\r
+ d_srcImg.width() >= srcRoi.width && d_srcImg.height() >= srcRoi.height &&\r
+ srcRoi.width >= minObjSize.width && srcRoi.height >= minObjSize.height &&\r
+ d_dstRects.length() >= 1, NCV_DIMENSIONS_INVALID);\r
+ ncvAssertReturn(scaleStep > 1.0f, NCV_INVALID_SCALE);\r
+ ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&\r
+ d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&\r
+ d_HaarFeatures.length() >= haar.NumFeatures &&\r
+ d_HaarStages.length() == h_HaarStages.length() &&\r
+ haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);\r
+ ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);\r
+ ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);\r
+\r
+ //TODO: set NPP active stream to cuStream\r
+\r
+ NCVStatus ncvStat;\r
+ NCV_SET_SKIP_COND(gpuAllocator.isCounting());\r
+\r
+ Ncv32u integralWidth = d_srcImg.width() + 1;\r
+ Ncv32u integralHeight = d_srcImg.height() + 1;\r
+\r
+ NCVMatrixAlloc<Ncv32u> d_integralImage(gpuAllocator, integralWidth, integralHeight);\r
+ ncvAssertReturn(d_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+ NCVMatrixAlloc<Ncv64u> d_sqIntegralImage(gpuAllocator, integralWidth, integralHeight);\r
+ ncvAssertReturn(d_sqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+ NCVMatrixAlloc<Ncv32f> d_rectStdDev(gpuAllocator, d_srcImg.width(), d_srcImg.height());\r
+ ncvAssertReturn(d_rectStdDev.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+ NCVMatrixAlloc<Ncv32u> d_pixelMask(gpuAllocator, d_srcImg.width(), d_srcImg.height());\r
+ ncvAssertReturn(d_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+ NCVMatrixAlloc<Ncv32u> d_scaledIntegralImage(gpuAllocator, integralWidth, integralHeight);\r
+ ncvAssertReturn(d_scaledIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+ NCVMatrixAlloc<Ncv64u> d_scaledSqIntegralImage(gpuAllocator, integralWidth, integralHeight);\r
+ ncvAssertReturn(d_scaledSqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+ NCVVectorAlloc<NcvRect32u> d_hypothesesIntermediate(gpuAllocator, d_srcImg.width() * d_srcImg.height());\r
+ ncvAssertReturn(d_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+ NCVVectorAlloc<NcvRect32u> h_hypothesesIntermediate(cpuAllocator, d_srcImg.width() * d_srcImg.height());\r
+ ncvAssertReturn(h_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+ NppStStatus nppStat;\r
+ Ncv32u szTmpBufIntegral, szTmpBufSqIntegral;\r
+ nppStat = nppiStIntegralGetSize_8u32u(NppStSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral);\r
+ ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);\r
+ nppStat = nppiStSqrIntegralGetSize_8u64u(NppStSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral);\r
+ ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);\r
+ NCVVectorAlloc<Ncv8u> d_tmpIIbuf(gpuAllocator, std::max(szTmpBufIntegral, szTmpBufSqIntegral));\r
+ ncvAssertReturn(d_tmpIIbuf.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+ NCV_SKIP_COND_BEGIN\r
+\r
+ nppStat = nppiStIntegral_8u32u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),\r
+ d_integralImage.ptr(), d_integralImage.pitch(),\r
+ NppStSize32u(d_srcImg.width(), d_srcImg.height()),\r
+ d_tmpIIbuf.ptr(), szTmpBufIntegral);\r
+ ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);\r
+\r
+ nppStat = nppiStSqrIntegral_8u64u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),\r
+ d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),\r
+ NppStSize32u(d_srcImg.width(), d_srcImg.height()),\r
+ d_tmpIIbuf.ptr(), szTmpBufSqIntegral);\r
+ ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);\r
+\r
+ NCV_SKIP_COND_END\r
+\r
+ dstNumRects = 0;\r
+\r
+ Ncv32u lastCheckedScale = 0;\r
+ NcvBool bReverseTraverseScale = ((flags & NCVPipeObjDet_FindLargestObject) != 0);\r
+ std::vector<Ncv32u> scalesVector;\r
+\r
+ NcvBool bFoundLargestFace = false;\r
+\r
+ for (Ncv32f scaleIter = 1.0f; ; scaleIter *= scaleStep)\r
+ {\r
+ Ncv32u scale = (Ncv32u)scaleIter;\r
+ if (lastCheckedScale == scale)\r
+ {\r
+ continue;\r
+ }\r
+ lastCheckedScale = scale;\r
+\r
+ if (haar.ClassifierSize.width * (Ncv32s)scale < minObjSize.width ||\r
+ haar.ClassifierSize.height * (Ncv32s)scale < minObjSize.height)\r
+ {\r
+ continue;\r
+ }\r
+\r
+ NcvSize32s srcRoi, srcIIRoi, scaledIIRoi, searchRoi;\r
+\r
+ srcRoi.width = d_srcImg.width();\r
+ srcRoi.height = d_srcImg.height();\r
+\r
+ srcIIRoi.width = srcRoi.width + 1;\r
+ srcIIRoi.height = srcRoi.height + 1;\r
+\r
+ scaledIIRoi.width = srcIIRoi.width / scale;\r
+ scaledIIRoi.height = srcIIRoi.height / scale;\r
+\r
+ searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;\r
+ searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;\r
+\r
+ if (searchRoi.width <= 0 || searchRoi.height <= 0)\r
+ {\r
+ break;\r
+ }\r
+\r
+ scalesVector.push_back(scale);\r
+\r
+ if (gpuAllocator.isCounting())\r
+ {\r
+ break;\r
+ }\r
+ }\r
+\r
+ if (bReverseTraverseScale)\r
+ {\r
+ std::reverse(scalesVector.begin(), scalesVector.end());\r
+ }\r
+\r
+ //TODO: handle _fair_scale_ flag\r
+ for (Ncv32u i=0; i<scalesVector.size(); i++)\r
+ {\r
+ Ncv32u scale = scalesVector[i];\r
+\r
+ NcvSize32u srcRoi, scaledIIRoi, searchRoi;\r
+ NppStSize32u srcIIRoi;\r
+ srcRoi.width = d_srcImg.width();\r
+ srcRoi.height = d_srcImg.height();\r
+ srcIIRoi.width = srcRoi.width + 1;\r
+ srcIIRoi.height = srcRoi.height + 1;\r
+ scaledIIRoi.width = srcIIRoi.width / scale;\r
+ scaledIIRoi.height = srcIIRoi.height / scale;\r
+ searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;\r
+ searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;\r
+\r
+ NCV_SKIP_COND_BEGIN\r
+\r
+ nppStat = nppiStDownsampleNearest_32u_C1R(\r
+ d_integralImage.ptr(), d_integralImage.pitch(),\r
+ d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),\r
+ srcIIRoi, scale, true);\r
+ ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);\r
+\r
+ nppStat = nppiStDownsampleNearest_64u_C1R(\r
+ d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),\r
+ d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),\r
+ srcIIRoi, scale, true);\r
+ ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);\r
+\r
+ const NppStRect32u rect(\r
+ HAAR_STDDEV_BORDER,\r
+ HAAR_STDDEV_BORDER,\r
+ haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER,\r
+ haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER);\r
+ nppStat = nppiStRectStdDev_32f_C1R(\r
+ d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),\r
+ d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),\r
+ d_rectStdDev.ptr(), d_rectStdDev.pitch(),\r
+ NppStSize32u(searchRoi.width, searchRoi.height), rect,\r
+ (Ncv32f)scale*scale, true);\r
+ ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);\r
+\r
+ NCV_SKIP_COND_END\r
+\r
+ Ncv32u detectionsOnThisScale;\r
+ ncvStat = ncvApplyHaarClassifierCascade_device(\r
+ d_scaledIntegralImage, d_rectStdDev, d_pixelMask,\r
+ detectionsOnThisScale,\r
+ haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false,\r
+ searchRoi, pixelStep, (Ncv32f)scale*scale,\r
+ gpuAllocator, cpuAllocator, devPropMajor, devPropMinor, cuStream);\r
+ ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);\r
+\r
+ NCV_SKIP_COND_BEGIN\r
+\r
+ NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment());\r
+ ncvStat = ncvGrowDetectionsVector_device(\r
+ d_vecPixelMask,\r
+ detectionsOnThisScale,\r
+ d_hypothesesIntermediate,\r
+ dstNumRects,\r
+ d_hypothesesIntermediate.length(),\r
+ haar.ClassifierSize.width,\r
+ haar.ClassifierSize.height,\r
+ (Ncv32f)scale,\r
+ cuStream);\r
+ ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);\r
+\r
+ if (flags & NCVPipeObjDet_FindLargestObject)\r
+ {\r
+ if (dstNumRects == 0)\r
+ {\r
+ continue;\r
+ }\r
+\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,\r
+ dstNumRects * sizeof(NcvRect32u));\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+\r
+ Ncv32u numStrongHypothesesNow = dstNumRects;\r
+ ncvStat = ncvFilterHypotheses_host(\r
+ h_hypothesesIntermediate,\r
+ numStrongHypothesesNow,\r
+ minNeighbors,\r
+ RECT_SIMILARITY_PROPORTION,\r
+ NULL);\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+\r
+ if (numStrongHypothesesNow > 0)\r
+ {\r
+ NcvRect32u maxRect = h_hypothesesIntermediate.ptr()[0];\r
+ for (Ncv32u j=1; j<numStrongHypothesesNow; j++)\r
+ {\r
+ if (maxRect.width < h_hypothesesIntermediate.ptr()[j].width)\r
+ {\r
+ maxRect = h_hypothesesIntermediate.ptr()[j];\r
+ }\r
+ }\r
+\r
+ h_hypothesesIntermediate.ptr()[0] = maxRect;\r
+ dstNumRects = 1;\r
+\r
+ ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream, sizeof(NcvRect32u));\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+\r
+ bFoundLargestFace = true;\r
+\r
+ break;\r
+ }\r
+ }\r
+\r
+ NCV_SKIP_COND_END\r
+\r
+ if (gpuAllocator.isCounting())\r
+ {\r
+ break;\r
+ }\r
+ }\r
+\r
+ NCVStatus ncvRetCode = NCV_SUCCESS;\r
+\r
+ NCV_SKIP_COND_BEGIN\r
+\r
+ if (flags & NCVPipeObjDet_FindLargestObject)\r
+ {\r
+ if (!bFoundLargestFace)\r
+ {\r
+ dstNumRects = 0;\r
+ }\r
+ }\r
+ else\r
+ {\r
+ //TODO: move hypotheses filtering to the GPU pipeline (the last remaining CPU-resident stage)\r
+ if (dstNumRects != 0)\r
+ {\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,\r
+ dstNumRects * sizeof(NcvRect32u));\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ }\r
+\r
+ ncvStat = ncvFilterHypotheses_host(\r
+ h_hypothesesIntermediate,\r
+ dstNumRects,\r
+ minNeighbors,\r
+ RECT_SIMILARITY_PROPORTION,\r
+ NULL);\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+\r
+ if (dstNumRects > d_dstRects.length())\r
+ {\r
+ ncvRetCode = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;\r
+ dstNumRects = d_dstRects.length();\r
+ }\r
+\r
+ if (dstNumRects != 0)\r
+ {\r
+ ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream,\r
+ dstNumRects * sizeof(NcvRect32u));\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ }\r
+ }\r
+\r
+ if (flags & NCVPipeObjDet_VisualizeInPlace)\r
+ {\r
+ ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+ ncvDrawRects_8u_device(d_srcImg.ptr(), d_srcImg.stride(),\r
+ d_srcImg.width(), d_srcImg.height(),\r
+ d_dstRects.ptr(), dstNumRects, 255, cuStream);\r
+ }\r
+\r
+ NCV_SKIP_COND_END\r
+\r
+ return ncvRetCode;\r
+}\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Purely Host code: classifier IO, mock-ups\r
+//\r
+//==============================================================================\r
+\r
+\r
+#ifdef _SELF_TEST_\r
+#include <float.h>\r
+#endif\r
+\r
+\r
+NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,\r
+ NCVMatrix<Ncv32f> &h_weights,\r
+ NCVMatrixAlloc<Ncv32u> &h_pixelMask,\r
+ Ncv32u &numDetections,\r
+ HaarClassifierCascadeDescriptor &haar,\r
+ NCVVector<HaarStage64> &h_HaarStages,\r
+ NCVVector<HaarClassifierNode128> &h_HaarNodes,\r
+ NCVVector<HaarFeature64> &h_HaarFeatures,\r
+ NcvBool bMaskElements,\r
+ NcvSize32u anchorsRoi,\r
+ Ncv32u pixelStep,\r
+ Ncv32f scaleArea)\r
+{\r
+ ncvAssertReturn(h_integralImage.memType() == h_weights.memType() &&\r
+ h_integralImage.memType() == h_pixelMask.memType() &&\r
+ (h_integralImage.memType() == NCVMemoryTypeHostPageable ||\r
+ h_integralImage.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);\r
+ ncvAssertReturn(h_HaarStages.memType() == h_HaarNodes.memType() &&\r
+ h_HaarStages.memType() == h_HaarFeatures.memType() &&\r
+ (h_HaarStages.memType() == NCVMemoryTypeHostPageable ||\r
+ h_HaarStages.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);\r
+ ncvAssertReturn(h_integralImage.ptr() != NULL && h_weights.ptr() != NULL && h_pixelMask.ptr() != NULL &&\r
+ h_HaarStages.ptr() != NULL && h_HaarNodes.ptr() != NULL && h_HaarFeatures.ptr() != NULL, NCV_NULL_PTR);\r
+ ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&\r
+ h_pixelMask.width() >= anchorsRoi.width && h_pixelMask.height() >= anchorsRoi.height &&\r
+ h_weights.width() >= anchorsRoi.width && h_weights.height() >= anchorsRoi.height &&\r
+ h_integralImage.width() >= anchorsRoi.width + haar.ClassifierSize.width &&\r
+ h_integralImage.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);\r
+ ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);\r
+ ncvAssertReturn(h_HaarStages.length() >= haar.NumStages &&\r
+ h_HaarNodes.length() >= haar.NumClassifierTotalNodes &&\r
+ h_HaarFeatures.length() >= haar.NumFeatures &&\r
+ haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);\r
+ ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);\r
+ ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);\r
+\r
+ Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *\r
+ (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));\r
+\r
+ for (Ncv32u i=0; i<anchorsRoi.height; i++)\r
+ {\r
+ for (Ncv32u j=0; j<h_pixelMask.stride(); j++)\r
+ {\r
+ if (i % pixelStep != 0 || j % pixelStep != 0 || j >= anchorsRoi.width)\r
+ {\r
+ h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;\r
+ }\r
+ else\r
+ {\r
+ for (Ncv32u iStage = 0; iStage < haar.NumStages; iStage++)\r
+ {\r
+ Ncv32f curStageSum = 0.0f;\r
+ Ncv32u numRootNodesInStage = h_HaarStages.ptr()[iStage].getNumClassifierRootNodes();\r
+ Ncv32u curRootNodeOffset = h_HaarStages.ptr()[iStage].getStartClassifierRootNodeOffset();\r
+\r
+ if (iStage == 0)\r
+ {\r
+ if (bMaskElements && h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)\r
+ {\r
+ break;\r
+ }\r
+ else\r
+ {\r
+ h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = ((i << 16) | j);\r
+ }\r
+ }\r
+ else if (h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)\r
+ {\r
+ break;\r
+ }\r
+\r
+ while (numRootNodesInStage--)\r
+ {\r
+ NcvBool bMoreNodesToTraverse = true;\r
+ Ncv32u curNodeOffset = curRootNodeOffset;\r
+\r
+ while (bMoreNodesToTraverse)\r
+ {\r
+ HaarClassifierNode128 curNode = h_HaarNodes.ptr()[curNodeOffset];\r
+ Ncv32u curNodeFeaturesNum = curNode.getFeatureDesc().getNumFeatures();\r
+ Ncv32u curNodeFeaturesOffs = curNode.getFeatureDesc().getFeaturesOffset();\r
+\r
+ Ncv32f curNodeVal = 0.f;\r
+ for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)\r
+ {\r
+ HaarFeature64 feature = h_HaarFeatures.ptr()[curNodeFeaturesOffs + iRect];\r
+ Ncv32u rectX, rectY, rectWidth, rectHeight;\r
+ feature.getRect(&rectX, &rectY, &rectWidth, &rectHeight);\r
+ Ncv32f rectWeight = feature.getWeight();\r
+ Ncv32u iioffsTL = (i + rectY) * h_integralImage.stride() + (j + rectX);\r
+ Ncv32u iioffsTR = iioffsTL + rectWidth;\r
+ Ncv32u iioffsBL = iioffsTL + rectHeight * h_integralImage.stride();\r
+ Ncv32u iioffsBR = iioffsBL + rectWidth;\r
+\r
+ Ncv32u iivalTL = h_integralImage.ptr()[iioffsTL];\r
+ Ncv32u iivalTR = h_integralImage.ptr()[iioffsTR];\r
+ Ncv32u iivalBL = h_integralImage.ptr()[iioffsBL];\r
+ Ncv32u iivalBR = h_integralImage.ptr()[iioffsBR];\r
+ Ncv32u rectSum = iivalBR - iivalBL + iivalTL - iivalTR;\r
+ curNodeVal += (Ncv32f)rectSum * rectWeight;\r
+ }\r
+\r
+ HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();\r
+ HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();\r
+ Ncv32f nodeThreshold = curNode.getThreshold();\r
+ HaarClassifierNodeDescriptor32 nextNodeDescriptor;\r
+\r
+ if (curNodeVal < scaleAreaPixels * h_weights.ptr()[i * h_weights.stride() + j] * nodeThreshold)\r
+ {\r
+ nextNodeDescriptor = nodeLeft;\r
+ }\r
+ else\r
+ {\r
+ nextNodeDescriptor = nodeRight;\r
+ }\r
+\r
+ NcvBool tmpIsLeaf = nextNodeDescriptor.isLeaf();\r
+ if (tmpIsLeaf)\r
+ {\r
+ Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValueHost();\r
+ curStageSum += tmpLeafValue;\r
+ bMoreNodesToTraverse = false;\r
+ }\r
+ else\r
+ {\r
+ curNodeOffset = nextNodeDescriptor.getNextNodeOffset();\r
+ }\r
+ }\r
+\r
+ curRootNodeOffset++;\r
+ }\r
+\r
+ Ncv32f tmpStageThreshold = h_HaarStages.ptr()[iStage].getStageThreshold();\r
+ if (curStageSum < tmpStageThreshold)\r
+ {\r
+ //drop\r
+ h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;\r
+ break;\r
+ }\r
+ }\r
+ }\r
+ }\r
+ }\r
+\r
+ std::sort(h_pixelMask.ptr(), h_pixelMask.ptr() + anchorsRoi.height * h_pixelMask.stride());\r
+ Ncv32u i = 0;\r
+ for (; i<anchorsRoi.height * h_pixelMask.stride(); i++)\r
+ {\r
+ if (h_pixelMask.ptr()[i] == OBJDET_MASK_ELEMENT_INVALID_32U)\r
+ {\r
+ break;\r
+ }\r
+ }\r
+ numDetections = i;\r
+\r
+ return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,\r
+ Ncv32u numPixelMaskDetections,\r
+ NCVVector<NcvRect32u> &hypotheses,\r
+ Ncv32u &totalDetections,\r
+ Ncv32u totalMaxDetections,\r
+ Ncv32u rectWidth,\r
+ Ncv32u rectHeight,\r
+ Ncv32f curScale)\r
+{\r
+ ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);\r
+ ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&\r
+ pixelMask.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);\r
+ ncvAssertReturn(rectWidth > 0 && rectHeight > 0, NCV_INVALID_ROI);\r
+ ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);\r
+ ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&\r
+ numPixelMaskDetections <= pixelMask.length() &&\r
+ totalDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);\r
+\r
+ NCVStatus ncvStat = NCV_SUCCESS;\r
+ Ncv32u numDetsToCopy = numPixelMaskDetections;\r
+\r
+ if (numDetsToCopy == 0)\r
+ {\r
+ return ncvStat;\r
+ }\r
+\r
+ if (totalDetections + numPixelMaskDetections > totalMaxDetections)\r
+ {\r
+ ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;\r
+ numDetsToCopy = totalMaxDetections - totalDetections;\r
+ }\r
+\r
+ for (Ncv32u i=0; i<numDetsToCopy; i++)\r
+ {\r
+ hypotheses.ptr()[totalDetections + i] = pixelToRect(pixelMask.ptr()[i], rectWidth, rectHeight, curScale);\r
+ }\r
+\r
+ totalDetections += numDetsToCopy;\r
+ return ncvStat;\r
+}\r
+\r
+NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,\r
+ Ncv32u &numHypotheses,\r
+ Ncv32u minNeighbors,\r
+ Ncv32f intersectEps,\r
+ NCVVector<Ncv32u> *hypothesesWeights)\r
+{\r
+ ncvAssertReturn(hypotheses.memType() == NCVMemoryTypeHostPageable ||\r
+ hypotheses.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);\r
+ if (hypothesesWeights != NULL)\r
+ {\r
+ ncvAssertReturn(hypothesesWeights->memType() == NCVMemoryTypeHostPageable ||\r
+ hypothesesWeights->memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);\r
+ }\r
+\r
+ if (numHypotheses == 0)\r
+ {\r
+ return NCV_SUCCESS;\r
+ }\r
+\r
+ std::vector<NcvRect32u> rects(numHypotheses);\r
+ memcpy(&rects[0], hypotheses.ptr(), numHypotheses * sizeof(NcvRect32u));\r
+\r
+ std::vector<Ncv32u> weights;\r
+ if (hypothesesWeights != NULL)\r
+ {\r
+ groupRectangles(rects, minNeighbors, intersectEps, &weights);\r
+ }\r
+ else\r
+ {\r
+ groupRectangles(rects, minNeighbors, intersectEps, NULL);\r
+ }\r
+\r
+ numHypotheses = (Ncv32u)rects.size();\r
+ if (numHypotheses > 0)\r
+ {\r
+ memcpy(hypotheses.ptr(), &rects[0], numHypotheses * sizeof(NcvRect32u));\r
+ }\r
+\r
+ if (hypothesesWeights != NULL && numHypotheses > 0)\r
+ {\r
+ memcpy(hypothesesWeights->ptr(), &weights[0], numHypotheses * sizeof(Ncv32u));\r
+ }\r
+\r
+ return NCV_SUCCESS;\r
+}\r
+\r
+\r
+template <class T>\r
+static NCVStatus drawRectsWrapperHost(T *h_dst,\r
+ Ncv32u dstStride,\r
+ Ncv32u dstWidth,\r
+ Ncv32u dstHeight,\r
+ NcvRect32u *h_rects,\r
+ Ncv32u numRects,\r
+ T color)\r
+{\r
+ ncvAssertReturn(h_dst != NULL && h_rects != NULL, NCV_NULL_PTR);\r
+ ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);\r
+ ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);\r
+ ncvAssertReturn(numRects != 0, NCV_SUCCESS); //no rects is a no-op, not an error\r
+ ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);\r
+\r
+ for (Ncv32u i=0; i<numRects; i++)\r
+ {\r
+ NcvRect32u rect = h_rects[i];\r
+\r
+ if (rect.x < dstWidth)\r
+ {\r
+ for (Ncv32u row=rect.y; row<rect.y+rect.height && row<dstHeight; row++)\r
+ {\r
+ h_dst[row*dstStride+rect.x] = color;\r
+ }\r
+ }\r
+ if (rect.x+rect.width-1 < dstWidth)\r
+ {\r
+ for (Ncv32u row=rect.y; row<rect.y+rect.height && row<dstHeight; row++)\r
+ {\r
+ h_dst[row*dstStride+rect.x+rect.width-1] = color;\r
+ }\r
+ }\r
+ if (rect.y < dstHeight)\r
+ {\r
+ for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)\r
+ {\r
+ h_dst[rect.y*dstStride+j] = color;\r
+ }\r
+ }\r
+ if (rect.y + rect.height - 1 < dstHeight)\r
+ {\r
+ for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)\r
+ {\r
+ h_dst[(rect.y+rect.height-1)*dstStride+j] = color;\r
+ }\r
+ }\r
+ }\r
+\r
+ return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,\r
+ Ncv32u dstStride,\r
+ Ncv32u dstWidth,\r
+ Ncv32u dstHeight,\r
+ NcvRect32u *h_rects,\r
+ Ncv32u numRects,\r
+ Ncv8u color)\r
+{\r
+ return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);\r
+}\r
+\r
+\r
+NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,\r
+ Ncv32u dstStride,\r
+ Ncv32u dstWidth,\r
+ Ncv32u dstHeight,\r
+ NcvRect32u *h_rects,\r
+ Ncv32u numRects,\r
+ Ncv32u color)\r
+{\r
+ return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);\r
+}\r
+\r
+\r
+NCVStatus loadFromXML(const std::string &filename,\r
+ HaarClassifierCascadeDescriptor &haar,\r
+ std::vector<HaarStage64> &haarStages,\r
+ std::vector<HaarClassifierNode128> &haarClassifierNodes,\r
+ std::vector<HaarFeature64> &haarFeatures);\r
+\r
+\r
+#define NVBIN_HAAR_SIZERESERVED 16\r
+#define NVBIN_HAAR_VERSION 0x1\r
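+\r
+//NVBIN file layout, as produced and consumed by ncvHaarStoreNVBIN_host and\r
+//loadFromNVBIN below (a documentation sketch derived from that code):\r
+// offset 0 : Ncv32u fileVersion (must equal NVBIN_HAAR_VERSION)\r
+// offset 4 : Ncv32u fsize (total file size in bytes)\r
+// reserved up to NVBIN_HAAR_SIZERESERVED (16) bytes\r
+// then : NumStages, NumClassifierRootNodes, NumClassifierTotalNodes,\r
+// NumFeatures (Ncv32u each), ClassifierSize (NcvSize32u),\r
+// bNeedsTiltedII, bHasStumpsOnly (NcvBool each),\r
+// followed by the packed HaarStage64, HaarClassifierNode128 and\r
+// HaarFeature64 arrays\r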
+\r
+\r
+static NCVStatus loadFromNVBIN(const std::string &filename,\r
+ HaarClassifierCascadeDescriptor &haar,\r
+ std::vector<HaarStage64> &haarStages,\r
+ std::vector<HaarClassifierNode128> &haarClassifierNodes,\r
+ std::vector<HaarFeature64> &haarFeatures)\r
+{\r
+ FILE *fp;\r
+ fopen_s(&fp, filename.c_str(), "rb");\r
+ ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);\r
+ Ncv32u fileVersion;\r
+ fread_s(&fileVersion, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp);\r
+ ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);\r
+ Ncv32u fsize;\r
+ fread_s(&fsize, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp);\r
+ fseek(fp, 0, SEEK_END);\r
+ Ncv32u fsizeActual = (Ncv32u)ftell(fp);\r
+ ncvAssertReturn(fsize == fsizeActual, NCV_FILE_ERROR);\r
+\r
+ std::vector<unsigned char> fdata;\r
+ fdata.resize(fsize);\r
+ Ncv32u dataOffset = 0;\r
+ fseek(fp, 0, SEEK_SET);\r
+ fread_s(&fdata[0], fsize, fsize, 1, fp);\r
+ fclose(fp);\r
+\r
+ //data\r
+ dataOffset = NVBIN_HAAR_SIZERESERVED;\r
+ haar.NumStages = *(Ncv32u *)(&fdata[0]+dataOffset);\r
+ dataOffset += sizeof(Ncv32u);\r
+ haar.NumClassifierRootNodes = *(Ncv32u *)(&fdata[0]+dataOffset);\r
+ dataOffset += sizeof(Ncv32u);\r
+ haar.NumClassifierTotalNodes = *(Ncv32u *)(&fdata[0]+dataOffset);\r
+ dataOffset += sizeof(Ncv32u);\r
+ haar.NumFeatures = *(Ncv32u *)(&fdata[0]+dataOffset);\r
+ dataOffset += sizeof(Ncv32u);\r
+ haar.ClassifierSize = *(NcvSize32u *)(&fdata[0]+dataOffset);\r
+ dataOffset += sizeof(NcvSize32u);\r
+ haar.bNeedsTiltedII = *(NcvBool *)(&fdata[0]+dataOffset);\r
+ dataOffset += sizeof(NcvBool);\r
+ haar.bHasStumpsOnly = *(NcvBool *)(&fdata[0]+dataOffset);\r
+ dataOffset += sizeof(NcvBool);\r
+\r
+ haarStages.resize(haar.NumStages);\r
+ haarClassifierNodes.resize(haar.NumClassifierTotalNodes);\r
+ haarFeatures.resize(haar.NumFeatures);\r
+\r
+ Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);\r
+ Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);\r
+ Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);\r
+\r
+ memcpy(&haarStages[0], &fdata[0]+dataOffset, szStages);\r
+ dataOffset += szStages;\r
+ memcpy(&haarClassifierNodes[0], &fdata[0]+dataOffset, szClassifiers);\r
+ dataOffset += szClassifiers;\r
+ memcpy(&haarFeatures[0], &fdata[0]+dataOffset, szFeatures);\r
+ dataOffset += szFeatures;\r
+\r
+ return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,\r
+ Ncv32u &numNodes, Ncv32u &numFeatures)\r
+{\r
+ NCVStatus ncvStat;\r
+\r
+ std::string fext = filename.substr(filename.find_last_of(".") + 1);\r
+ std::transform(fext.begin(), fext.end(), fext.begin(), ::tolower);\r
+\r
+ if (fext == "nvbin")\r
+ {\r
+ FILE *fp;\r
+ fopen_s(&fp, filename.c_str(), "rb");\r
+ ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);\r
+ Ncv32u fileVersion;\r
+ fread_s(&fileVersion, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp);\r
+ ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);\r
+ fseek(fp, NVBIN_HAAR_SIZERESERVED, SEEK_SET);\r
+ Ncv32u tmp;\r
+ fread_s(&numStages, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp);\r
+ fread_s(&tmp, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp);\r
+ fread_s(&numNodes, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp);\r
+ fread_s(&numFeatures, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp);\r
+ fclose(fp);\r
+ }\r
+ else if (fext == "xml")\r
+ {\r
+ HaarClassifierCascadeDescriptor haar;\r
+ std::vector<HaarStage64> haarStages;\r
+ std::vector<HaarClassifierNode128> haarNodes;\r
+ std::vector<HaarFeature64> haarFeatures;\r
+\r
+ ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+\r
+ numStages = haar.NumStages;\r
+ numNodes = haar.NumClassifierTotalNodes;\r
+ numFeatures = haar.NumFeatures;\r
+ }\r
+ else\r
+ {\r
+ return NCV_HAAR_XML_LOADING_EXCEPTION;\r
+ }\r
+\r
+ return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,\r
+ HaarClassifierCascadeDescriptor &haar,\r
+ NCVVector<HaarStage64> &h_HaarStages,\r
+ NCVVector<HaarClassifierNode128> &h_HaarNodes,\r
+ NCVVector<HaarFeature64> &h_HaarFeatures)\r
+{\r
+ ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&\r
+ h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&\r
+ h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);\r
+\r
+ NCVStatus ncvStat;\r
+\r
+ std::string fext = filename.substr(filename.find_last_of(".") + 1);\r
+ std::transform(fext.begin(), fext.end(), fext.begin(), ::tolower);\r
+\r
+ std::vector<HaarStage64> haarStages;\r
+ std::vector<HaarClassifierNode128> haarNodes;\r
+ std::vector<HaarFeature64> haarFeatures;\r
+\r
+ if (fext == "nvbin")\r
+ {\r
+ ncvStat = loadFromNVBIN(filename, haar, haarStages, haarNodes, haarFeatures);\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ }\r
+ else if (fext == "xml")\r
+ {\r
+ ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);\r
+ ncvAssertReturnNcvStat(ncvStat);\r
+ }\r
+ else\r
+ {\r
+ return NCV_HAAR_XML_LOADING_EXCEPTION;\r
+ }\r
+\r
+ ncvAssertReturn(h_HaarStages.length() >= haarStages.size(), NCV_MEM_INSUFFICIENT_CAPACITY);\r
+ ncvAssertReturn(h_HaarNodes.length() >= haarNodes.size(), NCV_MEM_INSUFFICIENT_CAPACITY);\r
+ ncvAssertReturn(h_HaarFeatures.length() >= haarFeatures.size(), NCV_MEM_INSUFFICIENT_CAPACITY);\r
+\r
+ memcpy(h_HaarStages.ptr(), &haarStages[0], haarStages.size()*sizeof(HaarStage64));\r
+ memcpy(h_HaarNodes.ptr(), &haarNodes[0], haarNodes.size()*sizeof(HaarClassifierNode128));\r
+ memcpy(h_HaarFeatures.ptr(), &haarFeatures[0], haarFeatures.size()*sizeof(HaarFeature64));\r
+\r
+ return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,\r
+ HaarClassifierCascadeDescriptor haar,\r
+ NCVVector<HaarStage64> &h_HaarStages,\r
+ NCVVector<HaarClassifierNode128> &h_HaarNodes,\r
+ NCVVector<HaarFeature64> &h_HaarFeatures)\r
+{\r
+ ncvAssertReturn(h_HaarStages.length() >= haar.NumStages, NCV_INCONSISTENT_INPUT);\r
+ ncvAssertReturn(h_HaarNodes.length() >= haar.NumClassifierTotalNodes, NCV_INCONSISTENT_INPUT);\r
+ ncvAssertReturn(h_HaarFeatures.length() >= haar.NumFeatures, NCV_INCONSISTENT_INPUT);\r
+ ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&\r
+ h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&\r
+ h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);\r
+\r
+ Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);\r
+ Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);\r
+ Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);\r
+\r
+ Ncv32u dataOffset = 0;\r
+ std::vector<unsigned char> fdata;\r
+ fdata.resize(szStages+szClassifiers+szFeatures+1024, 0);\r
+\r
+ //header\r
+ *(Ncv32u *)(&fdata[0]+dataOffset) = NVBIN_HAAR_VERSION;\r
+\r
+ //data\r
+ dataOffset = NVBIN_HAAR_SIZERESERVED;\r
+ *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumStages;\r
+ dataOffset += sizeof(Ncv32u);\r
+ *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierRootNodes;\r
+ dataOffset += sizeof(Ncv32u);\r
+ *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierTotalNodes;\r
+ dataOffset += sizeof(Ncv32u);\r
+ *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumFeatures;\r
+ dataOffset += sizeof(Ncv32u);\r
+ *(NcvSize32u *)(&fdata[0]+dataOffset) = haar.ClassifierSize;\r
+ dataOffset += sizeof(NcvSize32u);\r
+ *(NcvBool *)(&fdata[0]+dataOffset) = haar.bNeedsTiltedII;\r
+ dataOffset += sizeof(NcvBool);\r
+ *(NcvBool *)(&fdata[0]+dataOffset) = haar.bHasStumpsOnly;\r
+ dataOffset += sizeof(NcvBool);\r
+\r
+ memcpy(&fdata[0]+dataOffset, h_HaarStages.ptr(), szStages);\r
+ dataOffset += szStages;\r
+ memcpy(&fdata[0]+dataOffset, h_HaarNodes.ptr(), szClassifiers);\r
+ dataOffset += szClassifiers;\r
+ memcpy(&fdata[0]+dataOffset, h_HaarFeatures.ptr(), szFeatures);\r
+ dataOffset += szFeatures;\r
+ Ncv32u fsize = dataOffset;\r
+\r
+ //TODO: CRC32 here\r
+\r
+ //update header\r
+ dataOffset = sizeof(Ncv32u);\r
+ *(Ncv32u *)(&fdata[0]+dataOffset) = fsize;\r
+\r
+ FILE *fp;\r
+ fopen_s(&fp, filename.c_str(), "wb");\r
+ ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);\r
+ fwrite(&fdata[0], fsize, 1, fp);\r
+ fclose(fp);\r
+ return NCV_SUCCESS;\r
+}\r
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. \r
+// \r
+// By downloading, copying, installing or using the software you agree to this license.\r
+// If you do not agree to this license, do not download, install,\r
+// copy or use the software.\r
+//\r
+//\r
+// License Agreement\r
+// For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+// * Redistribution's of source code must retain the above copyright notice,\r
+// this list of conditions and the following disclaimer.\r
+//\r
+// * Redistribution's in binary form must reproduce the above copyright notice,\r
+// this list of conditions and the following disclaimer in the documentation\r
+// and/or other materials provided with the distribution.\r
+//\r
+// * The name of the copyright holders may not be used to endorse or promote products\r
+// derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// NVIDIA CUDA implementation of Viola-Jones Object Detection Framework\r
+//\r
+// The algorithm and code are explained in detail in the upcoming\r
+// GPU Computing Gems chapter:\r
+//\r
+// Anton Obukhov, "Haar Classifiers for Object Detection with CUDA"\r
+// PDF URL placeholder\r
+// email: aobukhov@nvidia.com, devsupport@nvidia.com\r
+//\r
+// Credits for help with the code to:\r
+// Alexey Mendelenko, Cyril Crassin, and Mikhail Smirnov.\r
+//\r
+////////////////////////////////////////////////////////////////////////////////\r
+\r
+#ifndef _ncvhaarobjectdetection_hpp_\r
+#define _ncvhaarobjectdetection_hpp_\r
+\r
+#include <string>\r
+#include "NCV.hpp"\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Guaranteed size cross-platform classifier structures\r
+//\r
+//==============================================================================\r
+\r
+\r
+struct HaarFeature64\r
+{\r
+ uint2 _ui2;\r
+\r
+#define HaarFeature64_CreateCheck_MaxRectField 0xFF\r
+\r
+ __host__ NCVStatus setRect(Ncv32u rectX, Ncv32u rectY, Ncv32u rectWidth, Ncv32u rectHeight, Ncv32u clsWidth, Ncv32u clsHeight)\r
+ {\r
+ ncvAssertReturn(rectWidth <= HaarFeature64_CreateCheck_MaxRectField && rectHeight <= HaarFeature64_CreateCheck_MaxRectField, NCV_HAAR_TOO_LARGE_FEATURES);\r
+ ((NcvRect8u*)&(this->_ui2.x))->x = rectX;\r
+ ((NcvRect8u*)&(this->_ui2.x))->y = rectY;\r
+ ((NcvRect8u*)&(this->_ui2.x))->width = rectWidth;\r
+ ((NcvRect8u*)&(this->_ui2.x))->height = rectHeight;\r
+ return NCV_SUCCESS;\r
+ }\r
+\r
+ __host__ NCVStatus setWeight(Ncv32f weight)\r
+ {\r
+ ((Ncv32f*)&(this->_ui2.y))[0] = weight;\r
+ return NCV_SUCCESS;\r
+ }\r
+\r
+ __device__ __host__ void getRect(Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)\r
+ {\r
+ NcvRect8u tmpRect = *(NcvRect8u*)(&this->_ui2.x);\r
+ *rectX = tmpRect.x;\r
+ *rectY = tmpRect.y;\r
+ *rectWidth = tmpRect.width;\r
+ *rectHeight = tmpRect.height;\r
+ }\r
+\r
+ __device__ __host__ Ncv32f getWeight(void)\r
+ {\r
+ return *(Ncv32f*)(&this->_ui2.y);\r
+ }\r
+};\r
+\r
+\r
+struct HaarFeatureDescriptor32\r
+{\r
+private:\r
+\r
+#define HaarFeatureDescriptor32_Interpret_MaskFlagTilted 0x80000000\r
+#define HaarFeatureDescriptor32_CreateCheck_MaxNumFeatures 0x7F\r
+#define HaarFeatureDescriptor32_NumFeatures_Shift 24\r
+#define HaarFeatureDescriptor32_CreateCheck_MaxFeatureOffset 0x00FFFFFF\r
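+\r
+ //Layout of 'desc', derived from the masks above:\r
+ // bit 31 - tilted feature flag\r
+ // bits 30..24 - number of features in the classifier (max 0x7F)\r
+ // bits 23..0 - offset of the first feature (max 0x00FFFFFF)\r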
+\r
+ Ncv32u desc;\r
+\r
+public:\r
+\r
+ __host__ NCVStatus create(NcvBool bTilted, Ncv32u numFeatures, Ncv32u offsetFeatures)\r
+ {\r
+ if (numFeatures > HaarFeatureDescriptor32_CreateCheck_MaxNumFeatures)\r
+ {\r
+ return NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER;\r
+ }\r
+ if (offsetFeatures > HaarFeatureDescriptor32_CreateCheck_MaxFeatureOffset)\r
+ {\r
+ return NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE;\r
+ }\r
+ this->desc = 0;\r
+ this->desc |= (bTilted ? HaarFeatureDescriptor32_Interpret_MaskFlagTilted : 0);\r
+ this->desc |= (numFeatures << HaarFeatureDescriptor32_NumFeatures_Shift);\r
+ this->desc |= offsetFeatures;\r
+ return NCV_SUCCESS;\r
+ }\r
+\r
+ __device__ __host__ NcvBool isTilted(void)\r
+ {\r
+ return (this->desc & HaarFeatureDescriptor32_Interpret_MaskFlagTilted) != 0;\r
+ }\r
+\r
+ __device__ __host__ Ncv32u getNumFeatures(void)\r
+ {\r
+ return (this->desc & ~HaarFeatureDescriptor32_Interpret_MaskFlagTilted) >> HaarFeatureDescriptor32_NumFeatures_Shift;\r
+ }\r
+\r
+ __device__ __host__ Ncv32u getFeaturesOffset(void)\r
+ {\r
+ return this->desc & HaarFeatureDescriptor32_CreateCheck_MaxFeatureOffset;\r
+ }\r
+};\r
+\r
+\r
+struct HaarClassifierNodeDescriptor32\r
+{\r
+ uint1 _ui1;\r
+\r
+#define HaarClassifierNodeDescriptor32_Interpret_MaskSwitch (1 << 30)\r
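+\r
+ //Interpretation of _ui1.x, per the mask above: with bit 30 clear the word\r
+ //holds a leaf value (Ncv32f bits); with bit 30 set the remaining bits hold\r
+ //the offset of the next classifier node (see the create() overloads below)\r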
+\r
+ __host__ NCVStatus create(Ncv32f leafValue)\r
+ {\r
+ if ((*(Ncv32u *)&leafValue) & HaarClassifierNodeDescriptor32_Interpret_MaskSwitch)\r
+ {\r
+ return NCV_HAAR_XML_LOADING_EXCEPTION;\r
+ }\r
+ *(Ncv32f *)&this->_ui1 = leafValue;\r
+ return NCV_SUCCESS;\r
+ }\r
+\r
+ __host__ NCVStatus create(Ncv32u offsetHaarClassifierNode)\r
+ {\r
+ if (offsetHaarClassifierNode >= HaarClassifierNodeDescriptor32_Interpret_MaskSwitch)\r
+ {\r
+ return NCV_HAAR_XML_LOADING_EXCEPTION;\r
+ }\r
+ this->_ui1.x = offsetHaarClassifierNode;\r
+ this->_ui1.x |= HaarClassifierNodeDescriptor32_Interpret_MaskSwitch;\r
+ return NCV_SUCCESS;\r
+ }\r
+\r
+ __device__ __host__ NcvBool isLeaf(void)\r
+ {\r
+ return !(this->_ui1.x & HaarClassifierNodeDescriptor32_Interpret_MaskSwitch);\r
+ }\r
+\r
+ __host__ Ncv32f getLeafValueHost(void)\r
+ {\r
+ return *(Ncv32f *)&this->_ui1.x;\r
+ }\r
+\r
+#ifdef __CUDACC__\r
+ __device__ Ncv32f getLeafValue(void)\r
+ {\r
+ return __int_as_float(this->_ui1.x);\r
+ }\r
+#endif\r
+\r
+ __device__ __host__ Ncv32u getNextNodeOffset(void)\r
+ {\r
+ return (this->_ui1.x & ~HaarClassifierNodeDescriptor32_Interpret_MaskSwitch);\r
+ }\r
+};\r
+\r
+\r
+struct HaarClassifierNode128\r
+{\r
+ uint4 _ui4;\r
+\r
+ __host__ NCVStatus setFeatureDesc(HaarFeatureDescriptor32 f)\r
+ {\r
+ this->_ui4.x = *(Ncv32u *)&f;\r
+ return NCV_SUCCESS;\r
+ }\r
+\r
+ __host__ NCVStatus setThreshold(Ncv32f t)\r
+ {\r
+ this->_ui4.y = *(Ncv32u *)&t;\r
+ return NCV_SUCCESS;\r
+ }\r
+\r
+ __host__ NCVStatus setLeftNodeDesc(HaarClassifierNodeDescriptor32 nl)\r
+ {\r
+ this->_ui4.z = *(Ncv32u *)&nl;\r
+ return NCV_SUCCESS;\r
+ }\r
+\r
+ __host__ NCVStatus setRightNodeDesc(HaarClassifierNodeDescriptor32 nr)\r
+ {\r
+ this->_ui4.w = *(Ncv32u *)&nr;\r
+ return NCV_SUCCESS;\r
+ }\r
+\r
+ __host__ __device__ HaarFeatureDescriptor32 getFeatureDesc(void)\r
+ {\r
+ return *(HaarFeatureDescriptor32 *)&this->_ui4.x;\r
+ }\r
+\r
+ __host__ __device__ Ncv32f getThreshold(void)\r
+ {\r
+ return *(Ncv32f*)&this->_ui4.y;\r
+ }\r
+\r
+ __host__ __device__ HaarClassifierNodeDescriptor32 getLeftNodeDesc(void)\r
+ {\r
+ return *(HaarClassifierNodeDescriptor32 *)&this->_ui4.z;\r
+ }\r
+\r
+ __host__ __device__ HaarClassifierNodeDescriptor32 getRightNodeDesc(void)\r
+ {\r
+ return *(HaarClassifierNodeDescriptor32 *)&this->_ui4.w;\r
+ }\r
+};\r
+\r
+\r
+struct HaarStage64\r
+{\r
+#define HaarStage64_Interpret_MaskRootNodes 0x0000FFFF\r
+#define HaarStage64_Interpret_MaskRootNodeOffset 0xFFFF0000\r
+#define HaarStage64_Interpret_ShiftRootNodeOffset 16\r
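+\r
+ //Layout, per the masks above: _ui2.x holds the stage threshold (Ncv32f bits);\r
+ //_ui2.y packs the number of classifier root nodes (low 16 bits) and the\r
+ //offset of the first root node (high 16 bits)\r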
+\r
+ uint2 _ui2;\r
+\r
+ __host__ NCVStatus setStageThreshold(Ncv32f t)\r
+ {\r
+ this->_ui2.x = *(Ncv32u *)&t;\r
+ return NCV_SUCCESS;\r
+ }\r
+\r
+ __host__ NCVStatus setStartClassifierRootNodeOffset(Ncv32u val)\r
+ {\r
+ if (val > (HaarStage64_Interpret_MaskRootNodeOffset >> HaarStage64_Interpret_ShiftRootNodeOffset))\r
+ {\r
+ return NCV_HAAR_XML_LOADING_EXCEPTION;\r
+ }\r
+ this->_ui2.y = (val << HaarStage64_Interpret_ShiftRootNodeOffset) | (this->_ui2.y & HaarStage64_Interpret_MaskRootNodes);\r
+ return NCV_SUCCESS;\r
+ }\r
+\r
+ __host__ NCVStatus setNumClassifierRootNodes(Ncv32u val)\r
+ {\r
+ if (val > HaarStage64_Interpret_MaskRootNodes)\r
+ {\r
+ return NCV_HAAR_XML_LOADING_EXCEPTION;\r
+ }\r
+ this->_ui2.y = val | (this->_ui2.y & HaarStage64_Interpret_MaskRootNodeOffset);\r
+ return NCV_SUCCESS;\r
+ }\r
+\r
+ __host__ __device__ Ncv32f getStageThreshold(void)\r
+ {\r
+ return *(Ncv32f*)&this->_ui2.x;\r
+ }\r
+\r
+ __host__ __device__ Ncv32u getStartClassifierRootNodeOffset(void)\r
+ {\r
+ return (this->_ui2.y >> HaarStage64_Interpret_ShiftRootNodeOffset);\r
+ }\r
+\r
+ __host__ __device__ Ncv32u getNumClassifierRootNodes(void)\r
+ {\r
+ return (this->_ui2.y & HaarStage64_Interpret_MaskRootNodes);\r
+ }\r
+};\r
+\r
+\r
+NPPST_CT_ASSERT(sizeof(HaarFeature64) == 8);\r
+NPPST_CT_ASSERT(sizeof(HaarFeatureDescriptor32) == 4);\r
+NPPST_CT_ASSERT(sizeof(HaarClassifierNodeDescriptor32) == 4);\r
+NPPST_CT_ASSERT(sizeof(HaarClassifierNode128) == 16);\r
+NPPST_CT_ASSERT(sizeof(HaarStage64) == 8);\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Classifier cascade descriptor\r
+//\r
+//==============================================================================\r
+\r
+\r
+struct HaarClassifierCascadeDescriptor\r
+{\r
+ Ncv32u NumStages;\r
+ Ncv32u NumClassifierRootNodes;\r
+ Ncv32u NumClassifierTotalNodes;\r
+ Ncv32u NumFeatures;\r
+ NcvSize32u ClassifierSize;\r
+ NcvBool bNeedsTiltedII;\r
+ NcvBool bHasStumpsOnly;\r
+};\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Functional interface\r
+//\r
+//==============================================================================\r
+\r
+\r
+enum\r
+{\r
+ NCVPipeObjDet_Default = 0x000,\r
+ NCVPipeObjDet_UseFairImageScaling = 0x001,\r
+ NCVPipeObjDet_FindLargestObject = 0x002,\r
+ NCVPipeObjDet_VisualizeInPlace = 0x004,\r
+};\r
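+\r
+//The flags above combine bitwise; an illustrative (not prescriptive) example:\r
+// Ncv32u flags = NCVPipeObjDet_FindLargestObject | NCVPipeObjDet_VisualizeInPlace;\r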
+\r
+\r
+NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,\r
+ NcvSize32u srcRoi,\r
+ NCVVector<NcvRect32u> &d_dstRects,\r
+ Ncv32u &dstNumRects,\r
+\r
+ HaarClassifierCascadeDescriptor &haar,\r
+ NCVVector<HaarStage64> &h_HaarStages,\r
+ NCVVector<HaarStage64> &d_HaarStages,\r
+ NCVVector<HaarClassifierNode128> &d_HaarNodes,\r
+ NCVVector<HaarFeature64> &d_HaarFeatures,\r
+\r
+ NcvSize32u minObjSize,\r
+ Ncv32u minNeighbors, //default 4\r
+ Ncv32f scaleStep, //default 1.2f\r
+ Ncv32u pixelStep, //default 1\r
+ Ncv32u flags, //default NCVPipeObjDet_Default\r
+\r
+ INCVMemAllocator &gpuAllocator,\r
+ INCVMemAllocator &cpuAllocator,\r
+ Ncv32u devPropMajor,\r
+ Ncv32u devPropMinor,\r
+ cudaStream_t cuStream);\r
+\r
+\r
+#define OBJDET_MASK_ELEMENT_INVALID_32U 0xFFFFFFFF\r
+#define HAAR_STDDEV_BORDER 1\r
+\r
+\r
+NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,\r
+ NCVMatrix<Ncv32f> &d_weights,\r
+ NCVMatrixAlloc<Ncv32u> &d_pixelMask,\r
+ Ncv32u &numDetections,\r
+ HaarClassifierCascadeDescriptor &haar,\r
+ NCVVector<HaarStage64> &h_HaarStages,\r
+ NCVVector<HaarStage64> &d_HaarStages,\r
+ NCVVector<HaarClassifierNode128> &d_HaarNodes,\r
+ NCVVector<HaarFeature64> &d_HaarFeatures,\r
+ NcvBool bMaskElements,\r
+ NcvSize32u anchorsRoi,\r
+ Ncv32u pixelStep,\r
+ Ncv32f scaleArea,\r
+ INCVMemAllocator &gpuAllocator,\r
+ INCVMemAllocator &cpuAllocator,\r
+ Ncv32u devPropMajor,\r
+ Ncv32u devPropMinor,\r
+ cudaStream_t cuStream);\r
+\r
+\r
+NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,\r
+ NCVMatrix<Ncv32f> &h_weights,\r
+ NCVMatrixAlloc<Ncv32u> &h_pixelMask,\r
+ Ncv32u &numDetections,\r
+ HaarClassifierCascadeDescriptor &haar,\r
+ NCVVector<HaarStage64> &h_HaarStages,\r
+ NCVVector<HaarClassifierNode128> &h_HaarNodes,\r
+ NCVVector<HaarFeature64> &h_HaarFeatures,\r
+ NcvBool bMaskElements,\r
+ NcvSize32u anchorsRoi,\r
+ Ncv32u pixelStep,\r
+ Ncv32f scaleArea);\r
+\r
+\r
+NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,\r
+ Ncv32u dstStride,\r
+ Ncv32u dstWidth,\r
+ Ncv32u dstHeight,\r
+ NcvRect32u *d_rects,\r
+ Ncv32u numRects,\r
+ Ncv8u color,\r
+ cudaStream_t cuStream);\r
+\r
+\r
+NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,\r
+ Ncv32u dstStride,\r
+ Ncv32u dstWidth,\r
+ Ncv32u dstHeight,\r
+ NcvRect32u *d_rects,\r
+ Ncv32u numRects,\r
+ Ncv32u color,\r
+ cudaStream_t cuStream);\r
+\r
+\r
+NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,\r
+ Ncv32u dstStride,\r
+ Ncv32u dstWidth,\r
+ Ncv32u dstHeight,\r
+ NcvRect32u *h_rects,\r
+ Ncv32u numRects,\r
+ Ncv8u color);\r
+\r
+\r
+NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,\r
+ Ncv32u dstStride,\r
+ Ncv32u dstWidth,\r
+ Ncv32u dstHeight,\r
+ NcvRect32u *h_rects,\r
+ Ncv32u numRects,\r
+ Ncv32u color);\r
+\r
+\r
+#define RECT_SIMILARITY_PROPORTION 0.2f\r
+\r
+\r
+NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,\r
+ Ncv32u numPixelMaskDetections,\r
+ NCVVector<NcvRect32u> &hypotheses,\r
+ Ncv32u &totalDetections,\r
+ Ncv32u totalMaxDetections,\r
+ Ncv32u rectWidth,\r
+ Ncv32u rectHeight,\r
+ Ncv32f curScale,\r
+ cudaStream_t cuStream);\r
+\r
+\r
+NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,\r
+ Ncv32u numPixelMaskDetections,\r
+ NCVVector<NcvRect32u> &hypotheses,\r
+ Ncv32u &totalDetections,\r
+ Ncv32u totalMaxDetections,\r
+ Ncv32u rectWidth,\r
+ Ncv32u rectHeight,\r
+ Ncv32f curScale);\r
+\r
+\r
+NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,\r
+ Ncv32u &numHypotheses,\r
+ Ncv32u minNeighbors,\r
+ Ncv32f intersectEps,\r
+ NCVVector<Ncv32u> *hypothesesWeights);\r
+\r
+\r
+NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,\r
+ Ncv32u &numNodes, Ncv32u &numFeatures);\r
+\r
+\r
+NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,\r
+ HaarClassifierCascadeDescriptor &haar,\r
+ NCVVector<HaarStage64> &h_HaarStages,\r
+ NCVVector<HaarClassifierNode128> &h_HaarNodes,\r
+ NCVVector<HaarFeature64> &h_HaarFeatures);\r
+\r
+\r
+NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,\r
+ HaarClassifierCascadeDescriptor haar,\r
+ NCVVector<HaarStage64> &h_HaarStages,\r
+ NCVVector<HaarClassifierNode128> &h_HaarNodes,\r
+ NCVVector<HaarFeature64> &h_HaarFeatures);\r
+\r
+\r
+\r
+#endif // _ncvhaarobjectdetection_hpp_\r
--- /dev/null
+////////////////////////////////////////////////////////////////////////////////\r
+// The Loki Library\r
+// Copyright (c) 2001 by Andrei Alexandrescu\r
+// This code accompanies the book:\r
+// Alexandrescu, Andrei. "Modern C++ Design: Generic Programming and Design \r
+// Patterns Applied". Copyright (c) 2001. Addison-Wesley.\r
+// Permission to use, copy, modify, distribute and sell this software for any \r
+// purpose is hereby granted without fee, provided that the above copyright \r
+// notice appear in all copies and that both that copyright notice and this \r
+// permission notice appear in supporting documentation.\r
+// The author or Addison-Wesley Longman make no representations about the\r
+// suitability of this software for any purpose. It is provided "as is" \r
+// without express or implied warranty.\r
+// http://loki-lib.sourceforge.net/index.php?n=Main.License\r
+////////////////////////////////////////////////////////////////////////////////\r
+\r
+#ifndef _ncvruntimetemplates_hpp_\r
+#define _ncvruntimetemplates_hpp_\r
+\r
+#include <stdarg.h>\r
+#include <vector>\r
+\r
+\r
+namespace Loki\r
+{\r
+ //==============================================================================\r
+ // class NullType\r
+ // Used as a placeholder for "no type here"\r
+ // Useful as an end marker in typelists \r
+ //==============================================================================\r
+\r
+ class NullType {};\r
+\r
+ //==============================================================================\r
+ // class template Typelist\r
+ // The building block of typelists of any length\r
+ // Use it through the LOKI_TYPELIST_NN macros\r
+ // Defines nested types:\r
+ // Head (first element, a non-typelist type by convention)\r
+ // Tail (second element, can be another typelist)\r
+ //==============================================================================\r
+\r
+ template <class T, class U>\r
+ struct Typelist\r
+ {\r
+ typedef T Head;\r
+ typedef U Tail;\r
+ };\r
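+\r
+ //Example (illustrative): a two-element typelist terminated by NullType:\r
+ // typedef Typelist<char, Typelist<int, NullType> > CharIntList;\r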
+\r
+ //==============================================================================\r
+ // class template Int2Type\r
+ // Converts each integral constant into a unique type\r
+ // Invocation: Int2Type<v> where v is a compile-time constant integral\r
+ // Defines 'value', an enum that evaluates to v\r
+ //==============================================================================\r
+\r
+ template <int v>\r
+ struct Int2Type\r
+ {\r
+ enum { value = v };\r
+ };\r
+\r
+ namespace TL\r
+ {\r
+ //==============================================================================\r
+ // class template TypeAt\r
+ // Finds the type at a given index in a typelist\r
+ // Invocation (TList is a typelist and index is a compile-time integral \r
+ // constant):\r
+ // TypeAt<TList, index>::Result\r
+ // returns the type in position 'index' in TList\r
+ // If you pass an out-of-bounds index, the result is a compile-time error\r
+ //==============================================================================\r
+\r
+ template <class TList, unsigned int index> struct TypeAt;\r
+\r
+ template <class Head, class Tail>\r
+ struct TypeAt<Typelist<Head, Tail>, 0>\r
+ {\r
+ typedef Head Result;\r
+ };\r
+\r
+ template <class Head, class Tail, unsigned int i>\r
+ struct TypeAt<Typelist<Head, Tail>, i>\r
+ {\r
+ typedef typename TypeAt<Tail, i - 1>::Result Result;\r
+ };\r
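+\r
+ //Example (illustrative), reusing the CharIntList sketched above:\r
+ // TypeAt<CharIntList, 0>::Result is char; TypeAt<CharIntList, 1>::Result is int\r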
+ }\r
+}\r
+\r
+\r
+////////////////////////////////////////////////////////////////////////////////\r
+// Runtime boolean template instance dispatcher\r
+// Cyril Crassin <cyril.crassin@icare3d.org>\r
+// NVIDIA, 2010\r
+////////////////////////////////////////////////////////////////////////////////\r
+\r
+namespace NCVRuntimeTemplateBool\r
+{\r
+ //This struct is used to transform a list of parameters into template arguments\r
+ //The idea is to build a typelist containing the arguments\r
+ //and to pass this typelist to a user defined functor\r
+ template<typename TList, int NumArguments, class Func>\r
+ struct KernelCaller\r
+ {\r
+ //Convenience function used by the user\r
+ //Takes a variable argument list of NumArguments ints and collects them into a vector\r
+ static void call(Func &functor, int dummy, ...)\r
+ {\r
+ //Vector used to collect arguments\r
+ std::vector<int> templateParamList;\r
+\r
+ //Variable argument list manipulation\r
+ va_list listPointer;\r
+ va_start(listPointer, dummy);\r
+ //Collect parameters into the list\r
+ for(int i=0; i<NumArguments; i++)\r
+ {\r
+ int val = va_arg(listPointer, int);\r
+ templateParamList.push_back(val);\r
+ }\r
+ va_end(listPointer);\r
+\r
+ //Call the actual typelist building function\r
+ call(functor, templateParamList);\r
+ }\r
+\r
+ //Actual function called recursively to build a typelist based\r
+ //on a list of values\r
+ static void call( Func &functor, std::vector<int> &templateParamList)\r
+ {\r
+ //Get current parameter value in the list\r
+ int val = templateParamList[templateParamList.size() - 1];\r
+ templateParamList.pop_back();\r
+\r
+ //Select the compile time value to add into the typelist\r
+ //depending on the runtime variable and make recursive call. \r
+ //Both versions are really instantiated\r
+ if(val)\r
+ {\r
+ KernelCaller<\r
+ Loki::Typelist<Loki::Int2Type<true>, TList >,\r
+ NumArguments-1, Func >\r
+ ::call(functor, templateParamList);\r
+ }\r
+ else\r
+ {\r
+ KernelCaller< \r
+ Loki::Typelist<Loki::Int2Type<false>, TList >,\r
+ NumArguments-1, Func >\r
+ ::call(functor, templateParamList);\r
+ }\r
+ }\r
+ };\r
+\r
+ //Specialization for 0 value left in the list\r
+ //-> actual kernel functor call\r
+ template<class TList, class Func>\r
+ struct KernelCaller<TList, 0, Func>\r
+ {\r
+ static void call(Func &functor)\r
+ {\r
+ //Call to the functor's kernel call method\r
+ functor.call(TList()); //TList instantiated to get the method template parameter resolved\r
+ }\r
+\r
+ static void call(Func &functor, std::vector<int> &templateParams)\r
+ {\r
+ functor.call(TList());\r
+ }\r
+ };\r
+}\r
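+\r
+//Usage sketch (illustrative only; ApplyCascadeFunctor and the flags are\r
+//hypothetical). A functor exposes a member template call(TList); KernelCaller\r
+//converts the trailing runtime ints into compile-time booleans on the typelist:\r
+//\r
+// struct ApplyCascadeFunctor\r
+// {\r
+// template<class TList>\r
+// void call(TList)\r
+// {\r
+// //Loki::TL::TypeAt<TList, 0>::Result::value is the first flag passed below\r
+// }\r
+// };\r
+//\r
+// ApplyCascadeFunctor functor;\r
+// NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, ApplyCascadeFunctor>\r
+// ::call(functor, 0, flag0, flag1);\r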
+\r
+#endif //_ncvruntimetemplates_hpp_\r
#include "npp_staging.h"\r
#include "surf_key_point.h"\r
\r
+ #include "nvidia/NCV.hpp"\r
+ #include "nvidia/NCVHaarObjectDetection.hpp"\r
+\r
#define CUDART_MINIMUM_REQUIRED_VERSION 3020\r
#define NPP_MINIMUM_REQUIRED_VERSION 3216\r
\r
--- /dev/null
+// WARNING: this sample is under construction! Use it at your own risk.\r
+\r
+#include <opencv2/contrib/contrib.hpp>\r
+#include <opencv2/objdetect/objdetect.hpp>\r
+#include <opencv2/highgui/highgui.hpp>\r
+#include <opencv2/imgproc/imgproc.hpp>\r
+#include <opencv2/gpu/gpu.hpp>\r
+\r
+#include <iostream>\r
+#include <iomanip>\r
+#include <stdio.h>\r
+\r
+using namespace std;\r
+using namespace cv;\r
+using namespace cv::gpu;\r
+\r
+void help()\r
+{\r
+ cout << "Usage: ./cascadeclassifier <cascade_file> <image_or_video_or_cameraid>\n" \r
+ "Using OpenCV version " << CV_VERSION << endl << endl;\r
+}\r
+\r
+void DetectAndDraw(Mat& img, CascadeClassifier_GPU& cascade);\r
+\r
+String cascadeName = "../../data/haarcascades/haarcascade_frontalface_alt.xml";\r
+String nestedCascadeName = "../../data/haarcascades/haarcascade_eye_tree_eyeglasses.xml";\r
+\r
+\r
+\r
+template<class T> void convertAndResize(const T& src, T& gray, T& resized, double scale = 2.0)\r
+{\r
+ if (src.channels() == 3)\r
+ cvtColor( src, gray, CV_BGR2GRAY );\r
+ else\r
+ gray = src;\r
+\r
+ Size sz(cvRound(gray.cols * scale), cvRound(gray.rows * scale));\r
+ if (scale != 1)\r
+ resize(gray, resized, sz);\r
+ else\r
+ resized = gray;\r
+}\r
+\r
+\r
+\r
+int main( int argc, const char** argv )\r
+{ \r
+ if (argc != 3)\r
+ return help(), -1;\r
+\r
+ if (cv::gpu::getCudaEnabledDeviceCount() == 0)\r
+ return cerr << "No GPU found or the library is compiled without GPU support" << endl, -1;\r
+\r
+ VideoCapture capture;\r
+ \r
+ string cascadeName = argv[1];\r
+ string inputName = argv[2];\r
+\r
+ cv::gpu::CascadeClassifier_GPU cascade_gpu;\r
+ if( !cascade_gpu.load( cascadeName ) )\r
+ return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1;\r
+\r
+ cv::CascadeClassifier cascade_cpu;\r
+ if( !cascade_cpu.load( cascadeName ) )\r
+ return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1;\r
+ \r
+ Mat image = imread( inputName);\r
+ if( image.empty() )\r
+ if (!capture.open(inputName))\r
+ {\r
+ int camid = 0;\r
+ sscanf(inputName.c_str(), "%d", &camid);\r
+ if(!capture.open(camid))\r
+ cout << "Can't open source" << endl;\r
+ }\r
+ \r
+ namedWindow( "result", 1 ); \r
+ Size fontSz = cv::getTextSize("T[]", FONT_HERSHEY_SIMPLEX, 1.0, 2, 0);\r
+\r
+ Mat frame, frame_cpu, gray_cpu, resized_cpu, faces_downloaded, frameDisp;\r
+ vector<Rect> facesBuf_cpu;\r
+\r
+ GpuMat frame_gpu, gray_gpu, resized_gpu, facesBuf_gpu; \r
+ \r
+ /* parameters */\r
+ bool useGPU = true;\r
+ double scale_factor = 2;\r
+\r
+ bool visualizeInPlace = false; \r
+ bool findLargestObject = false; \r
+\r
+ printf("\t<space> - toggle GPU/CPU\n");\r
+ printf("\tL - toggle lagest faces\n");\r
+ printf("\tV - toggle visualisation in-place (for GPU only)\n");\r
+ printf("\t1/q - inc/dec scale\n");\r
+ \r
+ int detections_num;\r
+ for(;;)\r
+ { \r
+ if( capture.isOpened() )\r
+ {\r
+ capture >> frame; \r
+ if( frame.empty())\r
+ break;\r
+ }\r
+\r
+ (image.empty() ? frame : image).copyTo(frame_cpu);\r
+ frame_gpu.upload( image.empty() ? frame : image);\r
+ \r
+ convertAndResize(frame_gpu, gray_gpu, resized_gpu, scale_factor);\r
+ convertAndResize(frame_cpu, gray_cpu, resized_cpu, scale_factor);\r
+\r
+ cv::TickMeter tm;\r
+ tm.start(); \r
+\r
+ if (useGPU)\r
+ {\r
+ cascade_gpu.visualizeInPlace = visualizeInPlace; \r
+ cascade_gpu.findLargestObject = findLargestObject; \r
+\r
+ detections_num = cascade_gpu.detectMultiScale( resized_gpu, facesBuf_gpu ); \r
+ facesBuf_gpu.colRange(0, detections_num).download(faces_downloaded);\r
+ \r
+ }\r
+ else /* so use CPU */\r
+ { \r
+ Size minSize = cascade_gpu.getClassifierSize();\r
+ if (findLargestObject)\r
+ { \r
+ float ratio = (float)std::min(frame.cols / minSize.width, frame.rows / minSize.height);\r
+ ratio = std::max(ratio / 2.5f, 1.f);\r
+ minSize = Size(cvRound(minSize.width * ratio), cvRound(minSize.height * ratio)); \r
+ }\r
+ \r
+ cascade_cpu.detectMultiScale(resized_cpu, facesBuf_cpu, 1.2, 4, (findLargestObject ? CV_HAAR_FIND_BIGGEST_OBJECT : 0) | CV_HAAR_SCALE_IMAGE, minSize); \r
+ detections_num = (int)facesBuf_cpu.size();\r
+ }\r
+\r
+ tm.stop();\r
+ printf( "detection time = %g ms\n", tm.getTimeMilli() );\r
+\r
+ if (useGPU)\r
+ resized_gpu.download(resized_cpu);\r
+\r
+ if (!visualizeInPlace || !useGPU)\r
+ if (detections_num)\r
+ {\r
+ Rect* faces = useGPU ? faces_downloaded.ptr<Rect>() : &facesBuf_cpu[0]; \r
+ for(int i = 0; i < detections_num; ++i) \r
+ cv::rectangle(resized_cpu, faces[i], Scalar(255)); \r
+ }\r
+ \r
+ Point text_pos(5, 25);\r
+ int offs = fontSz.height + 5;\r
+ Scalar color = CV_RGB(255, 0, 0);\r
+\r
+\r
+ cv::cvtColor(resized_cpu, frameDisp, CV_GRAY2BGR);\r
+\r
+ char buf[4096];\r
+ sprintf(buf, "%s, FPS = %0.3g", useGPU ? "GPU" : "CPU", 1.0/tm.getTimeSec()); \r
+ putText(frameDisp, buf, text_pos, FONT_HERSHEY_SIMPLEX, 1.0, color, 2);\r
+ sprintf(buf, "scale = %0.3g, [%d*scale x %d*scale]", scale_factor, frame.cols, frame.rows); \r
+ putText(frameDisp, buf, text_pos+=Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2);\r
+ putText(frameDisp, "Hotkeys: space, 1, Q, L, V, Esc", text_pos+=Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2);\r
+\r
+ if (findLargestObject)\r
+ putText(frameDisp, "FindLargestObject", text_pos+=Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2);\r
+\r
+ if (visualizeInPlace && useGPU)\r
+ putText(frameDisp, "VisualizeInPlace", text_pos+Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2);\r
+\r
+ cv::imshow( "result", frameDisp);\r
+\r
+ int key = waitKey( 5 );\r
+ if( key == 27)\r
+ break;\r
+\r
+ switch (key)\r
+ {\r
+ case (int)' ': useGPU = !useGPU; printf("Using %s\n", useGPU ? "GPU" : "CPU");break;\r
+ case (int)'v': case (int)'V': visualizeInPlace = !visualizeInPlace; printf("VisualizeInPlace = %d\n", visualizeInPlace); break;\r
+ case (int)'l': case (int)'L': findLargestObject = !findLargestObject; printf("FindLargestObject = %d\n", findLargestObject); break;\r
+ case (int)'1': scale_factor*=1.05; printf("Scale factor = %g\n", scale_factor); break;\r
+ case (int)'q': case (int)'Q':scale_factor/=1.05; printf("Scale factor = %g\n", scale_factor); break;\r
+ }\r
+ \r
+ } \r
+ return 0;\r
+}\r
+\r
+\r
+\r