First version of CascadeClassifier_GPU.
author Anatoly Baksheev <no@email>
Thu, 13 Jan 2011 13:04:00 +0000 (13:04 +0000)
committer Anatoly Baksheev <no@email>
Thu, 13 Jan 2011 13:04:00 +0000 (13:04 +0000)
Currently supported only with VS2008.
Added a sample demonstrating it.
New NPP_staging library (VS2008 only).
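
A minimal usage sketch of the new API, based on the CascadeClassifier_GPU declaration added to gpu.hpp in this commit (the full example lives in samples/gpu/cascadeclassifier.cpp). The cascade and image file names are placeholders and error handling is omitted:

    #include <opencv2/opencv.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        // load the Haar cascade onto the GPU (file name is a placeholder)
        cv::gpu::CascadeClassifier_GPU cascade("haarcascade_frontalface_alt.xml");

        cv::Mat frame = cv::imread("frame.png", 0);   // 8-bit grayscale input
        cv::gpu::GpuMat d_frame(frame), d_objects;    // upload to the device

        // detectMultiScale returns the number of detections; the rectangles
        // stay in d_objects (a single row of cv::Rect) on the device
        int n = cascade.detectMultiScale(d_frame, d_objects, 1.2, 4);

        cv::Mat objects_host;
        d_objects.download(objects_host);             // bring the result buffer back to the host
        const cv::Rect* rects = objects_host.ptr<cv::Rect>();
        for (int i = 0; i < n; ++i)
            cv::rectangle(frame, rects[i], cv::Scalar(255));

        cv::imwrite("result.png", frame);
        return 0;
    }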

17 files changed:
3rdparty/NPP_staging/NPP_staging_static_Windows_32_v1.lib
3rdparty/NPP_staging/npp_staging.h
modules/gpu/CMakeLists.txt
modules/gpu/include/opencv2/gpu/gpu.hpp
modules/gpu/src/arithm.cpp
modules/gpu/src/cascadeclassifier.cpp
modules/gpu/src/cuda/internal_shared.hpp
modules/gpu/src/cuda/surf_key_point.h
modules/gpu/src/element_operations.cpp
modules/gpu/src/nvidia/FaceDetectionFeed.cpp_NvidiaAPI_sample [new file with mode: 0644]
modules/gpu/src/nvidia/NCV.cpp [new file with mode: 0644]
modules/gpu/src/nvidia/NCV.hpp [new file with mode: 0644]
modules/gpu/src/nvidia/NCVHaarObjectDetection.cu [new file with mode: 0644]
modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp [new file with mode: 0644]
modules/gpu/src/nvidia/NCVRuntimeTemplates.hpp [new file with mode: 0644]
modules/gpu/src/precomp.hpp
samples/gpu/cascadeclassifier.cpp [new file with mode: 0644]

index 39d29b8..98d5c22 100644 (file)
Binary files a/3rdparty/NPP_staging/NPP_staging_static_Windows_32_v1.lib and b/3rdparty/NPP_staging/NPP_staging_static_Windows_32_v1.lib differ
index c54af5c..a9c6607 100644 (file)
@@ -188,14 +188,14 @@ struct NppStSize32u
 enum NppStStatus\r
 {\r
     //already present in NPP\r
/*   NPP_SUCCESS                      = 0,   ///< Successful operation (same as NPP_NO_ERROR)\r
-    NPP_ERROR                        = -1,  ///< Unknown error\r
-    NPP_CUDA_KERNEL_EXECUTION_ERROR  = -3,  ///< CUDA kernel execution error\r
-    NPP_NULL_POINTER_ERROR           = -4,  ///< NULL pointer argument error\r
-    NPP_TEXTURE_BIND_ERROR           = -24, ///< CUDA texture binding error or non-zero offset returned\r
-    NPP_MEMCPY_ERROR                 = -13, ///< CUDA memory copy error\r
-    NPP_MEM_ALLOC_ERR                = -12, ///< CUDA memory allocation error\r
-    NPP_MEMFREE_ERR                  = -15, ///< CUDA memory deallocation error*/\r
   //NPP_SUCCESS                      = 0,   ///< Successful operation (same as NPP_NO_ERROR)\r
+    //NPP_ERROR                        = -1,  ///< Unknown error\r
+    //NPP_CUDA_KERNEL_EXECUTION_ERROR  = -3,  ///< CUDA kernel execution error\r
+    //NPP_NULL_POINTER_ERROR           = -4,  ///< NULL pointer argument error\r
+    //NPP_TEXTURE_BIND_ERROR           = -24, ///< CUDA texture binding error or non-zero offset returned\r
+    //NPP_MEMCPY_ERROR                 = -13, ///< CUDA memory copy error\r
+    //NPP_MEM_ALLOC_ERR                = -12, ///< CUDA memory allocation error\r
+    //NPP_MEMFREE_ERR                  = -15, ///< CUDA memory deallocation error\r
 \r
     //to be added\r
     NPP_INVALID_ROI,                        ///< Invalid region of interest argument\r
@@ -244,7 +244,7 @@ extern "C" {
 \r
 /** \defgroup core_npp NPP Core\r
  * Basic functions for CUDA streams management.\r
- * WARNING: These functions couldn't be exported from NPP_staging library, so they can't be used\r
+ * WARNING: These functions cannot be exported from the NPP_staging DLL, so they can be used only with the static version of NPP_staging\r
  * @{\r
  */\r
 \r
@@ -570,6 +570,13 @@ NppStStatus nppiStIntegralGetSize_8u32u(NppStSize32u roiSize, NppSt32u *pBufsize
 \r
 \r
 /**\r
+ * Calculates the size of the temporary buffer for integral image creation\r
+ * \see nppiStIntegralGetSize_8u32u\r
+ */\r
+NppStStatus nppiStIntegralGetSize_32f32f(NppStSize32u roiSize, NppSt32u *pBufsize);\r
+\r
+\r
+/**\r
  * Creates an integral image representation for the input image\r
  *\r
  * \param d_src             [IN] Source image pointer (CUDA device memory)\r
@@ -588,6 +595,15 @@ NppStStatus nppiStIntegral_8u32u_C1R(NppSt8u *d_src, NppSt32u srcStep,
 \r
 \r
 /**\r
+ * Creates an integral image representation for the input image\r
+ * \see nppiStIntegral_8u32u_C1R\r
+ */\r
+NppStStatus nppiStIntegral_32f32f_C1R(NppSt32f *d_src, NppSt32u srcStep,\r
+                                      NppSt32f *d_dst, NppSt32u dstStep, NppStSize32u roiSize,\r
+                                      NppSt8u *pBuffer, NppSt32u bufSize);\r
+\r
+\r
+/**\r
  * Creates an integral image representation for the input image. Host implementation\r
  *\r
  * \param h_src             [IN] Source image pointer (Host or pinned memory)\r
@@ -603,6 +619,14 @@ NppStStatus nppiStIntegral_8u32u_C1R_host(NppSt8u *h_src, NppSt32u srcStep,
 \r
 \r
 /**\r
+ * Creates an integral image representation for the input image. Host implementation\r
+ * \see nppiStIntegral_8u32u_C1R_host\r
+ */\r
+NppStStatus nppiStIntegral_32f32f_C1R_host(NppSt32f *h_src, NppSt32u srcStep,\r
+                                           NppSt32f *h_dst, NppSt32u dstStep, NppStSize32u roiSize);\r
+\r
+\r
+/**\r
  * Calculates the size of the temporary buffer for squared integral image creation\r
  *\r
  * \param roiSize           [IN] Size of the input image\r
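
The new 32f integral entry points follow the same two-step pattern as the existing 8u32u variants: query the scratch buffer size, allocate it on the device, then run the kernel. A hedged sketch of that call sequence, assuming the caller has already allocated d_src/d_dst with suitable pitches and that NPP_SUCCESS from the core NPP headers is available for status checks:

    #include <cuda_runtime.h>
    #include "npp_staging.h"

    NppStStatus computeIntegral32f(NppSt32f* d_src, NppSt32u srcStep,
                                   NppSt32f* d_dst, NppSt32u dstStep,
                                   NppStSize32u roi)
    {
        NppSt32u bufSize = 0;
        NppStStatus st = nppiStIntegralGetSize_32f32f(roi, &bufSize);  // query scratch size
        if (st != NPP_SUCCESS)                                         // NPP_SUCCESS (== 0) comes from core NPP
            return st;

        NppSt8u* pBuffer = 0;
        cudaMalloc((void**)&pBuffer, bufSize);                         // temporary device buffer

        st = nppiStIntegral_32f32f_C1R(d_src, srcStep, d_dst, dstStep, roi, pBuffer, bufSize);

        cudaFree(pBuffer);
        return st;
    }
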
index 3c10b0e..f25036a 100644 (file)
@@ -35,6 +35,13 @@ source_group("Include" FILES ${lib_hdrs})
 file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*")
 source_group("Device" FILES ${lib_device_hdrs})
 
+if (HAVE_CUDA AND MSVC)
+    file(GLOB ncv_srcs "src/nvidia/*.cpp")
+       file(GLOB ncv_hdrs "src/nvidia/*.h*")
+       file(GLOB ncv_cuda "src/nvidia/*.cu")
+       source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
+endif()
+
 if (HAVE_CUDA)         
        get_filename_component(_path_to_findnpp "${CMAKE_CURRENT_LIST_FILE}" PATH)
        set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${_path_to_findnpp}) 
@@ -68,19 +75,16 @@ if (HAVE_CUDA)
                string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
                string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
        endif()
+   
 
-       CUDA_COMPILE(cuda_objs ${lib_cuda})
+    include(FindNPP_staging.cmake)
+    include_directories(${NPPST_INC})   
+    
+       CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda})
        #CUDA_BUILD_CLEAN_TARGET()
 endif()
 
-
-add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${cuda_objs})
-
-IF (HAVE_CUDA)
-    include(FindNPP_staging.cmake)    
-    include_directories(${NPPST_INC})       
-    target_link_libraries(${the_target} ${NPPST_LIB})
-endif()
+add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
 
 if(PCHSupport_FOUND)
        set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp)
@@ -114,6 +118,7 @@ target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} )
 
 if (HAVE_CUDA)
        target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES})
+    target_link_libraries(${the_target} ${NPPST_LIB})
     CUDA_ADD_CUFFT_TO_TARGET(${the_target})
 endif()
 
index 70a34ee..3979d62 100644 (file)
@@ -1380,87 +1380,39 @@ namespace cv
             explicit BruteForceMatcher_GPU(L2<T> /*d*/) : BruteForceMatcher_GPU_base(L2Dist) {}\r
         };\r
 \r
-        ////////////////////////////////// CascadeClassifier //////////////////////////////////////////\r
+        ////////////////////////////////// CascadeClassifier_GPU //////////////////////////////////////////\r
         // The cascade classifier class for object detection.\r
-        class CV_EXPORTS CascadeClassifier\r
+        class CV_EXPORTS CascadeClassifier_GPU\r
         {\r
-        public:\r
-            struct CV_EXPORTS DTreeNode\r
-            {\r
-                int featureIdx;\r
-                float threshold; // for ordered features only\r
-                int left;\r
-                int right;\r
-            };\r
-\r
-            struct CV_EXPORTS DTree\r
-            {\r
-                int nodeCount;\r
-            };\r
-\r
-            struct CV_EXPORTS Stage\r
-            {\r
-                int first;\r
-                int ntrees;\r
-                float threshold;\r
-            };\r
-\r
-            enum { BOOST = 0 };\r
-            enum { DO_CANNY_PRUNING = 1, SCALE_IMAGE = 2,FIND_BIGGEST_OBJECT = 4, DO_ROUGH_SEARCH = 8 };\r
-\r
-            CascadeClassifier();\r
-            CascadeClassifier(const string& filename);\r
-            ~CascadeClassifier();\r
+        public:            \r
+            CascadeClassifier_GPU();\r
+            CascadeClassifier_GPU(const string& filename);\r
+            ~CascadeClassifier_GPU();\r
 \r
             bool empty() const;\r
             bool load(const string& filename);\r
-            bool read(const FileNode& node);\r
-\r
-            void detectMultiScale( const Mat& image, vector<Rect>& objects, double scaleFactor=1.1,\r
-                int minNeighbors=3, int flags=0, Size minSize=Size(), Size maxSize=Size());\r
-\r
-            bool setImage( Ptr<FeatureEvaluator>&, const Mat& );\r
-            int runAt( Ptr<FeatureEvaluator>&, Point );\r
-\r
-            bool isStumpBased;\r
-\r
-            int stageType;\r
-            int featureType;\r
-            int ncategories;\r
-            Size origWinSize;\r
-\r
-            vector<Stage> stages;\r
-            vector<DTree> classifiers;\r
-            vector<DTreeNode> nodes;\r
-            vector<float> leaves;\r
-            vector<int> subsets;\r
+            void release();\r
+            \r
+            /* returns number of detected objects */\r
+            int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size());\r
+                                    \r
+            bool findLargestObject;\r
+            bool visualizeInPlace;\r
 \r
-            Ptr<FeatureEvaluator> feval;\r
-            Ptr<CvHaarClassifierCascade> oldCascade;\r
+            Size getClassifierSize() const;\r
+        private:\r
+            \r
+            struct CascadeClassifierImpl;                        \r
+            CascadeClassifierImpl* impl;            \r
         };\r
-\r
+        \r
         ////////////////////////////////// SURF //////////////////////////////////////////\r
         \r
         struct CV_EXPORTS SURFParams_GPU \r
         {\r
-            SURFParams_GPU() :\r
-                threshold(0.1f), \r
-                nOctaves(4),\r
-                nIntervals(4),\r
-                initialScale(2.f),\r
-\r
-                l1(3.f/1.5f),\r
-                l2(5.f/1.5f),\r
-                l3(3.f/1.5f),\r
-                l4(1.f/1.5f),\r
-                edgeScale(0.81f),\r
-                initialStep(1),\r
-\r
-                extended(true),\r
-\r
-                featuresRatio(0.01f)\r
-            {\r
-            }\r
+            SURFParams_GPU() : threshold(0.1f), nOctaves(4), nIntervals(4), initialScale(2.f), \r
+                l1(3.f/1.5f), l2(5.f/1.5f), l3(3.f/1.5f), l4(1.f/1.5f),\r
+                edgeScale(0.81f), initialStep(1), extended(true), featuresRatio(0.01f) {}\r
 \r
             //! The interest operator threshold\r
             float threshold;\r
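
The new public interface hides every NCV type behind the opaque CascadeClassifierImpl pointer and exposes just two runtime toggles, findLargestObject and visualizeInPlace. A hedged sketch of reusing one preallocated detection buffer across frames (100 mirrors the default capacity detectMultiScale creates when the buffer is empty); processNextFrame is a hypothetical frame source, not part of this commit:

    #include <opencv2/opencv.hpp>
    #include <opencv2/gpu/gpu.hpp>

    // hypothetical helper: uploads the next video frame into d_frame,
    // returns false when the stream ends
    bool processNextFrame(cv::gpu::GpuMat& d_frame);

    void runDetectionLoop()
    {
        cv::gpu::CascadeClassifier_GPU cascade("haarcascade_frontalface_alt.xml");
        cascade.findLargestObject = true;   // keep only the best candidate per frame
        cascade.visualizeInPlace  = false;  // do not draw detections into the input

        // preallocate the result buffer once instead of letting detectMultiScale do it
        cv::gpu::GpuMat d_objects(1, 100, cv::DataType<cv::Rect>::type);

        cv::gpu::GpuMat d_frame;
        while (processNextFrame(d_frame))
        {
            int n = cascade.detectMultiScale(d_frame, d_objects, 1.2, 4,
                                             cascade.getClassifierSize());
            // the first n entries of d_objects now hold the detections (device memory)
        }
    }
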
index 28acc6a..968e192 100644 (file)
@@ -170,8 +170,7 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst)
 \r
     if (src.type() == CV_8UC1)\r
     {\r
-        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz,\r
-            nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );\r
+        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );\r
     }\r
     else\r
     {\r
@@ -186,8 +185,7 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst)
             pValues3[1] = nppLut3[1].ptr<Npp32s>();\r
             pValues3[2] = nppLut3[2].ptr<Npp32s>();\r
         }\r
-        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz,\r
-            pValues3, lvls.pLevels3, lvls.nValues3) );\r
+        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, pValues3, lvls.pLevels3, lvls.nValues3) );\r
     }\r
 }\r
 \r
index e6a4e72..3a7d2ca 100644 (file)
 \r
 #include "precomp.hpp"\r
 \r
-\r
-\r
-\r
 using namespace cv;\r
 using namespace cv::gpu;\r
 using namespace std;\r
 \r
-#if !defined (HAVE_CUDA)\r
 \r
-cv::gpu::CascadeClassifier::CascadeClassifier()  { throw_nogpu(); }\r
-cv::gpu::CascadeClassifier::CascadeClassifier(const string&)  { throw_nogpu(); }\r
-cv::gpu::CascadeClassifier::~CascadeClassifier()  { throw_nogpu(); }\r
+#if !defined (HAVE_CUDA) || (defined(_MSC_VER) && _MSC_VER != 1500) || !defined(_MSC_VER)\r
 \r
-bool cv::gpu::CascadeClassifier::empty() const { throw_nogpu(); return true; }\r
-bool cv::gpu::CascadeClassifier::load(const string& filename)  { throw_nogpu(); return true; }\r
-bool cv::gpu::CascadeClassifier::read(const FileNode& node)  { throw_nogpu(); return true; }\r
+cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU()  { throw_nogpu(); }\r
+cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const string&)  { throw_nogpu(); }\r
+cv::gpu::CascadeClassifier_GPU::~CascadeClassifier_GPU()  { throw_nogpu(); }\r
 \r
-void cv::gpu::CascadeClassifier::detectMultiScale( const Mat&, vector<Rect>&, double, int, int, Size, Size) { throw_nogpu(); }\r
+bool cv::gpu::CascadeClassifier_GPU::empty() const { throw_nogpu(); return true; }\r
+bool cv::gpu::CascadeClassifier_GPU::load(const string&)  { throw_nogpu(); return true; }\r
+Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const { throw_nogpu(); return Size(); }\r
 \r
-       \r
+int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& , GpuMat& , double , int , Size)  { throw_nogpu(); return 0; }\r
 \r
+#if defined (HAVE_CUDA)\r
+       NCVStatus loadFromXML(const string&, HaarClassifierCascadeDescriptor&, vector<HaarStage64>&, \r
+                                                 vector<HaarClassifierNode128>&, vector<HaarFeature64>&) { throw_nogpu(); return NCVStatus(); }\r
 \r
+       void groupRectangles(vector<NcvRect32u>&, int, double, vector<Ncv32u>*) { throw_nogpu(); }\r
+#endif\r
 \r
 #else\r
 \r
+struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl\r
+{    \r
+    CascadeClassifierImpl(const string& filename) : lastAllocatedFrameSize(-1, -1)\r
+    {\r
+        ncvSetDebugOutputHandler(NCVDebugOutputHandler);            \r
+        if (ncvStat != load(filename))\r
+            CV_Error(CV_GpuApiCallError, "Error in GPU cascade load");\r
+    }    \r
+    NCVStatus process(const GpuMat& src, GpuMat& objects, float scaleStep, int minNeighbors, bool findLargestObject, bool visualizeInPlace, NcvSize32u ncvMinSize, /*out*/unsigned int& numDetections)\r
+    {   \r
+        calculateMemReqsAndAllocate(src.size());        \r
 \r
-cv::gpu::CascadeClassifier::CascadeClassifier()\r
-{\r
+        NCVMemPtr src_beg;\r
+        src_beg.ptr = (void*)src.ptr<Ncv8u>();\r
+        src_beg.memtype = NCVMemoryTypeDevice;\r
 \r
-}\r
+        NCVMemSegment src_seg;\r
+        src_seg.begin = src_beg;\r
+        src_seg.size  = src.step * src.rows;\r
 \r
-cv::gpu::CascadeClassifier::CascadeClassifier(const string& filename)\r
-{\r
+        NCVMatrixReuse<Ncv8u> d_src(src_seg, devProp.textureAlignment, src.cols, src.rows, src.step, true);        \r
+        \r
+        //NCVMatrixAlloc<Ncv8u> d_src(*gpuAllocator, src.cols, src.rows);\r
+        //ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
 \r
-}\r
+        //NCVMatrixAlloc<Ncv8u> h_src(*cpuAllocator, src.cols, src.rows);\r
+        //ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
 \r
-cv::gpu::CascadeClassifier::~CascadeClassifier()\r
-{\r
+        CV_Assert(objects.rows == 1);\r
+\r
+        NCVMemPtr objects_beg;\r
+        objects_beg.ptr = (void*)objects.ptr<NcvRect32u>();\r
+        objects_beg.memtype = NCVMemoryTypeDevice;\r
+\r
+        NCVMemSegment objects_seg;\r
+        objects_seg.begin = objects_beg;\r
+        objects_seg.size = objects.step * objects.rows;\r
+        NCVVectorReuse<NcvRect32u> d_rects(objects_seg, objects.cols);\r
+        //NCVVectorAlloc<NcvRect32u> d_rects(*gpuAllocator, 100);        \r
+        //ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);        \r
+            \r
+        NcvSize32u roi;\r
+        roi.width = d_src.width();\r
+        roi.height = d_src.height();\r
+\r
+        Ncv32u flags = 0;\r
+        flags |= findLargestObject? NCVPipeObjDet_FindLargestObject : 0;\r
+        flags |= visualizeInPlace ? NCVPipeObjDet_VisualizeInPlace  : 0;\r
+        \r
+        ncvStat = ncvDetectObjectsMultiScale_device(\r
+            d_src, roi, d_rects, numDetections, haar, *h_haarStages,\r
+            *d_haarStages, *d_haarNodes, *d_haarFeatures,\r
+            ncvMinSize,\r
+            minNeighbors,\r
+            scaleStep, 1,\r
+            flags,\r
+            *gpuAllocator, *cpuAllocator, devProp.major, devProp.minor,  0);\r
+        ncvAssertReturnNcvStat(ncvStat);\r
+        ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);\r
+                       \r
+        return NCV_SUCCESS;\r
+    }\r
+    ////\r
     \r
+    NcvSize32u getClassifierSize() const  { return haar.ClassifierSize; }\r
+    cv::Size getClassifierCvSize() const { return cv::Size(haar.ClassifierSize.width, haar.ClassifierSize.height); }\r
+private:\r
+\r
+    static void NCVDebugOutputHandler(const char* msg) { CV_Error(CV_GpuApiCallError, msg); }\r
+\r
+    NCVStatus load(const string& classifierFile)\r
+    {        \r
+        int devId = cv::gpu::getDevice();           \r
+        ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR);\r
+\r
+        // Load the classifier from file (assuming its size is about 1 mb) using a simple allocator\r
+        gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice);        \r
+        cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned);\r
+\r
+        ncvAssertPrintReturn(gpuCascadeAllocator->isInitialized(), "Error creating cascade GPU allocator", NCV_CUDA_ERROR);\r
+        ncvAssertPrintReturn(cpuCascadeAllocator->isInitialized(), "Error creating cascade CPU allocator", NCV_CUDA_ERROR);\r
+\r
+        Ncv32u haarNumStages, haarNumNodes, haarNumFeatures;\r
+        ncvStat = ncvHaarGetClassifierSize(classifierFile, haarNumStages, haarNumNodes, haarNumFeatures);\r
+        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", NCV_FILE_ERROR);\r
+\r
+        h_haarStages   = new NCVVectorAlloc<HaarStage64>(*cpuCascadeAllocator, haarNumStages);        \r
+        h_haarNodes    = new NCVVectorAlloc<HaarClassifierNode128>(*cpuCascadeAllocator, haarNumNodes);\r
+        h_haarFeatures = new NCVVectorAlloc<HaarFeature64>(*cpuCascadeAllocator, haarNumFeatures);\r
+\r
+        ncvAssertPrintReturn(h_haarStages->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);\r
+        ncvAssertPrintReturn(h_haarNodes->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);        \r
+        ncvAssertPrintReturn(h_haarFeatures->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);\r
+\r
+        ncvStat = ncvHaarLoadFromFile_host(classifierFile, haar, *h_haarStages, *h_haarNodes, *h_haarFeatures);\r
+        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", NCV_FILE_ERROR);\r
+\r
+        d_haarStages   = new NCVVectorAlloc<HaarStage64>(*gpuCascadeAllocator, haarNumStages);\r
+        d_haarNodes    = new NCVVectorAlloc<HaarClassifierNode128>(*gpuCascadeAllocator, haarNumNodes);\r
+        d_haarFeatures = new NCVVectorAlloc<HaarFeature64>(*gpuCascadeAllocator, haarNumFeatures);\r
+\r
+        ncvAssertPrintReturn(d_haarStages->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);\r
+        ncvAssertPrintReturn(d_haarNodes->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);                        \r
+        ncvAssertPrintReturn(d_haarFeatures->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);\r
+\r
+        ncvStat = h_haarStages->copySolid(*d_haarStages, 0);\r
+        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);\r
+        ncvStat = h_haarNodes->copySolid(*d_haarNodes, 0);\r
+        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);\r
+        ncvStat = h_haarFeatures->copySolid(*d_haarFeatures, 0);\r
+        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);    \r
+\r
+        return NCV_SUCCESS;\r
+    }\r
+    ////\r
+\r
+    NCVStatus calculateMemReqsAndAllocate(const Size& frameSize)\r
+    {        \r
+        if (lastAllocatedFrameSize == frameSize)\r
+            return NCV_SUCCESS;\r
+\r
+        // Calculate memory requirements and create real allocators\r
+        NCVMemStackAllocator gpuCounter(devProp.textureAlignment);\r
+        NCVMemStackAllocator cpuCounter(devProp.textureAlignment);\r
+\r
+        ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", NCV_CUDA_ERROR);        \r
+        ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", NCV_CUDA_ERROR);\r
+        \r
+        NCVMatrixAlloc<Ncv8u> d_src(gpuCounter, frameSize.width, frameSize.height);\r
+        NCVMatrixAlloc<Ncv8u> h_src(cpuCounter, frameSize.width, frameSize.height);\r
+\r
+        ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);        \r
+        ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+        NCVVectorAlloc<NcvRect32u> d_rects(gpuCounter, 100);        \r
+        ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+        NcvSize32u roi;\r
+        roi.width = d_src.width();\r
+        roi.height = d_src.height();\r
+        Ncv32u numDetections;\r
+        ncvStat = ncvDetectObjectsMultiScale_device(d_src, roi, d_rects, numDetections, haar, *h_haarStages,\r
+            *d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp.major, devProp.minor, 0);\r
+\r
+        ncvAssertReturnNcvStat(ncvStat);\r
+        ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);\r
+                      \r
+        gpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), devProp.textureAlignment);        \r
+        cpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), devProp.textureAlignment);\r
+\r
+        ncvAssertPrintReturn(gpuAllocator->isInitialized(), "Error creating GPU memory allocator", NCV_CUDA_ERROR);\r
+        ncvAssertPrintReturn(cpuAllocator->isInitialized(), "Error creating CPU memory allocator", NCV_CUDA_ERROR);        \r
+        return NCV_SUCCESS;\r
+    }\r
+    //// \r
+\r
+    cudaDeviceProp devProp;\r
+    NCVStatus ncvStat;\r
+\r
+    Ptr<NCVMemNativeAllocator> gpuCascadeAllocator;        \r
+    Ptr<NCVMemNativeAllocator> cpuCascadeAllocator;\r
+\r
+    Ptr<NCVVectorAlloc<HaarStage64> >           h_haarStages;        \r
+    Ptr<NCVVectorAlloc<HaarClassifierNode128> > h_haarNodes;\r
+    Ptr<NCVVectorAlloc<HaarFeature64> >         h_haarFeatures;\r
+\r
+    HaarClassifierCascadeDescriptor haar;\r
+\r
+    Ptr<NCVVectorAlloc<HaarStage64> >           d_haarStages;\r
+    Ptr<NCVVectorAlloc<HaarClassifierNode128> > d_haarNodes;\r
+    Ptr<NCVVectorAlloc<HaarFeature64> >         d_haarFeatures;\r
+\r
+    Size lastAllocatedFrameSize;\r
+\r
+    Ptr<NCVMemStackAllocator> gpuAllocator;        \r
+    Ptr<NCVMemStackAllocator> cpuAllocator;\r
+};\r
+\r
+\r
+\r
+cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU() : findLargestObject(false), visualizeInPlace(false), impl(0) {}\r
+cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const string& filename) : findLargestObject(false), visualizeInPlace(false), impl(0) { load(filename); }\r
+cv::gpu::CascadeClassifier_GPU::~CascadeClassifier_GPU() { release(); }\r
+bool cv::gpu::CascadeClassifier_GPU::empty() const { return impl == 0; }\r
+\r
+void cv::gpu::CascadeClassifier_GPU::release() { if (impl) { delete impl; impl = 0; } }\r
+\r
+bool cv::gpu::CascadeClassifier_GPU::load(const string& filename)\r
+{         \r
+    release();\r
+    impl = new CascadeClassifierImpl(filename);\r
+    return !this->empty();    \r
 }\r
 \r
-bool cv::gpu::CascadeClassifier::empty() const\r
+Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const\r
 {\r
-    int *a = (int*)&nppiStTranspose_32u_C1R;\r
-    return *a == 0xFFFFF;\r
-    return true;\r
+    return this->empty() ? Size() : impl->getClassifierCvSize();\r
 }\r
+                            \r
+int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor, int minNeighbors, Size minSize)\r
+{     \r
+    CV_Assert( scaleFactor > 1 && image.depth() == CV_8U);\r
+    CV_Assert( !this->empty());\r
+        \r
+    const int defaultObjSearchNum = 100;\r
+    if (objectsBuf.empty())\r
+        objectsBuf.create(1, defaultObjSearchNum, DataType<Rect>::type);\r
+    \r
+    NcvSize32u ncvMinSize = impl->getClassifierSize();\r
 \r
-bool cv::gpu::CascadeClassifier::load(const string& filename)\r
-{\r
-    return true;\r
+    if (ncvMinSize.width < (unsigned)minSize.width && ncvMinSize.height < (unsigned)minSize.height)\r
+    {\r
+        ncvMinSize.width = minSize.width;\r
+        ncvMinSize.height = minSize.height;\r
+    }    \r
+                \r
+    unsigned int numDetections;\r
+    NCVStatus ncvStat = impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections);                 \r
+    if (ncvStat != NCV_SUCCESS)\r
+        CV_Error(CV_GpuApiCallError, "Error in face detection");\r
+\r
+    return numDetections;\r
 }\r
 \r
-bool cv::gpu::CascadeClassifier::read(const FileNode& node)\r
+       struct RectConvert\r
+       {\r
+               Rect operator()(const NcvRect32u& nr) const { return Rect(nr.x, nr.y, nr.width, nr.height); }\r
+               NcvRect32u operator()(const Rect& nr) const \r
+               { \r
+                       NcvRect32u rect;\r
+                       rect.x = nr.x;\r
+                       rect.y = nr.y;\r
+                       rect.width = nr.width;\r
+                       rect.height = nr.height;\r
+                       return rect; \r
+               }\r
+       };\r
+\r
+       void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights)\r
+       {\r
+               vector<Rect> rects(hypotheses.size());    \r
+               std::transform(hypotheses.begin(), hypotheses.end(), rects.begin(), RectConvert());\r
+           \r
+               if (weights) \r
+               {\r
+                       vector<int> weights_int;\r
+                       weights_int.assign(weights->begin(), weights->end());        \r
+                       cv::groupRectangles(rects, weights_int, groupThreshold, eps);\r
+               }\r
+               else\r
+               {   \r
+                       cv::groupRectangles(rects, groupThreshold, eps);\r
+               }\r
+               std::transform(rects.begin(), rects.end(), hypotheses.begin(), RectConvert());    \r
+               hypotheses.resize(rects.size());\r
+       }\r
+\r
+\r
+#if 1 /* loadFromXML implementation switch */\r
+\r
+NCVStatus loadFromXML(const std::string &filename, \r
+                      HaarClassifierCascadeDescriptor &haar, \r
+                      std::vector<HaarStage64> &haarStages, \r
+                      std::vector<HaarClassifierNode128> &haarClassifierNodes, \r
+                      std::vector<HaarFeature64> &haarFeatures)\r
 {\r
-    return true;\r
+    NCVStatus ncvStat;\r
+\r
+    haar.NumStages = 0;\r
+    haar.NumClassifierRootNodes = 0;\r
+    haar.NumClassifierTotalNodes = 0;\r
+    haar.NumFeatures = 0;\r
+    haar.ClassifierSize.width = 0;\r
+    haar.ClassifierSize.height = 0;        \r
+    haar.bHasStumpsOnly = true;\r
+    haar.bNeedsTiltedII = false;\r
+    Ncv32u curMaxTreeDepth;\r
+\r
+    std::vector<char> xmlFileCont;   \r
+\r
+    std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;\r
+    haarStages.resize(0);\r
+    haarClassifierNodes.resize(0);\r
+    haarFeatures.resize(0);    \r
+    \r
+    Ptr<CvHaarClassifierCascade> oldCascade = (CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0);\r
+    if (oldCascade.empty())\r
+        return NCV_HAAR_XML_LOADING_EXCEPTION;\r
+    \r
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////\r
+                                          \r
+    haar.ClassifierSize.width = oldCascade->orig_window_size.width;\r
+    haar.ClassifierSize.height = oldCascade->orig_window_size.height;\r
+\r
+    int stagesCount = oldCascade->count;\r
+    for(int s = 0; s < stagesCount; ++s) // by stages\r
+    {\r
+        HaarStage64 curStage;\r
+        curStage.setStartClassifierRootNodeOffset(haarClassifierNodes.size());\r
+\r
+        curStage.setStageThreshold(oldCascade->stage_classifier[s].threshold);\r
+\r
+        int treesCount = oldCascade->stage_classifier[s].count;\r
+        for(int t = 0; t < treesCount; ++t) // by trees\r
+        {                                \r
+            Ncv32u nodeId = 0;\r
+            CvHaarClassifier* tree = &oldCascade->stage_classifier[s].classifier[t];\r
+\r
+            int nodesCount = tree->count;\r
+            for(int n = 0; n < nodesCount; ++n)  //by features             \r
+            {   \r
+                CvHaarFeature* feature = &tree->haar_feature[n];\r
+\r
+                HaarClassifierNode128 curNode;                                        \r
+                curNode.setThreshold(tree->threshold[n]);\r
+                \r
+                HaarClassifierNodeDescriptor32 nodeLeft;\r
+                if ( tree->left[n] <= 0 )\r
+                {   \r
+                    Ncv32f leftVal = tree->alpha[-tree->left[n]];\r
+                    ncvStat = nodeLeft.create(leftVal);\r
+                    ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);                    \r
+                }\r
+                else\r
+                {   \r
+                    Ncv32u leftNodeOffset = tree->left[n];                        \r
+                    nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1));\r
+                    haar.bHasStumpsOnly = false;\r
+                }\r
+                curNode.setLeftNodeDesc(nodeLeft);\r
+                \r
+                HaarClassifierNodeDescriptor32 nodeRight;\r
+                if ( tree->right[n] <= 0 )\r
+                {                                         \r
+                    Ncv32f rightVal = tree->alpha[-tree->right[n]];                        \r
+                    ncvStat = nodeRight.create(rightVal);\r
+                    ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);\r
+                }\r
+                else\r
+                {                                               \r
+                    Ncv32u rightNodeOffset = tree->right[n];                        \r
+                    nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1));\r
+                    haar.bHasStumpsOnly = false;\r
+                }\r
+                curNode.setRightNodeDesc(nodeRight);                    \r
+\r
+                Ncv32u tiltedVal = feature->tilted;\r
+                haar.bNeedsTiltedII = (tiltedVal != 0);                \r
+\r
+                Ncv32u featureId = 0;                    \r
+                for(int l = 0; l < CV_HAAR_FEATURE_MAX; ++l) //by rects\r
+                {                        \r
+                    Ncv32u rectX = feature->rect[l].r.x; \r
+                    Ncv32u rectY = feature->rect[l].r.y;\r
+                    Ncv32u rectWidth = feature->rect[l].r.width;\r
+                    Ncv32u rectHeight = feature->rect[l].r.height;\r
+\r
+                    Ncv32f rectWeight = feature->rect[l].weight;\r
+\r
+                    if (rectWeight == 0/* && rectX == 0 &&rectY == 0 && rectWidth == 0 && rectHeight == 0*/)\r
+                        break;\r
+\r
+                    HaarFeature64 curFeature;\r
+                    ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height);\r
+                    curFeature.setWeight(rectWeight);\r
+                    ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);\r
+                    haarFeatures.push_back(curFeature);\r
+\r
+                    featureId++;\r
+                }\r
+\r
+                HaarFeatureDescriptor32 tmpFeatureDesc;\r
+                ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, featureId, haarFeatures.size() - featureId);\r
+                ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);\r
+                curNode.setFeatureDesc(tmpFeatureDesc);\r
+\r
+                if (!nodeId)\r
+                {\r
+                    //root node\r
+                    haarClassifierNodes.push_back(curNode);\r
+                    curMaxTreeDepth = 1;\r
+                }\r
+                else\r
+                {\r
+                    //other node\r
+                    h_TmpClassifierNotRootNodes.push_back(curNode);\r
+                    curMaxTreeDepth++;\r
+                }\r
+\r
+                nodeId++;\r
+            }               \r
+        }\r
+\r
+        curStage.setNumClassifierRootNodes(treesCount);\r
+        haarStages.push_back(curStage);            \r
+    }\r
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////\r
+\r
+\r
+    //fill in cascade stats\r
+    haar.NumStages = haarStages.size();\r
+    haar.NumClassifierRootNodes = haarClassifierNodes.size();\r
+    haar.NumClassifierTotalNodes = haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size();\r
+    haar.NumFeatures = haarFeatures.size();\r
+\r
+    //merge root and leaf nodes in one classifiers array\r
+    Ncv32u offsetRoot = haarClassifierNodes.size();\r
+    for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)\r
+    {\r
+        HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc();\r
+        if (!nodeLeft.isLeaf())\r
+        {\r
+            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;\r
+            nodeLeft.create(newOffset);\r
+        }\r
+        haarClassifierNodes[i].setLeftNodeDesc(nodeLeft);\r
+\r
+        HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc();\r
+        if (!nodeRight.isLeaf())\r
+        {\r
+            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;\r
+            nodeRight.create(newOffset);\r
+        }\r
+        haarClassifierNodes[i].setRightNodeDesc(nodeRight);\r
+    }\r
+    for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++)\r
+    {\r
+        HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc();\r
+        if (!nodeLeft.isLeaf())\r
+        {\r
+            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;\r
+            nodeLeft.create(newOffset);\r
+        }\r
+        h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft);\r
+\r
+        HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc();\r
+        if (!nodeRight.isLeaf())\r
+        {\r
+            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;\r
+            nodeRight.create(newOffset);\r
+        }\r
+        h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight);\r
+\r
+        haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]);\r
+    }\r
+\r
+    return NCV_SUCCESS;\r
 }\r
 \r
-void cv::gpu::CascadeClassifier::detectMultiScale( const Mat& image, vector<Rect>& objects, double scaleFactor, \r
-    int minNeighbors, int flags, Size minSize, Size maxSize)\r
+////\r
+\r
+#else /* loadFromXML implementation switch */\r
 \r
+#include "e:/devNPP-OpenCV/src/external/_rapidxml-1.13/rapidxml.hpp"\r
+\r
+NCVStatus loadFromXML(const std::string &filename,\r
+                      HaarClassifierCascadeDescriptor &haar,\r
+                      std::vector<HaarStage64> &haarStages,\r
+                      std::vector<HaarClassifierNode128> &haarClassifierNodes,\r
+                      std::vector<HaarFeature64> &haarFeatures)\r
 {\r
+    NCVStatus ncvStat;\r
+\r
+    haar.NumStages = 0;\r
+    haar.NumClassifierRootNodes = 0;\r
+    haar.NumClassifierTotalNodes = 0;\r
+    haar.NumFeatures = 0;\r
+    haar.ClassifierSize.width = 0;\r
+    haar.ClassifierSize.height = 0;\r
+    haar.bNeedsTiltedII = false;\r
+    haar.bHasStumpsOnly = false;\r
+\r
+    FILE *fp;\r
+    fopen_s(&fp, filename.c_str(), "r");\r
+    ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);\r
+\r
+    //get file size\r
+    fseek(fp, 0, SEEK_END);\r
+    Ncv32u xmlSize = ftell(fp);\r
+    fseek(fp, 0, SEEK_SET);\r
+\r
+    //load file to vector\r
+    std::vector<char> xmlFileCont;\r
+    xmlFileCont.resize(xmlSize+1);\r
+    memset(&xmlFileCont[0], 0, xmlSize+1);\r
+    fread_s(&xmlFileCont[0], xmlSize, 1, xmlSize, fp);\r
+    fclose(fp);\r
+\r
+    haar.bHasStumpsOnly = true;\r
+    haar.bNeedsTiltedII = false;\r
+    Ncv32u curMaxTreeDepth;\r
+\r
+    std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;\r
+    haarStages.resize(0);\r
+    haarClassifierNodes.resize(0);\r
+    haarFeatures.resize(0);\r
+\r
+    //XML loading and OpenCV XML classifier syntax verification\r
+    try\r
+    {\r
+        rapidxml::xml_document<> doc;\r
+        doc.parse<0>(&xmlFileCont[0]);\r
+\r
+        //opencv_storage\r
+        rapidxml::xml_node<> *parserGlobal = doc.first_node();\r
+        ncvAssertReturn(!strcmp(parserGlobal->name(), "opencv_storage"), NCV_HAAR_XML_LOADING_EXCEPTION);\r
+\r
+        //classifier type\r
+        parserGlobal = parserGlobal->first_node();\r
+        ncvAssertReturn(parserGlobal, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+        rapidxml::xml_attribute<> *attr = parserGlobal->first_attribute("type_id");\r
+        ncvAssertReturn(!strcmp(attr->value(), "opencv-haar-classifier"), NCV_HAAR_XML_LOADING_EXCEPTION);\r
 \r
+        //classifier size\r
+        parserGlobal = parserGlobal->first_node("size");\r
+        ncvAssertReturn(parserGlobal, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+        sscanf_s(parserGlobal->value(), "%d %d", &(haar.ClassifierSize.width), &(haar.ClassifierSize.height));\r
+\r
+        //parse stages\r
+        parserGlobal = parserGlobal->next_sibling("stages");\r
+        ncvAssertReturn(parserGlobal, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+        parserGlobal = parserGlobal->first_node("_");\r
+        ncvAssertReturn(parserGlobal, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+\r
+        while (parserGlobal)\r
+        {\r
+            HaarStage64 curStage;\r
+            curStage.setStartClassifierRootNodeOffset(haarClassifierNodes.size());\r
+            Ncv32u tmpNumClassifierRootNodes = 0;\r
+\r
+            rapidxml::xml_node<> *parserStageThreshold = parserGlobal->first_node("stage_threshold");\r
+            ncvAssertReturn(parserStageThreshold, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+            Ncv32f tmpStageThreshold;\r
+            sscanf_s(parserStageThreshold->value(), "%f", &tmpStageThreshold);\r
+            curStage.setStageThreshold(tmpStageThreshold);\r
+\r
+            //parse trees\r
+            rapidxml::xml_node<> *parserTree;\r
+            parserTree = parserGlobal->first_node("trees");\r
+            ncvAssertReturn(parserTree, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+            parserTree = parserTree->first_node("_");\r
+            ncvAssertReturn(parserTree, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+\r
+            while (parserTree)\r
+            {\r
+                rapidxml::xml_node<> *parserNode;\r
+                parserNode = parserTree->first_node("_");\r
+                ncvAssertReturn(parserNode, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+                Ncv32u nodeId = 0;\r
+\r
+                while (parserNode)\r
+                {\r
+                    HaarClassifierNode128 curNode;\r
+\r
+                    rapidxml::xml_node<> *parserNodeThreshold = parserNode->first_node("threshold");\r
+                    ncvAssertReturn(parserNodeThreshold, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+                    Ncv32f tmpThreshold;\r
+                    sscanf_s(parserNodeThreshold->value(), "%f", &tmpThreshold);\r
+                    curNode.setThreshold(tmpThreshold);\r
+\r
+                    rapidxml::xml_node<> *parserNodeLeft = parserNode->first_node("left_val");\r
+                    HaarClassifierNodeDescriptor32 nodeLeft;\r
+                    if (parserNodeLeft)\r
+                    {\r
+                        Ncv32f leftVal;\r
+                        sscanf_s(parserNodeLeft->value(), "%f", &leftVal);\r
+                        ncvStat = nodeLeft.create(leftVal);\r
+                        ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);\r
+                    }\r
+                    else\r
+                    {\r
+                        parserNodeLeft = parserNode->first_node("left_node");\r
+                        ncvAssertReturn(parserNodeLeft, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+                        Ncv32u leftNodeOffset;\r
+                        sscanf_s(parserNodeLeft->value(), "%d", &leftNodeOffset);\r
+                        nodeLeft.create(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1);\r
+                        haar.bHasStumpsOnly = false;\r
+                    }\r
+                    curNode.setLeftNodeDesc(nodeLeft);\r
+\r
+                    rapidxml::xml_node<> *parserNodeRight = parserNode->first_node("right_val");\r
+                    HaarClassifierNodeDescriptor32 nodeRight;\r
+                    if (parserNodeRight)\r
+                    {\r
+                        Ncv32f rightVal;\r
+                        sscanf_s(parserNodeRight->value(), "%f", &rightVal);\r
+                        ncvStat = nodeRight.create(rightVal);\r
+                        ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);\r
+                    }\r
+                    else\r
+                    {\r
+                        parserNodeRight = parserNode->first_node("right_node");\r
+                        ncvAssertReturn(parserNodeRight, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+                        Ncv32u rightNodeOffset;\r
+                        sscanf_s(parserNodeRight->value(), "%d", &rightNodeOffset);\r
+                        nodeRight.create(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1);\r
+                        haar.bHasStumpsOnly = false;\r
+                    }\r
+                    curNode.setRightNodeDesc(nodeRight);\r
+\r
+                    rapidxml::xml_node<> *parserNodeFeatures = parserNode->first_node("feature");\r
+                    ncvAssertReturn(parserNodeFeatures, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+\r
+                    rapidxml::xml_node<> *parserNodeFeaturesTilted = parserNodeFeatures->first_node("tilted");\r
+                    ncvAssertReturn(parserNodeFeaturesTilted, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+                    Ncv32u tiltedVal;\r
+                    sscanf_s(parserNodeFeaturesTilted->value(), "%d", &tiltedVal);\r
+                    haar.bNeedsTiltedII = (tiltedVal != 0);\r
+\r
+                    rapidxml::xml_node<> *parserNodeFeaturesRects = parserNodeFeatures->first_node("rects");\r
+                    ncvAssertReturn(parserNodeFeaturesRects, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+                    parserNodeFeaturesRects = parserNodeFeaturesRects->first_node("_");\r
+                    ncvAssertReturn(parserNodeFeaturesRects, NCV_HAAR_XML_LOADING_EXCEPTION);\r
+                    Ncv32u featureId = 0;\r
+\r
+                    while (parserNodeFeaturesRects)\r
+                    {\r
+                        Ncv32u rectX, rectY, rectWidth, rectHeight;\r
+                        Ncv32f rectWeight;\r
+                        sscanf_s(parserNodeFeaturesRects->value(), "%d %d %d %d %f", &rectX, &rectY, &rectWidth, &rectHeight, &rectWeight);\r
+                        HaarFeature64 curFeature;\r
+                        ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height);\r
+                        curFeature.setWeight(rectWeight);\r
+                        ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);\r
+                        haarFeatures.push_back(curFeature);\r
+\r
+                        parserNodeFeaturesRects = parserNodeFeaturesRects->next_sibling("_");\r
+                        featureId++;\r
+                    }\r
+\r
+                    HaarFeatureDescriptor32 tmpFeatureDesc;\r
+                    ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, featureId, haarFeatures.size() - featureId);\r
+                    ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);\r
+                    curNode.setFeatureDesc(tmpFeatureDesc);\r
+\r
+                    if (!nodeId)\r
+                    {\r
+                        //root node\r
+                        haarClassifierNodes.push_back(curNode);\r
+                        curMaxTreeDepth = 1;\r
+                    }\r
+                    else\r
+                    {\r
+                        //other node\r
+                        h_TmpClassifierNotRootNodes.push_back(curNode);\r
+                        curMaxTreeDepth++;\r
+                    }\r
+\r
+                    parserNode = parserNode->next_sibling("_");\r
+                    nodeId++;\r
+                }\r
+\r
+                parserTree = parserTree->next_sibling("_");\r
+                tmpNumClassifierRootNodes++;\r
+            }\r
+\r
+            curStage.setNumClassifierRootNodes(tmpNumClassifierRootNodes);\r
+            haarStages.push_back(curStage);\r
+\r
+            parserGlobal = parserGlobal->next_sibling("_");\r
+        }\r
+    }\r
+    catch (...)\r
+    {\r
+        return NCV_HAAR_XML_LOADING_EXCEPTION;\r
+    }\r
+\r
+    //fill in cascade stats\r
+    haar.NumStages = haarStages.size();\r
+    haar.NumClassifierRootNodes = haarClassifierNodes.size();\r
+    haar.NumClassifierTotalNodes = haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size();\r
+    haar.NumFeatures = haarFeatures.size();\r
+\r
+    //merge root and leaf nodes in one classifiers array\r
+    Ncv32u offsetRoot = haarClassifierNodes.size();\r
+    for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)\r
+    {\r
+        HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc();\r
+        if (!nodeLeft.isLeaf())\r
+        {\r
+            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;\r
+            nodeLeft.create(newOffset);\r
+        }\r
+        haarClassifierNodes[i].setLeftNodeDesc(nodeLeft);\r
+\r
+        HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc();\r
+        if (!nodeRight.isLeaf())\r
+        {\r
+            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;\r
+            nodeRight.create(newOffset);\r
+        }\r
+        haarClassifierNodes[i].setRightNodeDesc(nodeRight);\r
+    }\r
+    for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++)\r
+    {\r
+        HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc();\r
+        if (!nodeLeft.isLeaf())\r
+        {\r
+            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;\r
+            nodeLeft.create(newOffset);\r
+        }\r
+        h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft);\r
+\r
+        HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc();\r
+        if (!nodeRight.isLeaf())\r
+        {\r
+            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;\r
+            nodeRight.create(newOffset);\r
+        }\r
+        h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight);\r
+\r
+        haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]);\r
+    }\r
+\r
+    return NCV_SUCCESS;\r
 }\r
 \r
-#endif
\ No newline at end of file
+#endif /* loadFromXML implementation switch */\r
+\r
+#endif /* HAVE_CUDA */\r
+\r
+\r
index 6bf060d..e82700e 100644 (file)
@@ -62,7 +62,7 @@ namespace cv
             BORDER_REPLICATE_GPU,\r
             BORDER_CONSTANT_GPU\r
         };\r
-        \r
+                \r
         // Converts CPU border extrapolation mode into GPU internal analogue.\r
         // Returns true if the GPU analogue exists, false otherwise.\r
         bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);\r
@@ -105,8 +105,28 @@ namespace cv
             const textureReference* tex; \r
             cudaSafeCall( cudaGetTextureReference(&tex, name) ); \r
             cudaSafeCall( cudaUnbindTexture(tex) );\r
-        }        \r
+        }    \r
+\r
+        struct KeyPoint_GPU\r
+        {\r
+            float x;\r
+            float y;\r
+            float size;\r
+            float response;\r
+            float angle;\r
+            float octave;\r
+        };\r
 \r
+        enum KeypointLayout \r
+        {\r
+            SF_X,\r
+            SF_Y,\r
+            SF_SIZE,\r
+            SF_RESPONSE,\r
+            SF_ANGLE,\r
+            SF_OCTAVE,\r
+            SF_FEATURE_STRIDE\r
+        };\r
     }\r
 }\r
 \r
index aa3612e..3137004 100644 (file)
@@ -47,29 +47,7 @@ namespace cv
 {\r
     namespace gpu\r
     {\r
-        namespace surf\r
-        {\r
-            struct KeyPoint_GPU\r
-            {\r
-                float x;\r
-                float y;\r
-                float size;\r
-                float response;\r
-                float angle;\r
-                float octave;\r
-            };\r
-\r
-            enum KeypointLayout \r
-            {\r
-                SF_X,\r
-                SF_Y,\r
-                SF_SIZE,\r
-                SF_RESPONSE,\r
-                SF_ANGLE,\r
-                SF_OCTAVE,\r
-                SF_FEATURE_STRIDE\r
-            };\r
-        }\r
+       \r
     }\r
 }\r
 \r
index 78c9b6d..4d61cc3 100644 (file)
@@ -82,21 +82,16 @@ void cv::gpu::max(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu()
 \r
 namespace\r
 {\r
-    typedef NppStatus (*npp_arithm_8u_t)(const Npp8u* pSrc1, int nSrc1Step, const Npp8u* pSrc2, int nSrc2Step, Npp8u* pDst, int nDstStep,\r
-                                         NppiSize oSizeROI, int nScaleFactor);\r
-    typedef NppStatus (*npp_arithm_32s_t)(const Npp32s* pSrc1, int nSrc1Step, const Npp32s* pSrc2, int nSrc2Step, Npp32s* pDst,\r
-                                          int nDstStep, NppiSize oSizeROI);\r
-    typedef NppStatus (*npp_arithm_32f_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst,\r
-                                          int nDstStep, NppiSize oSizeROI);\r
+    typedef NppStatus (*npp_arithm_8u_t)(const Npp8u* pSrc1, int nSrc1Step, const Npp8u* pSrc2, int nSrc2Step, Npp8u* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);\r
+    typedef NppStatus (*npp_arithm_32s_t)(const Npp32s* pSrc1, int nSrc1Step, const Npp32s* pSrc2, int nSrc2Step, Npp32s* pDst, int nDstStep, NppiSize oSizeROI);\r
+    typedef NppStatus (*npp_arithm_32f_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);\r
 \r
     void nppArithmCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst,\r
                          npp_arithm_8u_t npp_func_8uc1, npp_arithm_8u_t npp_func_8uc4,\r
                          npp_arithm_32s_t npp_func_32sc1, npp_arithm_32f_t npp_func_32fc1)\r
     {\r
         CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());\r
-\r
         CV_Assert(src1.type() == CV_8UC1 || src1.type() == CV_8UC4 || src1.type() == CV_32SC1 || src1.type() == CV_32FC1);\r
-\r
         dst.create( src1.size(), src1.type() );\r
 \r
         NppiSize sz;\r
@@ -106,24 +101,16 @@ namespace
         switch (src1.type())\r
         {\r
         case CV_8UC1:\r
-            nppSafeCall( npp_func_8uc1(src1.ptr<Npp8u>(), src1.step,\r
-                src2.ptr<Npp8u>(), src2.step,\r
-                dst.ptr<Npp8u>(), dst.step, sz, 0) );\r
+            nppSafeCall( npp_func_8uc1(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz, 0) );\r
             break;\r
         case CV_8UC4:\r
-            nppSafeCall( npp_func_8uc4(src1.ptr<Npp8u>(), src1.step,\r
-                src2.ptr<Npp8u>(), src2.step,\r
-                dst.ptr<Npp8u>(), dst.step, sz, 0) );\r
+            nppSafeCall( npp_func_8uc4(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz, 0) );\r
             break;\r
         case CV_32SC1:\r
-            nppSafeCall( npp_func_32sc1(src1.ptr<Npp32s>(), src1.step,\r
-                src2.ptr<Npp32s>(), src2.step,\r
-                dst.ptr<Npp32s>(), dst.step, sz) );\r
+            nppSafeCall( npp_func_32sc1(src1.ptr<Npp32s>(), src1.step, src2.ptr<Npp32s>(), src2.step, dst.ptr<Npp32s>(), dst.step, sz) );\r
             break;\r
         case CV_32FC1:\r
-            nppSafeCall( npp_func_32fc1(src1.ptr<Npp32f>(), src1.step,\r
-                src2.ptr<Npp32f>(), src2.step,\r
-                dst.ptr<Npp32f>(), dst.step, sz) );\r
+            nppSafeCall( npp_func_32fc1(src1.ptr<Npp32f>(), src1.step, src2.ptr<Npp32f>(), src2.step, dst.ptr<Npp32f>(), dst.step, sz) );\r
             break;\r
         default:\r
             CV_Assert(!"Unsupported source type");\r
@@ -133,16 +120,15 @@ namespace
     template<int SCN> struct NppArithmScalarFunc;\r
     template<> struct NppArithmScalarFunc<1>\r
     {\r
-        typedef NppStatus (*func_ptr)(const Npp32f *pSrc, int nSrcStep, Npp32f nValue, Npp32f *pDst,\r
-                                      int nDstStep, NppiSize oSizeROI);\r
+        typedef NppStatus (*func_ptr)(const Npp32f *pSrc, int nSrcStep, Npp32f nValue, Npp32f *pDst, int nDstStep, NppiSize oSizeROI);\r
     };\r
     template<> struct NppArithmScalarFunc<2>\r
     {\r
-        typedef NppStatus (*func_ptr)(const Npp32fc *pSrc, int nSrcStep, Npp32fc nValue, Npp32fc *pDst,\r
-                                      int nDstStep, NppiSize oSizeROI);\r
+        typedef NppStatus (*func_ptr)(const Npp32fc *pSrc, int nSrcStep, Npp32fc nValue, Npp32fc *pDst, int nDstStep, NppiSize oSizeROI);\r
     };\r
 \r
     template<int SCN, typename NppArithmScalarFunc<SCN>::func_ptr func> struct NppArithmScalar;\r
+\r
     template<typename NppArithmScalarFunc<1>::func_ptr func> struct NppArithmScalar<1, func>\r
     {\r
         static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst)\r
@@ -254,24 +240,16 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
     switch (src1.type())\r
     {\r
     case CV_8UC1:\r
-        nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), src1.step,\r
-            src2.ptr<Npp8u>(), src2.step,\r
-            dst.ptr<Npp8u>(), dst.step, sz) );\r
+        nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz) );\r
         break;\r
     case CV_8UC4:\r
-        nppSafeCall( nppiAbsDiff_8u_C4R(src1.ptr<Npp8u>(), src1.step,\r
-            src2.ptr<Npp8u>(), src2.step,\r
-            dst.ptr<Npp8u>(), dst.step, sz) );\r
+        nppSafeCall( nppiAbsDiff_8u_C4R(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz) );\r
         break;\r
     case CV_32SC1:\r
-        nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr<Npp32s>(), src1.step,\r
-            src2.ptr<Npp32s>(), src2.step,\r
-            dst.ptr<Npp32s>(), dst.step, sz) );\r
+        nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr<Npp32s>(), src1.step, src2.ptr<Npp32s>(), src2.step, dst.ptr<Npp32s>(), dst.step, sz) );\r
         break;\r
     case CV_32FC1:\r
-        nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr<Npp32f>(), src1.step,\r
-            src2.ptr<Npp32f>(), src2.step,\r
-            dst.ptr<Npp32f>(), dst.step, sz) );\r
+        nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr<Npp32f>(), src1.step, src2.ptr<Npp32f>(), src2.step, dst.ptr<Npp32f>(), dst.step, sz) );\r
         break;\r
     default:\r
         CV_Assert(!"Unsupported source type");\r
diff --git a/modules/gpu/src/nvidia/FaceDetectionFeed.cpp_NvidiaAPI_sample b/modules/gpu/src/nvidia/FaceDetectionFeed.cpp_NvidiaAPI_sample
new file mode 100644 (file)
index 0000000..c1926a3
--- /dev/null
@@ -0,0 +1,362 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. \r
+// \r
+//  By downloading, copying, installing or using the software you agree to this license.\r
+//  If you do not agree to this license, do not download, install,\r
+//  copy or use the software.\r
+//\r
+//\r
+//                           License Agreement\r
+//                For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+//   * Redistribution's of source code must retain the above copyright notice,\r
+//     this list of conditions and the following disclaimer.\r
+//\r
+//   * Redistribution's in binary form must reproduce the above copyright notice,\r
+//     this list of conditions and the following disclaimer in the documentation\r
+//     and/or other materials provided with the distribution.\r
+//\r
+//   * The name of the copyright holders may not be used to endorse or promote products\r
+//     derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+#include <cstdio>\r
+#include <cuda_runtime.h>\r
+\r
+#define CV_NO_BACKWARD_COMPATIBILITY\r
+\r
+#include "opencv2/opencv.hpp"\r
+\r
+#include "NCVHaarObjectDetection.hpp"\r
+\r
+using namespace cv;\r
+using namespace std;\r
+\r
+const Size preferredVideoFrameSize(640, 480);\r
+\r
+string preferredClassifier = "haarcascade_frontalface_alt.xml";\r
+string wndTitle = "NVIDIA Computer Vision SDK :: Face Detection in Video Feed";\r
+\r
+\r
+void printSyntax(void)\r
+{\r
+    printf("Syntax: FaceDetectionFeed.exe [-c cameranum | -v filename] classifier.xml\n");\r
+}\r
+\r
+\r
+void imagePrintf(Mat& img, int lineOffsY, Scalar color, const char *format, ...)\r
+{    \r
+    int fontFace = CV_FONT_HERSHEY_PLAIN;\r
+    double fontScale = 1;       \r
+    \r
+    int baseline;\r
+    Size textSize = cv::getTextSize("T", fontFace, fontScale, 1, &baseline);\r
+\r
+    va_list arg_ptr;\r
+    va_start(arg_ptr, format);\r
+    int len = _vscprintf(format, arg_ptr) + 1;\r
+    \r
+    vector<char> strBuf(len);    \r
+    vsprintf_s(&strBuf[0], len, format, arg_ptr);\r
+\r
+    Point org(1, 3 * textSize.height * (lineOffsY + 1) / 2);    \r
+    putText(img, &strBuf[0], org, fontFace, fontScale, color);\r
+    va_end(arg_ptr);    \r
+}\r
+\r
+\r
+NCVStatus process(Mat *srcdst,\r
+                  Ncv32u width, Ncv32u height,\r
+                  NcvBool bShowAllHypotheses, NcvBool bLargestFace,\r
+                  HaarClassifierCascadeDescriptor &haar,\r
+                  NCVVector<HaarStage64> &d_haarStages, NCVVector<HaarClassifierNode128> &d_haarNodes,\r
+                  NCVVector<HaarFeature64> &d_haarFeatures, NCVVector<HaarStage64> &h_haarStages,\r
+                  INCVMemAllocator &gpuAllocator,\r
+                  INCVMemAllocator &cpuAllocator,\r
+                  cudaDeviceProp &devProp)\r
+{\r
+    ncvAssertReturn(!((srcdst == NULL) ^ gpuAllocator.isCounting()), NCV_NULL_PTR);\r
+\r
+    NCVStatus ncvStat;\r
+\r
+    NCV_SET_SKIP_COND(gpuAllocator.isCounting());\r
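+    // Counting mode: when gpuAllocator only measures sizes, the NCV_SKIP_COND blocks\r
+    // below bypass the actual host<->device copies during this measurement pass.\r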
+\r
+    NCVMatrixAlloc<Ncv8u> d_src(gpuAllocator, width, height);\r
+    ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+    NCVMatrixAlloc<Ncv8u> h_src(cpuAllocator, width, height);\r
+    ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+    NCVVectorAlloc<NcvRect32u> d_rects(gpuAllocator, 100);        \r
+    ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+    Mat h_src_hdr(Size(width, height), CV_8U, h_src.ptr(), h_src.stride());\r
+\r
+    NCV_SKIP_COND_BEGIN        \r
+    \r
+    (*srcdst).copyTo(h_src_hdr);\r
+    \r
+    ncvStat = h_src.copySolid(d_src, 0);\r
+    ncvAssertReturnNcvStat(ncvStat);\r
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);\r
+\r
+    NCV_SKIP_COND_END\r
+\r
+    NcvSize32u roi;\r
+    roi.width = d_src.width();\r
+    roi.height = d_src.height();\r
+\r
+    Ncv32u numDetections;\r
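+    // Run the multi-scale Haar cascade on the GPU; detections land in d_rects and, since\r
+    // NCVPipeObjDet_VisualizeInPlace is set, rectangles are drawn directly into d_src.\r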
+    ncvStat = ncvDetectObjectsMultiScale_device(\r
+        d_src, roi, d_rects, numDetections, haar, h_haarStages,\r
+        d_haarStages, d_haarNodes, d_haarFeatures,\r
+        haar.ClassifierSize,\r
+        bShowAllHypotheses ? 0 : 4,\r
+        1.2f, 1,\r
+        (bLargestFace ? NCVPipeObjDet_FindLargestObject : 0) | NCVPipeObjDet_VisualizeInPlace,\r
+        gpuAllocator, cpuAllocator, devProp.major, devProp.minor, 0);\r
+    ncvAssertReturnNcvStat(ncvStat);\r
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);\r
+\r
+    NCV_SKIP_COND_BEGIN\r
+\r
+    ncvStat = d_src.copySolid(h_src, 0);\r
+    ncvAssertReturnNcvStat(ncvStat);\r
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);\r
+\r
+    h_src_hdr.copyTo(*srcdst);\r
+    \r
+    NCV_SKIP_COND_END\r
+\r
+    return NCV_SUCCESS;\r
+}\r
+\r
+\r
+int main( int argc, const char** argv )\r
+{\r
+    NCVStatus ncvStat;\r
+\r
+    printf("NVIDIA Computer Vision SDK\n");\r
+    printf("Face Detection in video and live feed\n");\r
+    printf("=========================================\n");\r
+    printf("  Esc   - Quit\n");\r
+    printf("  Space - Switch between NCV and OpenCV\n");\r
+    printf("  L     - Switch between FullSearch and LargestFace modes\n");\r
+    printf("  U     - Toggle unfiltered hypotheses visualization in FullSearch\n");\r
+    \r
+    if (argc != 4 && argc != 1)\r
+        return printSyntax(), -1;\r
+\r
+    VideoCapture capture;    \r
+    Size frameSize;\r
+\r
+    if (argc == 1 || strcmp(argv[1], "-c") == 0)\r
+    {\r
+        // Camera input is specified\r
+        int camIdx = (argc == 4) ? atoi(argv[2]) : 0;\r

+        if(!capture.open(camIdx))        \r
+            return printf("Error opening camera\n"), -1;        \r
+            \r
+        capture.set(CV_CAP_PROP_FRAME_WIDTH, preferredVideoFrameSize.width);\r
+        capture.set(CV_CAP_PROP_FRAME_HEIGHT, preferredVideoFrameSize.height);\r
+        capture.set(CV_CAP_PROP_FPS, 25);\r
+        frameSize = preferredVideoFrameSize;\r
+    }\r
+    else if (strcmp(argv[1], "-v") == 0)\r
+    {\r
+        // Video file input (avi)\r
+        if(!capture.open(argv[2]))\r
+            return printf("Error opening video file\n"), -1;\r
+\r
+        frameSize.width  = (int)capture.get(CV_CAP_PROP_FRAME_WIDTH);\r
+        frameSize.height = (int)capture.get(CV_CAP_PROP_FRAME_HEIGHT);\r
+    }\r
+    else\r
+        return printSyntax(), -1;\r
+\r
+    NcvBool bUseOpenCV = true;\r
+    NcvBool bLargestFace = true;\r
+    NcvBool bShowAllHypotheses = false;    \r
+\r
+    string classifierFile = (argc == 1) ? preferredClassifier : argv[3];\r
+    \r
+    CascadeClassifier classifierOpenCV;\r
+    if (!classifierOpenCV.load(classifierFile))\r
+        return printf("Error (in OpenCV) opening classifier\n"), printSyntax(), -1;\r
+\r
+    int devId;\r
+    ncvAssertCUDAReturn(cudaGetDevice(&devId), -1);\r
+    cudaDeviceProp devProp;\r
+    ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), -1);\r
+    printf("Using GPU %d %s, arch=%d.%d\n", devId, devProp.name, devProp.major, devProp.minor);\r
+\r
+    //==============================================================================\r
+    //\r
+    // Load the classifier from file (assuming its size is about 1 mb)\r
+    // using a simple allocator\r
+    //\r
+    //==============================================================================\r
+\r
+    NCVMemNativeAllocator gpuCascadeAllocator(NCVMemoryTypeDevice);\r
+    ncvAssertPrintReturn(gpuCascadeAllocator.isInitialized(), "Error creating cascade GPU allocator", -1);\r
+    NCVMemNativeAllocator cpuCascadeAllocator(NCVMemoryTypeHostPinned);\r
+    ncvAssertPrintReturn(cpuCascadeAllocator.isInitialized(), "Error creating cascade CPU allocator", -1);\r
+\r
+    Ncv32u haarNumStages, haarNumNodes, haarNumFeatures;\r
+    ncvStat = ncvHaarGetClassifierSize(classifierFile, haarNumStages, haarNumNodes, haarNumFeatures);\r
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", -1);\r
+\r
+    NCVVectorAlloc<HaarStage64> h_haarStages(cpuCascadeAllocator, haarNumStages);\r
+    ncvAssertPrintReturn(h_haarStages.isMemAllocated(), "Error in cascade CPU allocator", -1);\r
+    NCVVectorAlloc<HaarClassifierNode128> h_haarNodes(cpuCascadeAllocator, haarNumNodes);\r
+    ncvAssertPrintReturn(h_haarNodes.isMemAllocated(), "Error in cascade CPU allocator", -1);\r
+    NCVVectorAlloc<HaarFeature64> h_haarFeatures(cpuCascadeAllocator, haarNumFeatures);\r
+    ncvAssertPrintReturn(h_haarFeatures.isMemAllocated(), "Error in cascade CPU allocator", -1);\r
+\r
+    HaarClassifierCascadeDescriptor haar;\r
+    ncvStat = ncvHaarLoadFromFile_host(classifierFile, haar, h_haarStages, h_haarNodes, h_haarFeatures);\r
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", -1);\r
+\r
+    NCVVectorAlloc<HaarStage64> d_haarStages(gpuCascadeAllocator, haarNumStages);\r
+    ncvAssertPrintReturn(d_haarStages.isMemAllocated(), "Error in cascade GPU allocator", -1);\r
+    NCVVectorAlloc<HaarClassifierNode128> d_haarNodes(gpuCascadeAllocator, haarNumNodes);\r
+    ncvAssertPrintReturn(d_haarNodes.isMemAllocated(), "Error in cascade GPU allocator", -1);\r
+    NCVVectorAlloc<HaarFeature64> d_haarFeatures(gpuCascadeAllocator, haarNumFeatures);\r
+    ncvAssertPrintReturn(d_haarFeatures.isMemAllocated(), "Error in cascade GPU allocator", -1);\r
+\r
+    ncvStat = h_haarStages.copySolid(d_haarStages, 0);\r
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);\r
+    ncvStat = h_haarNodes.copySolid(d_haarNodes, 0);\r
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);\r
+    ncvStat = h_haarFeatures.copySolid(d_haarFeatures, 0);\r
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);\r
+\r
+    //==============================================================================\r
+    //\r
+    // Calculate memory requirements and create real allocators\r
+    //\r
+    //==============================================================================\r
+\r
+    NCVMemStackAllocator gpuCounter(devProp.textureAlignment);\r
+    ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", -1);\r
+    NCVMemStackAllocator cpuCounter(devProp.textureAlignment);\r
+    ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", -1);\r
+\r
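+    // Dry run: srcdst == NULL with counting allocators, so no detection is performed;\r
+    // only the sizes of the required allocations are accumulated.\r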
+    ncvStat = process(NULL, frameSize.width, frameSize.height,\r
+                      false, false, haar,\r
+                      d_haarStages, d_haarNodes,\r
+                      d_haarFeatures, h_haarStages,\r
+                      gpuCounter, cpuCounter, devProp);\r
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error in memory counting pass", -1);\r
+\r
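+    // The counting pass recorded the peak memory demand in gpuCounter/cpuCounter;\r
+    // size the real stack allocators to exactly that amount.\r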
+    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), devProp.textureAlignment);\r
+    ncvAssertPrintReturn(gpuAllocator.isInitialized(), "Error creating GPU memory allocator", -1);\r
+    NCVMemStackAllocator cpuAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), devProp.textureAlignment);\r
+    ncvAssertPrintReturn(cpuAllocator.isInitialized(), "Error creating CPU memory allocator", -1);\r
+\r
+    printf("Initialized for frame size [%dx%d]\n", frameSize.width, frameSize.height);\r
+\r
+    //==============================================================================\r
+    //\r
+    // Main processing loop\r
+    //\r
+    //==============================================================================\r
+    \r
+    namedWindow(wndTitle, 1);\r
+\r
+    Mat frame, gray, frameDisp;\r
+\r
+    for(;;)\r
+    {\r
+        // For camera and video file, capture the next image                \r
+        capture >> frame;\r
+        if (frame.empty())\r
+            break;\r
+                \r
+        cvtColor(frame, gray, CV_BGR2GRAY);\r
+\r
+        // process\r
+        NcvSize32u minSize = haar.ClassifierSize;\r
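+        // Minimum face size for the OpenCV detector below; in LargestFace mode it is\r
+        // enlarged relative to the frame so only sufficiently large faces are searched for.\r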
+        if (bLargestFace)\r
+        {\r
+            Ncv32u ratioX = preferredVideoFrameSize.width / minSize.width;\r
+            Ncv32u ratioY = preferredVideoFrameSize.height / minSize.height;\r
+            Ncv32u ratioSmallest = std::min(ratioX, ratioY);\r
+            ratioSmallest = (Ncv32u)std::max(ratioSmallest / 2.5f, 1.f);\r
+            minSize.width *= ratioSmallest;\r
+            minSize.height *= ratioSmallest;\r
+        }\r
+        \r
+        NcvTimer timer = ncvStartTimer();\r
+\r
+        if (!bUseOpenCV)\r
+        {\r
+            ncvStat = process(&gray, frameSize.width, frameSize.height,\r
+                              bShowAllHypotheses, bLargestFace, haar,\r
+                              d_haarStages, d_haarNodes,\r
+                              d_haarFeatures, h_haarStages,\r
+                              gpuAllocator, cpuAllocator, devProp);\r
+            ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error in NCV face detection pass", -1);\r
+        }\r
+        else\r
+        {\r
+            vector<Rect> rectsOpenCV;\r
+\r
+            classifierOpenCV.detectMultiScale(\r
+                gray,\r
+                rectsOpenCV,\r
+                1.2f,\r
+                bShowAllHypotheses && !bLargestFace ? 0 : 4,\r
+                (bLargestFace ? CV_HAAR_FIND_BIGGEST_OBJECT : 0) | CV_HAAR_SCALE_IMAGE,\r
+                Size(minSize.width, minSize.height));\r
+\r
+            for (size_t rt = 0; rt < rectsOpenCV.size(); ++rt)\r
+                rectangle(gray, rectsOpenCV[rt], Scalar(255));\r
+        }\r
+\r
+        Ncv32f avgTime = (Ncv32f)ncvEndQueryTimerMs(timer);\r
+        \r
+        cvtColor(gray, frameDisp, CV_GRAY2BGR);\r
+\r
+        imagePrintf(frameDisp, 0, CV_RGB(255,  0,0), "Space - Switch NCV%s / OpenCV%s", bUseOpenCV?"":" (ON)", bUseOpenCV?" (ON)":"");\r
+        imagePrintf(frameDisp, 1, CV_RGB(255,  0,0), "L - Switch FullSearch%s / LargestFace%s modes", bLargestFace?"":" (ON)", bLargestFace?" (ON)":"");\r
+        imagePrintf(frameDisp, 2, CV_RGB(255,  0,0), "U - Toggle unfiltered hypotheses visualization in FullSearch %s", bShowAllHypotheses?"(ON)":"(OFF)");\r
+        imagePrintf(frameDisp, 3, CV_RGB(118,185,0), "   Running at %f FPS on %s", 1000.0f / avgTime, bUseOpenCV?"CPU":"GPU");\r
+\r
+        cv::imshow(wndTitle, frameDisp);\r
+\r
+        switch (cvWaitKey(1))\r
+        {\r
+        case ' ':\r
+            bUseOpenCV = !bUseOpenCV;\r
+            break;\r
+        case 'L':case 'l':\r
+            bLargestFace = !bLargestFace;\r
+            break;\r
+        case 'U':case 'u':\r
+            bShowAllHypotheses = !bShowAllHypotheses;\r
+            break;\r
+        case 27:\r
+            return 0;            \r
+        }\r
+    }\r
+        \r
+    return 0;\r
+}\r
diff --git a/modules/gpu/src/nvidia/NCV.cpp b/modules/gpu/src/nvidia/NCV.cpp
new file mode 100644 (file)
index 0000000..fef9c08
--- /dev/null
@@ -0,0 +1,571 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. \r
+// \r
+//  By downloading, copying, installing or using the software you agree to this license.\r
+//  If you do not agree to this license, do not download, install,\r
+//  copy or use the software.\r
+//\r
+//\r
+//                           License Agreement\r
+//                For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+//   * Redistribution's of source code must retain the above copyright notice,\r
+//     this list of conditions and the following disclaimer.\r
+//\r
+//   * Redistribution's in binary form must reproduce the above copyright notice,\r
+//     this list of conditions and the following disclaimer in the documentation\r
+//     and/or other materials provided with the distribution.\r
+//\r
+//   * The name of the copyright holders may not be used to endorse or promote products\r
+//     derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+\r
+#include <precomp.hpp>\r
+\r
+\r
+#if !defined (HAVE_CUDA)\r
+\r
+\r
+#else /* !defined (HAVE_CUDA) */\r
+\r
+\r
+#include <stdarg.h>\r
+#include "NCV.hpp"\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Error handling helpers\r
+//\r
+//==============================================================================\r
+\r
+\r
+static void stdioDebugOutput(const char *msg)\r
+{\r
+    printf("%s", msg);\r
+}\r
+\r
+\r
+static NCVDebugOutputHandler *debugOutputHandler = stdioDebugOutput;\r
+\r
+\r
+void ncvDebugOutput(const char *msg, ...)\r
+{\r
+    const int K_DEBUG_STRING_MAXLEN = 1024;\r
+    char buffer[K_DEBUG_STRING_MAXLEN];\r
+    va_list args;\r
+    va_start(args, msg);\r
+    vsnprintf_s(buffer, K_DEBUG_STRING_MAXLEN, K_DEBUG_STRING_MAXLEN-1, msg, args);\r
+    va_end (args);\r
+    debugOutputHandler(buffer);\r
+}\r
+\r
+\r
+void ncvSetDebugOutputHandler(NCVDebugOutputHandler *func)\r
+{\r
+    debugOutputHandler = func;\r
+}\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Memory wrappers and helpers\r
+//\r
+//==============================================================================\r
+\r
+\r
+NCVStatus GPUAlignmentValue(Ncv32u &alignment)\r
+{\r
+    int curDev;\r
+    cudaDeviceProp curProp;\r
+    ncvAssertCUDAReturn(cudaGetDevice(&curDev), NCV_CUDA_ERROR);\r
+    ncvAssertCUDAReturn(cudaGetDeviceProperties(&curProp, curDev), NCV_CUDA_ERROR);\r
+    alignment = curProp.textureAlignment; //GPUAlignmentValue(curProp.major);\r
+    return NCV_SUCCESS;\r
+}\r
+\r
+\r
+Ncv32u alignUp(Ncv32u what, Ncv32u alignment)\r
+{\r
+    Ncv32u alignMask = alignment-1;\r
+    Ncv32u inverseAlignMask = ~alignMask;\r
+    Ncv32u res = (what + alignMask) & inverseAlignMask;\r
+    return res;\r
+}\r
+\r
+\r
+void NCVMemPtr::clear()\r
+{\r
+    ptr = NULL;\r
+    memtype = NCVMemoryTypeNone;\r
+}\r
+\r
+\r
+void NCVMemSegment::clear()\r
+{\r
+    begin.clear();\r
+    size = 0;\r
+}\r
+\r
+\r
+NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType, const void *src, NCVMemoryType srcType, size_t sz, cudaStream_t cuStream)\r
+{\r
+    NCVStatus ncvStat;\r
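+    // Dispatch on destination/source residence: host<->host uses memcpy, everything else\r
+    // uses cudaMemcpy (asynchronous when a non-default stream is supplied).\r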
+    switch (dstType)\r
+    {\r
+    case NCVMemoryTypeHostPageable:\r
+    case NCVMemoryTypeHostPinned:\r
+        switch (srcType)\r
+        {\r
+        case NCVMemoryTypeHostPageable:\r
+        case NCVMemoryTypeHostPinned:\r
+            memcpy(dst, src, sz);\r
+            ncvStat = NCV_SUCCESS;\r
+            break;\r
+        case NCVMemoryTypeDevice:\r
+            if (cuStream != 0)\r
+            {\r
+                ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);\r
+            }\r
+            else\r
+            {\r
+                ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);\r
+            }\r
+            ncvStat = NCV_SUCCESS;\r
+            break;\r
+        default:\r
+            ncvStat = NCV_MEM_RESIDENCE_ERROR;\r
+        }\r
+        break;\r
+    case NCVMemoryTypeDevice:\r
+        switch (srcType)\r
+        {\r
+        case NCVMemoryTypeHostPageable:\r
+        case NCVMemoryTypeHostPinned:\r
+            if (cuStream != 0)\r
+            {\r
+                ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);\r
+            }\r
+            else\r
+            {\r
+                ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyHostToDevice), NCV_CUDA_ERROR);\r
+            }\r
+            ncvStat = NCV_SUCCESS;\r
+            break;\r
+        case NCVMemoryTypeDevice:\r
+            if (cuStream != 0)\r
+            {\r
+                ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyDeviceToDevice, cuStream), NCV_CUDA_ERROR);\r
+            }\r
+            else\r
+            {\r
+                ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyDeviceToDevice), NCV_CUDA_ERROR);\r
+            }\r
+            ncvStat = NCV_SUCCESS;\r
+            break;\r
+        default:\r
+            ncvStat = NCV_MEM_RESIDENCE_ERROR;\r
+        }\r
+        break;\r
+    default:\r
+        ncvStat = NCV_MEM_RESIDENCE_ERROR;\r
+    }\r
+\r
+    return ncvStat;\r
+}\r
+\r
+\r
+//===================================================================\r
+//\r
+// NCVMemStackAllocator class members implementation\r
+//\r
+//===================================================================\r
+\r
+\r
+NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)\r
+    :\r
+    currentSize(0),\r
+    _maxSize(0),\r
+    allocBegin(NULL),\r
+    begin(NULL),\r
+    _memType(NCVMemoryTypeNone),\r
+    _alignment(alignment)\r
+{\r
+    NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;\r
+    ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");\r
+}\r
+\r
+\r
+NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment)\r
+    :\r
+    currentSize(0),\r
+    _maxSize(0),\r
+    allocBegin(NULL),\r
+    _memType(memT),\r
+    _alignment(alignment)\r
+{\r
+    NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;\r
+    ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2");\r
+\r
+    allocBegin = NULL;\r
+\r
+    switch (memT)\r
+    {\r
+    case NCVMemoryTypeDevice:\r
+        ncvAssertCUDAReturn(cudaMalloc(&allocBegin, capacity), );\r
+        break;\r
+    case NCVMemoryTypeHostPinned:\r
+        ncvAssertCUDAReturn(cudaMallocHost(&allocBegin, capacity), );\r
+        break;\r
+    case NCVMemoryTypeHostPageable:\r
+        allocBegin = (Ncv8u *)malloc(capacity);\r
+        break;\r
+    }\r
+\r
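+    // A zero-capacity allocator still has to look initialized, so use a non-NULL\r
+    // sentinel pointer that is never dereferenced.\r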
+    if (capacity == 0)\r
+    {\r
+        allocBegin = (Ncv8u *)(0x1);\r
+    }\r
+\r
+    if (!isCounting())\r
+    {\r
+        begin = allocBegin;\r
+        end = begin + capacity;\r
+    }\r
+}\r
+\r
+\r
+NCVMemStackAllocator::~NCVMemStackAllocator()\r
+{\r
+    if (allocBegin != NULL)\r
+    {\r
+        ncvAssertPrintCheck(currentSize == 0, "NCVMemStackAllocator dtor:: not all objects were deallocated properly, forcing destruction");\r
+        switch (_memType)\r
+        {\r
+        case NCVMemoryTypeDevice:\r
+            ncvAssertCUDAReturn(cudaFree(allocBegin), );\r
+            break;\r
+        case NCVMemoryTypeHostPinned:\r
+            ncvAssertCUDAReturn(cudaFreeHost(allocBegin), );\r
+            break;\r
+        case NCVMemoryTypeHostPageable:\r
+            free(allocBegin);\r
+            break;\r
+        }\r
+        allocBegin = NULL;\r
+    }\r
+}\r
+\r
+\r
+NCVStatus NCVMemStackAllocator::alloc(NCVMemSegment &seg, size_t size)\r
+{\r
+    seg.clear();\r
+    ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
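+    // Stack (bump-pointer) allocation: round the request up to the alignment and advance\r
+    // 'begin'; deallocation must happen in reverse (LIFO) order - see dealloc().\r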
+    size = alignUp(size, this->_alignment);\r
+    this->currentSize += size;\r
+    this->_maxSize = std::max(this->_maxSize, this->currentSize);\r
+\r
+    if (!isCounting())\r
+    {\r
+        size_t availSize = end - begin;\r
+        ncvAssertReturn(size <= availSize, NCV_ALLOCATOR_INSUFFICIENT_CAPACITY);\r
+    }\r
+\r
+    seg.begin.ptr = begin;\r
+    seg.begin.memtype = this->_memType;\r
+    seg.size = size;\r
+    begin += size;\r
+\r
+    return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NCVStatus NCVMemStackAllocator::dealloc(NCVMemSegment &seg)\r
+{\r
+    ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);\r
+    ncvAssertReturn(seg.begin.memtype == this->_memType, NCV_ALLOCATOR_BAD_DEALLOC);\r
+    ncvAssertReturn(seg.begin.ptr != NULL || isCounting(), NCV_ALLOCATOR_BAD_DEALLOC);\r
+    ncvAssertReturn(seg.begin.ptr == begin - seg.size, NCV_ALLOCATOR_DEALLOC_ORDER);\r
+\r
+    currentSize -= seg.size;\r
+    begin -= seg.size;\r
+\r
+    seg.clear();\r
+\r
+    ncvAssertReturn(allocBegin <= begin, NCV_ALLOCATOR_BAD_DEALLOC);\r
+\r
+    return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NcvBool NCVMemStackAllocator::isInitialized(void) const\r
+{\r
+    return ((this->_alignment & (this->_alignment-1)) == 0) && isCounting() || this->allocBegin != NULL;\r
+}\r
+\r
+\r
+NcvBool NCVMemStackAllocator::isCounting(void) const\r
+{\r
+    return this->_memType == NCVMemoryTypeNone;\r
+}\r
+\r
+\r
+NCVMemoryType NCVMemStackAllocator::memType(void) const\r
+{\r
+    return this->_memType;\r
+}\r
+\r
+\r
+Ncv32u NCVMemStackAllocator::alignment(void) const\r
+{\r
+    return this->_alignment;\r
+}\r
+\r
+\r
+size_t NCVMemStackAllocator::maxSize(void) const\r
+{\r
+    return this->_maxSize;\r
+}\r
+\r
+\r
+//===================================================================\r
+//\r
+// NCVMemNativeAllocator class members implementation\r
+//\r
+//===================================================================\r
+\r
+\r
+NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT)\r
+    :\r
+    currentSize(0),\r
+    _maxSize(0),\r
+    _memType(memT)\r
+{\r
+    ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );\r
+    ncvAssertPrintReturn(NCV_SUCCESS == GPUAlignmentValue(this->_alignment), "NCVMemNativeAllocator ctor:: couldn't get device _alignment", );\r
+}\r
+\r
+\r
+NCVMemNativeAllocator::~NCVMemNativeAllocator()\r
+{\r
+    ncvAssertPrintCheck(currentSize == 0, "NCVMemNativeAllocator dtor:: detected memory leak");\r
+}\r
+\r
+\r
+NCVStatus NCVMemNativeAllocator::alloc(NCVMemSegment &seg, size_t size)\r
+{\r
+    seg.clear();\r
+    ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+    switch (this->_memType)\r
+    {\r
+    case NCVMemoryTypeDevice:\r
+        ncvAssertCUDAReturn(cudaMalloc(&seg.begin.ptr, size), NCV_CUDA_ERROR);\r
+        break;\r
+    case NCVMemoryTypeHostPinned:\r
+        ncvAssertCUDAReturn(cudaMallocHost(&seg.begin.ptr, size), NCV_CUDA_ERROR);\r
+        break;\r
+    case NCVMemoryTypeHostPageable:\r
+        seg.begin.ptr = (Ncv8u *)malloc(size);\r
+        break;\r
+    }\r
+\r
+    this->currentSize += alignUp(size, this->_alignment);\r
+    this->_maxSize = std::max(this->_maxSize, this->currentSize);\r
+\r
+    seg.begin.memtype = this->_memType;\r
+    seg.size = size;\r
+\r
+    return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NCVStatus NCVMemNativeAllocator::dealloc(NCVMemSegment &seg)\r
+{\r
+    ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);\r
+    ncvAssertReturn(seg.begin.memtype == this->_memType, NCV_ALLOCATOR_BAD_DEALLOC);\r
+    ncvAssertReturn(seg.begin.ptr != NULL, NCV_ALLOCATOR_BAD_DEALLOC);\r
+\r
+    ncvAssertReturn(currentSize >= alignUp(seg.size, this->_alignment), NCV_ALLOCATOR_BAD_DEALLOC);\r
+    currentSize -= alignUp(seg.size, this->_alignment);\r
+\r
+    switch (this->_memType)\r
+    {\r
+    case NCVMemoryTypeDevice:\r
+        ncvAssertCUDAReturn(cudaFree(seg.begin.ptr), NCV_CUDA_ERROR);\r
+        break;\r
+    case NCVMemoryTypeHostPinned:\r
+        ncvAssertCUDAReturn(cudaFreeHost(seg.begin.ptr), NCV_CUDA_ERROR);\r
+        break;\r
+    case NCVMemoryTypeHostPageable:\r
+        free(seg.begin.ptr);\r
+        break;\r
+    }\r
+\r
+    seg.clear();\r
+\r
+    return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NcvBool NCVMemNativeAllocator::isInitialized(void) const\r
+{\r
+    return (this->_alignment != 0);\r
+}\r
+\r
+\r
+NcvBool NCVMemNativeAllocator::isCounting(void) const\r
+{\r
+    return false;\r
+}\r
+\r
+\r
+NCVMemoryType NCVMemNativeAllocator::memType(void) const\r
+{\r
+    return this->_memType;\r
+}\r
+\r
+\r
+Ncv32u NCVMemNativeAllocator::alignment(void) const\r
+{\r
+    return this->_alignment;\r
+}\r
+\r
+\r
+size_t NCVMemNativeAllocator::maxSize(void) const\r
+{\r
+    return this->_maxSize;\r
+}\r
+\r
+\r
+//===================================================================\r
+//\r
+// Time and timer routines\r
+//\r
+//===================================================================\r
+\r
+\r
+typedef struct _NcvTimeMoment NcvTimeMoment;\r
+\r
+#if defined(_WIN32) || defined(_WIN64)\r
+\r
+    #include <Windows.h>\r
+\r
+    typedef struct _NcvTimeMoment\r
+    {\r
+        LONGLONG moment, freq;\r
+    } NcvTimeMoment;\r
+\r
+\r
+    static void _ncvQueryMoment(NcvTimeMoment *t)\r
+    {\r
+        QueryPerformanceFrequency((LARGE_INTEGER *)&(t->freq));\r
+        QueryPerformanceCounter((LARGE_INTEGER *)&(t->moment));\r
+    }\r
+\r
+\r
+    double _ncvMomentToMicroseconds(NcvTimeMoment *t)\r
+    {\r
+        return 1000000.0 * t->moment / t->freq;\r
+    }\r
+\r
+\r
+    double _ncvMomentsDiffToMicroseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)\r
+    {\r
+        return 1000000.0 * 2 * ((t2->moment) - (t1->moment)) / (t1->freq + t2->freq);\r
+    }\r
+\r
+\r
+    double _ncvMomentsDiffToMilliseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)\r
+    {\r
+        return 1000.0 * 2 * ((t2->moment) - (t1->moment)) / (t1->freq + t2->freq);\r
+    }\r
+\r
+#elif defined(__unix__)\r
+\r
+    #include <sys/time.h>\r
+\r
+    typedef struct _NcvTimeMoment\r
+    {\r
+        struct timeval tv; \r
+        struct timezone tz;\r
+    } NcvTimeMoment;\r
+\r
+\r
+    void _ncvQueryMoment(NcvTimeMoment *t)\r
+    {\r
+        gettimeofday(& t->tv, & t->tz);\r
+    }\r
+\r
+\r
+    double _ncvMomentToMicroseconds(NcvTimeMoment *t)\r
+    {\r
+        return 1000000.0 * t->tv.tv_sec + (double)t->tv.tv_usec;\r
+    }\r
+\r
+\r
+    double _ncvMomentsDiffToMicroseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)\r
+    {\r
+        return (((double)t2->tv.tv_sec - (double)t1->tv.tv_sec) * 1000000 + (double)t2->tv.tv_usec - (double)t1->tv.tv_usec);\r
+    }\r
+\r
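+    // Needed by ncvEndQueryTimerMs() below (the Windows branch defines an equivalent).\r
+    double _ncvMomentsDiffToMilliseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)\r
+    {\r
+        return _ncvMomentsDiffToMicroseconds(t1, t2) / 1000.0;\r
+    }\r
+\r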
+\r
+#endif //#if defined(_WIN32) || defined(_WIN64)\r
+\r
+\r
+struct _NcvTimer\r
+{\r
+    NcvTimeMoment t1, t2;\r
+};\r
+\r
+\r
+NcvTimer ncvStartTimer(void)\r
+{\r
+    struct _NcvTimer *t;\r
+    t = (struct _NcvTimer *)malloc(sizeof(struct _NcvTimer));\r
+    _ncvQueryMoment(&t->t1);\r
+    return t;\r
+}\r
+\r
+\r
+double ncvEndQueryTimerUs(NcvTimer t)\r
+{\r
+    double res;\r
+    _ncvQueryMoment(&t->t2);\r
+    res = _ncvMomentsDiffToMicroseconds(&t->t1, &t->t2);\r
+    free(t);\r
+    return res;\r
+}\r
+\r
+\r
+double ncvEndQueryTimerMs(NcvTimer t)\r
+{\r
+    double res;\r
+    _ncvQueryMoment(&t->t2);\r
+    res = _ncvMomentsDiffToMilliseconds(&t->t1, &t->t2);\r
+    free(t);\r
+    return res;\r
+}\r
+\r
+#endif /* !defined (HAVE_CUDA) */
\ No newline at end of file
diff --git a/modules/gpu/src/nvidia/NCV.hpp b/modules/gpu/src/nvidia/NCV.hpp
new file mode 100644 (file)
index 0000000..a71f650
--- /dev/null
@@ -0,0 +1,837 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. \r
+// \r
+//  By downloading, copying, installing or using the software you agree to this license.\r
+//  If you do not agree to this license, do not download, install,\r
+//  copy or use the software.\r
+//\r
+//\r
+//                           License Agreement\r
+//                For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+//   * Redistribution's of source code must retain the above copyright notice,\r
+//     this list of conditions and the following disclaimer.\r
+//\r
+//   * Redistribution's in binary form must reproduce the above copyright notice,\r
+//     this list of conditions and the following disclaimer in the documentation\r
+//     and/or other materials provided with the distribution.\r
+//\r
+//   * The name of the copyright holders may not be used to endorse or promote products\r
+//     derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+#ifndef _ncv_hpp_\r
+#define _ncv_hpp_\r
+\r
+#include <cuda_runtime.h>\r
+#include "npp_staging.h"\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Alignment macros\r
+//\r
+//==============================================================================\r
+\r
+\r
+#if !defined(__align__) && !defined(__CUDACC__)\r
+    #if defined(_WIN32) || defined(_WIN64)\r
+        #define __align__(n)         __declspec(align(n))\r
+    #elif defined(__unix__)\r
+        #define __align__(n)         __attribute__((__aligned__(n)))\r
+    #endif\r
+#endif\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Integral and compound types of guaranteed size\r
+//\r
+//==============================================================================\r
+\r
+\r
+typedef               bool NcvBool;\r
+typedef          long long Ncv64s;\r
+typedef unsigned long long Ncv64u;\r
+typedef                int Ncv32s;\r
+typedef       unsigned int Ncv32u;\r
+typedef              short Ncv16s;\r
+typedef     unsigned short Ncv16u;\r
+typedef               char Ncv8s;\r
+typedef      unsigned char Ncv8u;\r
+typedef              float Ncv32f;\r
+typedef             double Ncv64f;\r
+\r
+\r
+typedef struct\r
+{\r
+    Ncv8u x;\r
+    Ncv8u y;\r
+    Ncv8u width;\r
+    Ncv8u height;\r
+} NcvRect8u;\r
+\r
+\r
+typedef struct\r
+{\r
+    Ncv32s x;          ///< x-coordinate of upper left corner.\r
+    Ncv32s y;          ///< y-coordinate of upper left corner.\r
+    Ncv32s width;      ///< Rectangle width.\r
+    Ncv32s height;     ///< Rectangle height.\r
+} NcvRect32s;\r
+\r
+\r
+typedef struct\r
+{\r
+    Ncv32u x;          ///< x-coordinate of upper left corner.\r
+    Ncv32u y;          ///< y-coordinate of upper left corner.\r
+    Ncv32u width;      ///< Rectangle width.\r
+    Ncv32u height;     ///< Rectangle height.\r
+} NcvRect32u;\r
+\r
+\r
+typedef struct \r
+{\r
+    Ncv32s width;  ///< Rectangle width.\r
+    Ncv32s height; ///< Rectangle height.\r
+} NcvSize32s;\r
+\r
+\r
+typedef struct \r
+{\r
+    Ncv32u width;  ///< Rectangle width.\r
+    Ncv32u height; ///< Rectangle height.\r
+} NcvSize32u;\r
+\r
+\r
+NPPST_CT_ASSERT(sizeof(NcvBool) <= 4);\r
+NPPST_CT_ASSERT(sizeof(Ncv64s) == 8);\r
+NPPST_CT_ASSERT(sizeof(Ncv64u) == 8);\r
+NPPST_CT_ASSERT(sizeof(Ncv32s) == 4);\r
+NPPST_CT_ASSERT(sizeof(Ncv32u) == 4);\r
+NPPST_CT_ASSERT(sizeof(Ncv16s) == 2);\r
+NPPST_CT_ASSERT(sizeof(Ncv16u) == 2);\r
+NPPST_CT_ASSERT(sizeof(Ncv8s) == 1);\r
+NPPST_CT_ASSERT(sizeof(Ncv8u) == 1);\r
+NPPST_CT_ASSERT(sizeof(Ncv32f) == 4);\r
+NPPST_CT_ASSERT(sizeof(Ncv64f) == 8);\r
+NPPST_CT_ASSERT(sizeof(NcvRect8u) == sizeof(Ncv32u));\r
+NPPST_CT_ASSERT(sizeof(NcvRect32s) == 4 * sizeof(Ncv32s));\r
+NPPST_CT_ASSERT(sizeof(NcvRect32u) == 4 * sizeof(Ncv32u));\r
+NPPST_CT_ASSERT(sizeof(NcvSize32u) == 2 * sizeof(Ncv32u));\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Persistent constants\r
+//\r
+//==============================================================================\r
+\r
+\r
+const Ncv32u K_WARP_SIZE = 32;\r
+const Ncv32u K_LOG2_WARP_SIZE = 5;\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Error handling\r
+//\r
+//==============================================================================\r
+\r
+\r
+#define NCV_CT_PREP_STRINGIZE_AUX(x)    #x\r
+#define NCV_CT_PREP_STRINGIZE(x)        NCV_CT_PREP_STRINGIZE_AUX(x)\r
+\r
+\r
+void ncvDebugOutput(const char *msg, ...);\r
+\r
+\r
+typedef void NCVDebugOutputHandler(const char* msg);\r
+\r
+\r
+void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);\r
+\r
+\r
+#define ncvAssertPrintCheck(pred, msg) \\r
+    ((pred) ? true : (ncvDebugOutput("\n%s\n", \\r
+    "NCV Assertion Failed: " msg ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__) \\r
+    ), false))\r
+\r
+\r
+#define ncvAssertPrintReturn(pred, msg, err) \\r
+    if (ncvAssertPrintCheck(pred, msg)) ; else return err\r
+\r
+\r
+#define ncvAssertReturn(pred, err) \\r
+    do \\r
+    { \\r
+        if (!(pred)) \\r
+        { \\r
+            ncvDebugOutput("\n%s%d%s\n", "NCV Assertion Failed: retcode=", (int)err, ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \\r
+            return err; \\r
+        } \\r
+    } while (0)\r
+\r
+\r
+#define ncvAssertReturnNcvStat(ncvOp) \\r
+    do \\r
+    { \\r
+        NCVStatus _ncvStat = ncvOp; \\r
+        if (NCV_SUCCESS != _ncvStat) \\r
+        { \\r
+            ncvDebugOutput("\n%s%d%s\n", "NCV Assertion Failed: NcvStat=", (int)_ncvStat, ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \\r
+            return _ncvStat; \\r
+        } \\r
+    } while (0)\r
+\r
+\r
+#define ncvAssertCUDAReturn(cudacall, errCode) \\r
+    do \\r
+    { \\r
+        cudaError_t resCall = cudacall; \\r
+        cudaError_t resGLE = cudaGetLastError(); \\r
+        if (cudaSuccess != resCall || cudaSuccess != resGLE) \\r
+        { \\r
+            ncvDebugOutput("\n%s%d%s\n", "NCV CUDA Assertion Failed: cudaError_t=", (int)(resCall | resGLE), ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \\r
+            return errCode; \\r
+        } \\r
+    } while (0)\r
+\r
+\r
+/**\r
+* Return-codes for status notification, errors and warnings\r
+*/\r
+enum NCVStatus\r
+{\r
+    NCV_SUCCESS,\r
+\r
+    NCV_CUDA_ERROR,\r
+    NCV_NPP_ERROR,\r
+    NCV_FILE_ERROR,\r
+\r
+    NCV_NULL_PTR,\r
+    NCV_INCONSISTENT_INPUT,\r
+    NCV_TEXTURE_BIND_ERROR,\r
+    NCV_DIMENSIONS_INVALID,\r
+\r
+    NCV_INVALID_ROI,\r
+    NCV_INVALID_STEP,\r
+    NCV_INVALID_SCALE,\r
+\r
+    NCV_ALLOCATOR_NOT_INITIALIZED,\r
+    NCV_ALLOCATOR_BAD_ALLOC,\r
+    NCV_ALLOCATOR_BAD_DEALLOC,\r
+    NCV_ALLOCATOR_INSUFFICIENT_CAPACITY,\r
+    NCV_ALLOCATOR_DEALLOC_ORDER,\r
+    NCV_ALLOCATOR_BAD_REUSE,\r
+\r
+    NCV_MEM_COPY_ERROR,\r
+    NCV_MEM_RESIDENCE_ERROR,\r
+    NCV_MEM_INSUFFICIENT_CAPACITY,\r
+\r
+    NCV_HAAR_INVALID_PIXEL_STEP,\r
+    NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER,\r
+    NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE,\r
+    NCV_HAAR_TOO_LARGE_FEATURES,\r
+    NCV_HAAR_XML_LOADING_EXCEPTION,\r
+\r
+    NCV_NOIMPL_HAAR_TILTED_FEATURES,\r
+\r
+    NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW,\r
+};\r
+\r
+\r
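+/**\r
+* Skip-condition helpers: code wrapped in NCV_SKIP_COND_BEGIN/END is bypassed when the\r
+* skip condition is set, e.g. during a memory-counting pass with a counting allocator\r
+*/\r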
+#define NCV_SET_SKIP_COND(x) \\r
+    bool __ncv_skip_cond = x\r
+\r
+\r
+#define NCV_RESET_SKIP_COND(x) \\r
+    __ncv_skip_cond = x\r
+\r
+\r
+#define NCV_SKIP_COND_BEGIN \\r
+    if (!__ncv_skip_cond) {\r
+\r
+\r
+#define NCV_SKIP_COND_END \\r
+    }\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Timer\r
+//\r
+//==============================================================================\r
+\r
+\r
+typedef struct _NcvTimer *NcvTimer;\r
+\r
+NcvTimer ncvStartTimer(void);\r
+\r
+double ncvEndQueryTimerUs(NcvTimer t);\r
+\r
+double ncvEndQueryTimerMs(NcvTimer t);\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Memory management classes template compound types\r
+//\r
+//==============================================================================\r
+\r
+\r
+/**\r
+* Alignment of GPU memory chunks in bytes\r
+*/\r
+NCVStatus GPUAlignmentValue(Ncv32u &alignment);\r
+\r
+\r
+/**\r
+* Rounds a value up to the nearest multiple of the given alignment\r
+*/\r
+Ncv32u alignUp(Ncv32u what, Ncv32u alignment);\r
+\r
+\r
+/**\r
+* NCVMemoryType\r
+*/\r
+enum NCVMemoryType\r
+{\r
+    NCVMemoryTypeNone,\r
+    NCVMemoryTypeHostPageable,\r
+    NCVMemoryTypeHostPinned,\r
+    NCVMemoryTypeDevice\r
+};\r
+\r
+\r
+/**\r
+* NCVMemPtr\r
+*/\r
+struct NCVMemPtr\r
+{\r
+    void *ptr;\r
+    NCVMemoryType memtype;\r
+    void clear();\r
+};\r
+\r
+\r
+/**\r
+* NCVMemSegment\r
+*/\r
+struct NCVMemSegment\r
+{\r
+    NCVMemPtr begin;\r
+    size_t size;\r
+    void clear();\r
+};\r
+\r
+\r
+/**\r
+* INCVMemAllocator (Interface)\r
+*/\r
+class INCVMemAllocator\r
+{\r
+public:\r
+    virtual ~INCVMemAllocator() = 0;\r
+\r
+    virtual NCVStatus alloc(NCVMemSegment &seg, size_t size) = 0;\r
+    virtual NCVStatus dealloc(NCVMemSegment &seg) = 0;\r
+\r
+    virtual NcvBool isInitialized(void) const = 0;\r
+    virtual NcvBool isCounting(void) const = 0;\r
+    \r
+    virtual NCVMemoryType memType(void) const = 0;\r
+    virtual Ncv32u alignment(void) const = 0;\r
+    virtual size_t maxSize(void) const = 0;\r
+};\r
+\r
+inline INCVMemAllocator::~INCVMemAllocator() {}\r
+\r
+\r
+/**\r
+* NCVMemStackAllocator\r
+*/\r
+class NCVMemStackAllocator : public INCVMemAllocator\r
+{\r
+    NCVMemStackAllocator();\r
+    NCVMemStackAllocator(const NCVMemStackAllocator &);\r
+\r
+public:\r
+\r
+    explicit NCVMemStackAllocator(Ncv32u alignment);\r
+    NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment);\r
+    virtual ~NCVMemStackAllocator();\r
+\r
+    virtual NCVStatus alloc(NCVMemSegment &seg, size_t size);\r
+    virtual NCVStatus dealloc(NCVMemSegment &seg);\r
+\r
+    virtual NcvBool isInitialized(void) const;\r
+    virtual NcvBool isCounting(void) const;\r
+\r
+    virtual NCVMemoryType memType(void) const;\r
+    virtual Ncv32u alignment(void) const;\r
+    virtual size_t maxSize(void) const;\r
+\r
+private:\r
+\r
+    NCVMemoryType _memType;\r
+    Ncv32u _alignment;\r
+    Ncv8u *allocBegin;\r
+    Ncv8u *begin;\r
+    Ncv8u *end;\r
+    size_t currentSize;\r
+    size_t _maxSize;\r
+};\r
+\r
+\r
+/**\r
+* NCVMemNativeAllocator\r
+*/\r
+class NCVMemNativeAllocator : public INCVMemAllocator\r
+{\r
+public:\r
+\r
+    NCVMemNativeAllocator(NCVMemoryType memT);\r
+    virtual ~NCVMemNativeAllocator();\r
+\r
+    virtual NCVStatus alloc(NCVMemSegment &seg, size_t size);\r
+    virtual NCVStatus dealloc(NCVMemSegment &seg);\r
+\r
+    virtual NcvBool isInitialized(void) const;\r
+    virtual NcvBool isCounting(void) const;\r
+\r
+    virtual NCVMemoryType memType(void) const;\r
+    virtual Ncv32u alignment(void) const;\r
+    virtual size_t maxSize(void) const;\r
+\r
+private:\r
+\r
+    NCVMemNativeAllocator();\r
+    NCVMemNativeAllocator(const NCVMemNativeAllocator &);\r
+\r
+    NCVMemoryType _memType;\r
+    Ncv32u _alignment;\r
+    size_t currentSize;\r
+    size_t _maxSize;\r
+};\r
+\r
+\r
+/**\r
+* Copy dispatcher\r
+*/\r
+NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType,\r
+                           const void *src, NCVMemoryType srcType,\r
+                           size_t sz, cudaStream_t cuStream);\r
+\r
+\r
+/**\r
+* NCVVector (1D)\r
+*/\r
+template <class T>\r
+class NCVVector\r
+{\r
+    NCVVector(const NCVVector &);\r
+\r
+public:\r
+\r
+    NCVVector()\r
+    {\r
+        clear();\r
+    }\r
+\r
+    virtual ~NCVVector() {}\r
+\r
+    void clear()\r
+    {\r
+        _ptr = NULL;\r
+        _length = 0;\r
+        _memtype = NCVMemoryTypeNone;\r
+    }\r
+\r
+    NCVStatus copySolid(NCVVector<T> &dst, cudaStream_t cuStream, size_t howMuch=0)\r
+    {\r
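+        // howMuch == 0 means "copy the whole vector"; otherwise howMuch is a byte count\r
+        // that must fit into both vectors.\r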
+        if (howMuch == 0)\r
+        {\r
+            ncvAssertReturn(dst._length == this->_length, NCV_MEM_COPY_ERROR);\r
+            howMuch = this->_length * sizeof(T);\r
+        }\r
+        else\r
+        {\r
+            ncvAssertReturn(dst._length * sizeof(T) >= howMuch && \r
+                this->_length * sizeof(T) >= howMuch &&\r
+                howMuch > 0, NCV_MEM_COPY_ERROR);\r
+        }\r
+        ncvAssertReturn((this->_ptr != NULL || this->_memtype == NCVMemoryTypeNone) && \r
+                        (dst._ptr != NULL || dst._memtype == NCVMemoryTypeNone), NCV_NULL_PTR);\r
+\r
+        NCVStatus ncvStat = NCV_SUCCESS;\r
+        if (this->_memtype != NCVMemoryTypeNone)\r
+        {\r
+            ncvStat = memSegCopyHelper(dst._ptr, dst._memtype,\r
+                                       this->_ptr, this->_memtype,\r
+                                       howMuch, cuStream);\r
+        }\r
+\r
+        return ncvStat;\r
+    }\r
+\r
+    T *ptr() const {return this->_ptr;}\r
+    size_t length() const {return this->_length;}\r
+    NCVMemoryType memType() const {return this->_memtype;}\r
+\r
+protected:\r
+\r
+    T *_ptr;\r
+    size_t _length;\r
+    NCVMemoryType _memtype;\r
+};\r
+\r
+\r
+/**\r
+* NCVVectorAlloc\r
+*/\r
+template <class T>\r
+class NCVVectorAlloc : public NCVVector<T>\r
+{\r
+    NCVVectorAlloc();\r
+    NCVVectorAlloc(const NCVVectorAlloc &);\r
+\r
+public:\r
+\r
+    NCVVectorAlloc(INCVMemAllocator &allocator, Ncv32u length)\r
+        :\r
+        allocator(allocator)\r
+    {\r
+        NCVStatus ncvStat;\r
+\r
+        this->clear();\r
+        this->allocatedMem.clear();\r
+\r
+        ncvStat = allocator.alloc(this->allocatedMem, length * sizeof(T));\r
+        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "NCVVectorAlloc ctor:: alloc failed", );\r
+\r
+        this->_ptr = (T *)this->allocatedMem.begin.ptr;\r
+        this->_length = length;\r
+        this->_memtype = this->allocatedMem.begin.memtype;\r
+    }\r
+\r
+\r
+    ~NCVVectorAlloc()\r
+    {\r
+        NCVStatus ncvStat;\r
+\r
+        ncvStat = allocator.dealloc(this->allocatedMem);\r
+        ncvAssertPrintCheck(ncvStat == NCV_SUCCESS, "NCVVectorAlloc dtor:: dealloc failed");\r
+\r
+        this->clear();\r
+    }\r
+\r
+\r
+    NcvBool isMemAllocated() const\r
+    {\r
+        return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());\r
+    }\r
+\r
+\r
+    Ncv32u getAllocatorsAlignment() const\r
+    {\r
+        return allocator.alignment();\r
+    }\r
+\r
+\r
+    NCVMemSegment getSegment() const\r
+    {\r
+        return allocatedMem;\r
+    }\r
+\r
+private:\r
+\r
+    INCVMemAllocator &allocator;\r
+    NCVMemSegment allocatedMem;\r
+};\r
+\r
+\r
+/**\r
+* NCVVectorReuse\r
+*/\r
+template <class T>\r
+class NCVVectorReuse : public NCVVector<T>\r
+{\r
+    NCVVectorReuse();\r
+    NCVVectorReuse(const NCVVectorReuse &);\r
+\r
+public:\r
+\r
+    explicit NCVVectorReuse(const NCVMemSegment &memSegment)\r
+    {\r
+        this->bReused = false;\r
+        this->clear();\r
+\r
+        this->_length = memSegment.size / sizeof(T);\r
+        this->_ptr = (T *)memSegment.begin.ptr;\r
+        this->_memtype = memSegment.begin.memtype;\r
+\r
+        this->bReused = true;\r
+    }\r
+\r
+\r
+    NCVVectorReuse(const NCVMemSegment &memSegment, Ncv32u length)\r
+    {\r
+        this->bReused = false;\r
+        this->clear();\r
+\r
+        ncvAssertPrintReturn(length * sizeof(T) <= memSegment.size, \\r
+            "NCVVectorReuse ctor:: memory binding failed due to size mismatch", );\r
+\r
+        this->_length = length;\r
+        this->_ptr = (T *)memSegment.begin.ptr;\r
+        this->_memtype = memSegment.begin.memtype;\r
+\r
+        this->bReused = true;\r
+    }\r
+\r
+\r
+    NcvBool isMemReused() const\r
+    {\r
+        return this->bReused;\r
+    }\r
+\r
+private:\r
+\r
+    NcvBool bReused;\r
+};\r
+\r
+\r
+/**\r
+* NCVMatrix (2D)\r
+*/\r
+template <class T>\r
+class NCVMatrix\r
+{\r
+    NCVMatrix(const NCVMatrix &);\r
+\r
+public:\r
+\r
+    NCVMatrix()\r
+    {\r
+        clear();\r
+    }\r
+\r
+    virtual ~NCVMatrix() {}\r
+\r
+\r
+    void clear()\r
+    {\r
+        _ptr = NULL;\r
+        _pitch = 0;\r
+        _width = 0;\r
+        _height = 0;\r
+        _memtype = NCVMemoryTypeNone;\r
+    }\r
+\r
+\r
+    Ncv32u stride() const\r
+    {\r
+        return _pitch / sizeof(T);\r
+    }\r
+\r
+\r
+    NCVStatus copySolid(NCVMatrix<T> &dst, cudaStream_t cuStream, size_t howMuch=0)\r
+    {\r
+        if (howMuch == 0)\r
+        {\r
+            ncvAssertReturn(dst._pitch == this->_pitch &&\r
+                            dst._height == this->_height, NCV_MEM_COPY_ERROR);\r
+            howMuch = this->_pitch * this->_height;\r
+        }\r
+        else\r
+        {\r
+            ncvAssertReturn(dst._pitch * dst._height >= howMuch && \r
+                            this->_pitch * this->_height >= howMuch &&\r
+                            howMuch > 0, NCV_MEM_COPY_ERROR);\r
+        }\r
+        ncvAssertReturn((this->_ptr != NULL || this->_memtype == NCVMemoryTypeNone) && \r
+                        (dst._ptr != NULL || dst._memtype == NCVMemoryTypeNone), NCV_NULL_PTR);\r
+\r
+        NCVStatus ncvStat = NCV_SUCCESS;\r
+        if (this->_memtype != NCVMemoryTypeNone)\r
+        {\r
+            ncvStat = memSegCopyHelper(dst._ptr, dst._memtype, \r
+                                       this->_ptr, this->_memtype, \r
+                                       howMuch, cuStream);\r
+        }\r
+\r
+        return ncvStat;\r
+    }\r
+\r
+    T *ptr() const {return this->_ptr;}\r
+    Ncv32u width() const {return this->_width;}\r
+    Ncv32u height() const {return this->_height;}\r
+    Ncv32u pitch() const {return this->_pitch;}\r
+    NCVMemoryType memType() const {return this->_memtype;}\r
+\r
+protected:\r
+\r
+    T *_ptr;\r
+    Ncv32u _width;\r
+    Ncv32u _height;\r
+    Ncv32u _pitch;\r
+    NCVMemoryType _memtype;\r
+};\r
+\r
+\r
+/**\r
+* NCVMatrixAlloc\r
+*/\r
+template <class T>\r
+class NCVMatrixAlloc : public NCVMatrix<T>\r
+{\r
+    NCVMatrixAlloc();\r
+    NCVMatrixAlloc(const NCVMatrixAlloc &);\r
+\r
+public:\r
+\r
+    NCVMatrixAlloc(INCVMemAllocator &allocator, Ncv32u width, Ncv32u height, Ncv32u pitch=0)\r
+        :\r
+        allocator(allocator)\r
+    {\r
+        NCVStatus ncvStat;\r
+\r
+        this->clear();\r
+        this->allocatedMem.clear();\r
+\r
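+        // Pad each row (pitch) up to the allocator's alignment unless the caller supplies\r
+        // an explicit pitch, which must be at least as large and properly aligned.\r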
+        Ncv32u widthBytes = width * sizeof(T);\r
+        Ncv32u pitchBytes = alignUp(widthBytes, allocator.alignment());\r
+\r
+        if (pitch != 0)\r
+        {\r
+            ncvAssertPrintReturn(pitch >= pitchBytes &&\r
+                (pitch & (allocator.alignment() - 1)) == 0,\r
+                "NCVMatrixAlloc ctor:: incorrect pitch passed", );\r
+            pitchBytes = pitch;\r
+        }\r
+\r
+        Ncv32u requiredAllocSize = pitchBytes * height;\r
+\r
+        ncvStat = allocator.alloc(this->allocatedMem, requiredAllocSize);\r
+        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "NCVMatrixAlloc ctor:: alloc failed", );\r
+\r
+        this->_ptr = (T *)this->allocatedMem.begin.ptr;\r
+        this->_width = width;\r
+        this->_height = height;\r
+        this->_pitch = pitchBytes;\r
+        this->_memtype = this->allocatedMem.begin.memtype;\r
+    }\r
+\r
+    ~NCVMatrixAlloc()\r
+    {\r
+        NCVStatus ncvStat;\r
+\r
+        ncvStat = allocator.dealloc(this->allocatedMem);\r
+        ncvAssertPrintCheck(ncvStat == NCV_SUCCESS, "NCVMatrixAlloc dtor:: dealloc failed");\r
+\r
+        this->clear();\r
+    }\r
+\r
+\r
+    NcvBool isMemAllocated() const\r
+    {\r
+        return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());\r
+    }\r
+\r
+\r
+    Ncv32u getAllocatorsAlignment() const\r
+    {\r
+        return allocator.alignment();\r
+    }\r
+\r
+\r
+    NCVMemSegment getSegment() const\r
+    {\r
+        return allocatedMem;\r
+    }\r
+\r
+private:\r
+\r
+    INCVMemAllocator &allocator;\r
+    NCVMemSegment allocatedMem;\r
+};\r
+\r
+\r
+/**\r
+* NCVMatrixReuse\r
+*/\r
+template <class T>\r
+class NCVMatrixReuse : public NCVMatrix<T>\r
+{\r
+    NCVMatrixReuse();\r
+    NCVMatrixReuse(const NCVMatrixReuse &);\r
+\r
+public:\r
+\r
+    NCVMatrixReuse(const NCVMemSegment &memSegment, Ncv32u alignment, Ncv32u width, Ncv32u height, Ncv32u pitch=0, NcvBool bSkipPitchCheck=false)\r
+    {\r
+        this->bReused = false;\r
+        this->clear();\r
+\r
+        Ncv32u widthBytes = width * sizeof(T);\r
+        Ncv32u pitchBytes = alignUp(widthBytes, alignment);\r
+\r
+        if (pitch != 0)\r
+        {\r
+            if (!bSkipPitchCheck)\r
+            {\r
+                ncvAssertPrintReturn(pitch >= pitchBytes &&\r
+                    (pitch & (alignment - 1)) == 0,\r
+                    "NCVMatrixReuse ctor:: incorrect pitch passed", );\r
+            }\r
+            else\r
+            {\r
+                ncvAssertPrintReturn(pitch >= widthBytes, "NCVMatrixReuse ctor:: incorrect pitch passed", );\r
+            }\r
+            pitchBytes = pitch;\r
+        }\r
+\r
+        ncvAssertPrintReturn(pitchBytes * height <= memSegment.size, \\r
+            "NCVMatrixReuse ctor:: memory binding failed due to size mismatch", );\r
+\r
+        this->_width = width;\r
+        this->_height = height;\r
+        this->_pitch = pitchBytes;\r
+        this->_ptr = (T *)memSegment.begin.ptr;\r
+        this->_memtype = memSegment.begin.memtype;\r
+\r
+        this->bReused = true;\r
+    }\r
+\r
+\r
+    NcvBool isMemReused() const\r
+    {\r
+        return this->bReused;\r
+    }\r
+\r
+private:\r
+\r
+    NcvBool bReused;\r
+};\r
+\r
+#endif // _ncv_hpp_\r
diff --git a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
new file mode 100644 (file)
index 0000000..a501d65
--- /dev/null
@@ -0,0 +1,2603 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. \r
+// \r
+//  By downloading, copying, installing or using the software you agree to this license.\r
+//  If you do not agree to this license, do not download, install,\r
+//  copy or use the software.\r
+//\r
+//\r
+//                           License Agreement\r
+//                For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+//   * Redistribution's of source code must retain the above copyright notice,\r
+//     this list of conditions and the following disclaimer.\r
+//\r
+//   * Redistribution's in binary form must reproduce the above copyright notice,\r
+//     this list of conditions and the following disclaimer in the documentation\r
+//     and/or other materials provided with the distribution.\r
+//\r
+//   * The name of the copyright holders may not be used to endorse or promote products\r
+//     derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// NVIDIA CUDA implementation of Viola-Jones Object Detection Framework\r
+//\r
+// The algorithm and code are explained in detail in the upcoming\r
+// GPU Computing Gems chapter:\r
+//\r
+//   Anton Obukhov, "Haar Classifiers for Object Detection with CUDA"\r
+//   PDF URL placeholder\r
+//   email: aobukhov@nvidia.com, devsupport@nvidia.com\r
+//\r
+// Credits for help with the code to:\r
+// Alexey Mendelenko, Cyril Crassin, and Mikhail Smirnov.\r
+//\r
+////////////////////////////////////////////////////////////////////////////////\r
+\r
+#include <algorithm>\r
+\r
+#include "npp.h"\r
+#include "NCV.hpp"\r
+#include "NCVRuntimeTemplates.hpp"\r
+#include "NCVHaarObjectDetection.hpp"\r
+\r
+void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights);\r
+\r
+\r
+//==============================================================================\r
+//\r
+// BlockScan file\r
+//\r
+//==============================================================================\r
+\r
+\r
+//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()\r
+//assuming size <= WARP_SIZE and size is a power of 2\r
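+//(e.g. with idata == 1 in every lane, lane i gets back i + 1 - an inclusive prefix sum);\r
+//s_Data must provide 2 * K_WARP_SIZE elements per participating warp\r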
+template <class T>\r
+inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)\r
+{\r
+    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));\r
+    s_Data[pos] = 0;\r
+    pos += K_WARP_SIZE;\r
+    s_Data[pos] = idata;\r
+\r
+    for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)\r
+    {\r
+        s_Data[pos] += s_Data[pos - offset];\r
+    }\r
+\r
+    return s_Data[pos];\r
+}\r
+\r
+\r
+template <class T>\r
+inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)\r
+{\r
+    return warpScanInclusive(idata, s_Data) - idata;\r
+}\r
+\r
+\r
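+//Block-wide inclusive scan built on top of warpScanInclusive: each warp scans its\r
+//own slice, the per-warp totals are scanned exclusively by the first warp, and the\r
+//resulting warp offsets are added back. Falls back to a single warp scan when\r
+//tiNumScanThreads <= K_WARP_SIZE. s_Data must provide 2 * tiNumScanThreads elements.\r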
+template <class T, Ncv32u tiNumScanThreads>\r
+inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)\r
+{\r
+    if (tiNumScanThreads > K_WARP_SIZE)\r
+    {\r
+        //Bottom-level inclusive warp scan\r
+        T warpResult = warpScanInclusive(idata, s_Data);\r
+\r
+        //Save top elements of each warp for exclusive warp scan\r
+        //sync to wait for warp scans to complete (because s_Data is being overwritten)\r
+        __syncthreads();\r
+        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )\r
+        {\r
+            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;\r
+        }\r
+\r
+        //wait for warp scans to complete\r
+        __syncthreads();\r
+\r
+        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )\r
+        {\r
+            //grab top warp elements\r
+            T val = s_Data[threadIdx.x];\r
+            //calculate exclusive scan and write back to shared memory\r
+            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);\r
+        }\r
+\r
+        //return updated warp scans with exclusive scan results\r
+        __syncthreads();\r
+        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];\r
+    }\r
+    else\r
+    {\r
+        return warpScanInclusive(idata, s_Data);\r
+    }\r
+}\r
+\r
+\r
+//==============================================================================\r
+//\r
+// HaarClassifierCascade file\r
+//\r
+//==============================================================================\r
+\r
+\r
+const Ncv32u MAX_GRID_DIM = 65535;\r
+\r
+\r
+const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64;\r
+\r
+\r
+#define NUM_THREADS_CLASSIFIERPARALLEL_LOG2     6\r
+#define NUM_THREADS_CLASSIFIERPARALLEL          (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2)\r
+\r
+\r
+/** \internal\r
+* Haar features solid array.\r
+*/\r
+texture<uint2, 1, cudaReadModeElementType> texHaarFeatures;\r
+\r
+\r
+/** \internal\r
+* Haar classifiers flattened trees container.\r
+* Two parts: the first contains the root nodes, the second contains the nodes referred to by the root nodes.\r
+* Drawback: breaks tree locality (might cause more cache misses)\r
+* Advantage: no need to introduce an additional 32-bit field to index root node offsets\r
+*/\r
+texture<uint4, 1, cudaReadModeElementType> texHaarClassifierNodes;\r
+\r
+\r
+texture<Ncv32u, 1, cudaReadModeElementType> texIImage;\r
+\r
+\r
+__device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)\r
+{\r
+    return d_Stages[iStage];\r
+}\r
+\r
+\r
+template <NcvBool tbCacheTextureCascade>\r
+__device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes)\r
+{\r
+    HaarClassifierNode128 tmpNode;\r
+    if (tbCacheTextureCascade)\r
+    {\r
+        tmpNode._ui4 = tex1Dfetch(texHaarClassifierNodes, iNode);\r
+    }\r
+    else\r
+    {\r
+        tmpNode = d_ClassifierNodes[iNode];\r
+    }\r
+    return tmpNode;\r
+}\r
+\r
+\r
+template <NcvBool tbCacheTextureCascade>\r
+__device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features, \r
+                           Ncv32f *weight,\r
+                           Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)\r
+{\r
+    HaarFeature64 feature;\r
+    if (tbCacheTextureCascade)\r
+    {\r
+        feature._ui2 = tex1Dfetch(texHaarFeatures, iFeature);\r
+    }\r
+    else\r
+    {\r
+        feature = d_Features[iFeature];\r
+    }\r
+    feature.getRect(rectX, rectY, rectWidth, rectHeight);\r
+    *weight = feature.getWeight();\r
+}\r
+\r
+\r
+template <NcvBool tbCacheTextureIImg>\r
+__device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)\r
+{\r
+    if (tbCacheTextureIImg)\r
+    {\r
+        return tex1Dfetch(texIImage, x);\r
+    }\r
+    else\r
+    {\r
+        return d_IImg[x];\r
+    }\r
+}\r
+\r
+\r
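+//Block-wide sum of the per-thread partial stage sums.\r
+//With CPU_FP_COMPLIANCE defined the sum is accumulated serially by thread 0 so that\r
+//the floating point addition order matches the host implementation; otherwise a\r
+//shared-memory tree reduction is used.\r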
+__device__ Ncv32f reduceSpecialization(Ncv32f partialSum)\r
+{\r
+    __shared__ volatile Ncv32f reductor[NUM_THREADS_CLASSIFIERPARALLEL];\r
+    reductor[threadIdx.x] = partialSum;\r
+    __syncthreads();\r
+\r
+#if defined CPU_FP_COMPLIANCE\r
+    if (!threadIdx.x)\r
+    {\r
+        Ncv32f sum = 0.0f;\r
+        for (int i=0; i<NUM_THREADS_CLASSIFIERPARALLEL; i++)\r
+        {\r
+            sum += reductor[i];\r
+        }\r
+        reductor[0] = sum;\r
+    }\r
+#else\r
+\r
+#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 8\r
+    if (threadIdx.x < 128)\r
+    {\r
+        reductor[threadIdx.x] += reductor[threadIdx.x + 128]; \r
+    }\r
+    __syncthreads();\r
+#endif\r
+#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 7\r
+    if (threadIdx.x < 64)\r
+    {\r
+        reductor[threadIdx.x] += reductor[threadIdx.x + 64]; \r
+    }\r
+    __syncthreads();\r
+#endif\r
+\r
+    if (threadIdx.x < 32)\r
+    {\r
+#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 6\r
+        reductor[threadIdx.x] += reductor[threadIdx.x + 32];\r
+#endif\r
+#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 5\r
+        reductor[threadIdx.x] += reductor[threadIdx.x + 16];\r
+#endif\r
+        reductor[threadIdx.x] += reductor[threadIdx.x + 8];\r
+        reductor[threadIdx.x] += reductor[threadIdx.x + 4];\r
+        reductor[threadIdx.x] += reductor[threadIdx.x + 2];\r
+        reductor[threadIdx.x] += reductor[threadIdx.x + 1];\r
+    }\r
+#endif\r
+\r
+    __syncthreads();\r
+\r
+    return reductor[0];\r
+}\r
+\r
+\r
+__device__ Ncv32u d_outMaskPosition;\r
+\r
+\r
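+//Block-level stream compaction: an inclusive scan of the per-thread pass flags gives\r
+//each surviving thread its slot within the block, a single atomicAdd on\r
+//d_outMaskPosition reserves the block's output range, and the surviving elements are\r
+//staged in shared memory before the coalesced write to vectorOut.\r
+//Compiles to a no-op below SM 1.1 (no atomics available).\r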
+__inline __device__ void compactBlockWriteOutAnchorParallel(NcvBool threadPassFlag,\r
+                                                            Ncv32u threadElem,\r
+                                                            Ncv32u *vectorOut)\r
+{\r
+#if __CUDA_ARCH__ >= 110\r
+    Ncv32u passMaskElem = threadPassFlag ? 1 : 0;\r
+    __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2];\r
+    Ncv32u incScan = blockScanInclusive<Ncv32u, NUM_THREADS_ANCHORSPARALLEL>(passMaskElem, shmem);\r
+    __syncthreads();\r
+    Ncv32u excScan = incScan - passMaskElem;\r
+\r
+    __shared__ Ncv32u numPassed;\r
+    __shared__ Ncv32u outMaskOffset;\r
+    if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)\r
+    {\r
+        numPassed = incScan;\r
+        outMaskOffset = atomicAdd(&d_outMaskPosition, incScan);\r
+    }\r
+    __syncthreads();\r
+\r
+    if (threadPassFlag)\r
+    {\r
+        shmem[excScan] = threadElem;\r
+    }\r
+    __syncthreads();\r
+\r
+    if (threadIdx.x < numPassed)\r
+    {\r
+        vectorOut[outMaskOffset + threadIdx.x] = shmem[threadIdx.x];\r
+    }\r
+#endif\r
+}\r
+\r
+\r
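+//Anchor-parallel cascade kernel: each thread evaluates stages\r
+//[startStageInc, endStageExc) for a single search window (anchor). Anchors are taken\r
+//either from a compacted index vector (tbReadPixelIndexFromVector) or from the 2D\r
+//grid position; surviving anchors are written back to the mask, or compacted on the\r
+//fly via compactBlockWriteOutAnchorParallel when tbDoAtomicCompaction is set.\r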
+template <NcvBool tbInitMaskPositively,\r
+          NcvBool tbCacheTextureIImg,\r
+          NcvBool tbCacheTextureCascade,\r
+          NcvBool tbReadPixelIndexFromVector,\r
+          NcvBool tbDoAtomicCompaction>\r
+__global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride,\r
+                                                  Ncv32f *d_weights, Ncv32u weightsStride,\r
+                                                  HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,\r
+                                                  Ncv32u *d_inMask, Ncv32u *d_outMask,\r
+                                                  Ncv32u mask1Dlen, Ncv32u mask2Dstride,\r
+                                                  NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)\r
+{\r
+    Ncv32u y_offs;\r
+    Ncv32u x_offs;\r
+    Ncv32u maskOffset;\r
+    Ncv32u outMaskVal;\r
+\r
+    NcvBool bInactiveThread = false;\r
+\r
+    if (tbReadPixelIndexFromVector)\r
+    {\r
+        maskOffset = (MAX_GRID_DIM * blockIdx.y + blockIdx.x) * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;\r
+\r
+        if (maskOffset >= mask1Dlen)\r
+        {\r
+            if (tbDoAtomicCompaction) bInactiveThread = true; else return;\r
+        }\r
+\r
+        if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))\r
+        {\r
+            outMaskVal = d_inMask[maskOffset];\r
+            y_offs = outMaskVal >> 16;\r
+            x_offs = outMaskVal & 0xFFFF;\r
+        }\r
+    }\r
+    else\r
+    {\r
+        y_offs = blockIdx.y;\r
+        x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;\r
+\r
+        if (x_offs >= mask2Dstride)\r
+        {\r
+            if (tbDoAtomicCompaction) bInactiveThread = true; else return;\r
+        }\r
+\r
+        if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))\r
+        {\r
+            maskOffset = y_offs * mask2Dstride + x_offs;\r
+\r
+            if ((x_offs >= anchorsRoi.width) ||\r
+                (!tbInitMaskPositively &&\r
+                 d_inMask != d_outMask &&\r
+                 d_inMask[maskOffset] == OBJDET_MASK_ELEMENT_INVALID_32U))\r
+            {\r
+                if (tbDoAtomicCompaction) \r
+                {\r
+                    bInactiveThread = true; \r
+                }\r
+                else\r
+                {\r
+                    d_outMask[maskOffset] = OBJDET_MASK_ELEMENT_INVALID_32U;\r
+                    return;\r
+                }\r
+            }\r
+\r
+            outMaskVal = (y_offs << 16) | x_offs;\r
+        }\r
+    }\r
+\r
+    NcvBool bPass = true;\r
+\r
+    if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))\r
+    {\r
+        Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs];\r
+\r
+        for (Ncv32u iStage = startStageInc; iStage<endStageExc; iStage++)\r
+        {\r
+            Ncv32f curStageSum = 0.0f;\r
+\r
+            HaarStage64 curStage = getStage(iStage, d_Stages);\r
+            Ncv32u numRootNodesInStage = curStage.getNumClassifierRootNodes();\r
+            Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset();\r
+            Ncv32f stageThreshold = curStage.getStageThreshold();\r
+\r
+            while (numRootNodesInStage--)\r
+            {\r
+                NcvBool bMoreNodesToTraverse = true;\r
+                Ncv32u iNode = curRootNodeOffset;\r
+\r
+                while (bMoreNodesToTraverse)\r
+                {\r
+                    HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);\r
+                    HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();\r
+                    Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();\r
+                    Ncv32u iFeature = featuresDesc.getFeaturesOffset();\r
+\r
+                    Ncv32f curNodeVal = 0.0f;\r
+\r
+                    for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)\r
+                    {\r
+                        Ncv32f rectWeight;\r
+                        Ncv32u rectX, rectY, rectWidth, rectHeight;\r
+                        getFeature<tbCacheTextureCascade>\r
+                            (iFeature + iRect, d_Features,\r
+                            &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);\r
+\r
+                        Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);\r
+                        Ncv32u iioffsTR = iioffsTL + rectWidth;\r
+                        Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;\r
+                        Ncv32u iioffsBR = iioffsBL + rectWidth;\r
+\r
+                        Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -\r
+                                         getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +\r
+                                         getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -\r
+                                         getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);\r
+\r
+#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY\r
+                        curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);\r
+#else\r
+                        curNodeVal += (Ncv32f)rectSum * rectWeight;\r
+#endif\r
+                    }\r
+\r
+                    HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();\r
+                    HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();\r
+                    Ncv32f nodeThreshold = curNode.getThreshold();\r
+                    HaarClassifierNodeDescriptor32 nextNodeDescriptor;\r
+                    nextNodeDescriptor = (curNodeVal < scaleArea * pixelStdDev * nodeThreshold) ? nodeLeft : nodeRight;\r
+\r
+                    if (nextNodeDescriptor.isLeaf())\r
+                    {\r
+                        Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();\r
+                        curStageSum += tmpLeafValue;\r
+                        bMoreNodesToTraverse = false;\r
+                    }\r
+                    else\r
+                    {\r
+                        iNode = nextNodeDescriptor.getNextNodeOffset();\r
+                    }\r
+                }\r
+\r
+                __syncthreads();\r
+                curRootNodeOffset++;\r
+            }\r
+\r
+            if (curStageSum < stageThreshold)\r
+            {\r
+                bPass = false;\r
+                outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;\r
+                break;\r
+            }\r
+        }\r
+    }\r
+\r
+    __syncthreads();\r
+\r
+    if (!tbDoAtomicCompaction)\r
+    {\r
+        if (!tbReadPixelIndexFromVector ||\r
+            (tbReadPixelIndexFromVector && (!bPass || d_inMask != d_outMask)))\r
+        {\r
+            d_outMask[maskOffset] = outMaskVal;\r
+        }\r
+    }\r
+    else\r
+    {\r
+        compactBlockWriteOutAnchorParallel(bPass && !bInactiveThread,\r
+                                           outMaskVal,\r
+                                           d_outMask);\r
+    }\r
+}\r
+\r
+\r
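+//Classifier-parallel cascade kernel: one thread block per surviving anchor. Within a\r
+//stage the block's threads evaluate different classifier trees in chunks of\r
+//NUM_THREADS_CLASSIFIERPARALLEL, and the partial sums are combined with\r
+//reduceSpecialization before the stage threshold test.\r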
+template <NcvBool tbCacheTextureIImg,\r
+          NcvBool tbCacheTextureCascade,\r
+          NcvBool tbDoAtomicCompaction>\r
+__global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride,\r
+                                                      Ncv32f *d_weights, Ncv32u weightsStride,\r
+                                                      HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,\r
+                                                      Ncv32u *d_inMask, Ncv32u *d_outMask,\r
+                                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,\r
+                                                      NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)\r
+{\r
+    Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x;\r
+\r
+    if (maskOffset >= mask1Dlen)\r
+    {\r
+        return;\r
+    }\r
+\r
+    Ncv32u outMaskVal = d_inMask[maskOffset];\r
+    Ncv32u y_offs = outMaskVal >> 16;\r
+    Ncv32u x_offs = outMaskVal & 0xFFFF;\r
+\r
+    Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs];\r
+    NcvBool bPass = true;\r
+\r
+    for (Ncv32u iStage = startStageInc; iStage<endStageExc; iStage++)\r
+    {\r
+        //this variable is subject to reduction\r
+        Ncv32f curStageSum = 0.0f;\r
+\r
+        HaarStage64 curStage = getStage(iStage, d_Stages);\r
+        Ncv32s numRootNodesInStage = curStage.getNumClassifierRootNodes();\r
+        Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset() + threadIdx.x;\r
+        Ncv32f stageThreshold = curStage.getStageThreshold();\r
+\r
+        Ncv32u numRootChunks = (numRootNodesInStage + NUM_THREADS_CLASSIFIERPARALLEL - 1) >> NUM_THREADS_CLASSIFIERPARALLEL_LOG2;\r
+\r
+        for (Ncv32u chunkId=0; chunkId<numRootChunks; chunkId++)\r
+        {\r
+            NcvBool bMoreNodesToTraverse = true;\r
+\r
+            if (chunkId * NUM_THREADS_CLASSIFIERPARALLEL + threadIdx.x < numRootNodesInStage)\r
+            {\r
+                Ncv32u iNode = curRootNodeOffset;\r
+\r
+                while (bMoreNodesToTraverse)\r
+                {\r
+                    HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);\r
+                    HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();\r
+                    Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();\r
+                    Ncv32u iFeature = featuresDesc.getFeaturesOffset();\r
+\r
+                    Ncv32f curNodeVal = 0.0f;\r
+                    //TODO: fetch into shmem if size suffices. Shmem can be shared with reduce\r
+                    for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)\r
+                    {\r
+                        Ncv32f rectWeight;\r
+                        Ncv32u rectX, rectY, rectWidth, rectHeight;\r
+                        getFeature<tbCacheTextureCascade>\r
+                            (iFeature + iRect, d_Features,\r
+                            &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);\r
+\r
+                        Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);\r
+                        Ncv32u iioffsTR = iioffsTL + rectWidth;\r
+                        Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;\r
+                        Ncv32u iioffsBR = iioffsBL + rectWidth;\r
+\r
+                        Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -\r
+                                         getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +\r
+                                         getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -\r
+                                         getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);\r
+\r
+#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY\r
+                        curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);\r
+#else\r
+                        curNodeVal += (Ncv32f)rectSum * rectWeight;\r
+#endif\r
+                    }\r
+\r
+                    HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();\r
+                    HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();\r
+                    Ncv32f nodeThreshold = curNode.getThreshold();\r
+                    HaarClassifierNodeDescriptor32 nextNodeDescriptor;\r
+                    nextNodeDescriptor = (curNodeVal < scaleArea * pixelStdDev * nodeThreshold) ? nodeLeft : nodeRight;\r
+\r
+                    if (nextNodeDescriptor.isLeaf())\r
+                    {\r
+                        Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();\r
+                        curStageSum += tmpLeafValue;\r
+                        bMoreNodesToTraverse = false;\r
+                    }\r
+                    else\r
+                    {\r
+                        iNode = nextNodeDescriptor.getNextNodeOffset();\r
+                    }\r
+                }\r
+            }\r
+            __syncthreads();\r
+\r
+            curRootNodeOffset += NUM_THREADS_CLASSIFIERPARALLEL;\r
+        }\r
+\r
+        Ncv32f finalStageSum = reduceSpecialization(curStageSum);\r
+\r
+        if (finalStageSum < stageThreshold)\r
+        {\r
+            bPass = false;\r
+            outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;\r
+            break;\r
+        }\r
+    }\r
+\r
+    if (!tbDoAtomicCompaction)\r
+    {\r
+        if (!bPass || d_inMask != d_outMask)\r
+        {\r
+            if (!threadIdx.x)\r
+            {\r
+                d_outMask[maskOffset] = outMaskVal;\r
+            }\r
+        }\r
+    }\r
+    else\r
+    {\r
+#if __CUDA_ARCH__ >= 110\r
+        if (bPass && !threadIdx.x)\r
+        {\r
+            Ncv32u outMaskOffset = atomicAdd(&d_outMaskPosition, 1);\r
+            d_outMask[outMaskOffset] = outMaskVal;\r
+        }\r
+#endif\r
+    }\r
+}\r
+\r
+\r
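+//Builds the initial vector of candidate anchors: each position on the step-spaced\r
+//grid inside anchorsRoi is encoded as (y << 16) | x, optionally filtered by the\r
+//input mask (tbMaskByInmask), and either stored densely or compacted atomically.\r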
+template <NcvBool tbMaskByInmask,\r
+          NcvBool tbDoAtomicCompaction>\r
+__global__ void initializeMaskVector(Ncv32u *d_inMask, Ncv32u *d_outMask,\r
+                                     Ncv32u mask1Dlen, Ncv32u mask2Dstride,\r
+                                     NcvSize32u anchorsRoi, Ncv32u step)\r
+{\r
+    Ncv32u y_offs = blockIdx.y;\r
+    Ncv32u x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;\r
+    Ncv32u outMaskOffset = y_offs * gridDim.x * blockDim.x + x_offs;\r
+\r
+    Ncv32u y_offs_upsc = step * y_offs;\r
+    Ncv32u x_offs_upsc = step * x_offs;\r
+    Ncv32u inMaskOffset = y_offs_upsc * mask2Dstride + x_offs_upsc;\r
+\r
+    Ncv32u outElem = OBJDET_MASK_ELEMENT_INVALID_32U;\r
+\r
+    if (x_offs_upsc < anchorsRoi.width &&\r
+        (!tbMaskByInmask || d_inMask[inMaskOffset] != OBJDET_MASK_ELEMENT_INVALID_32U))\r
+    {\r
+        outElem = (y_offs_upsc << 16) | x_offs_upsc;\r
+    }\r
+\r
+    if (!tbDoAtomicCompaction)\r
+    {\r
+        d_outMask[outMaskOffset] = outElem;\r
+    }\r
+    else\r
+    {\r
+        compactBlockWriteOutAnchorParallel(outElem != OBJDET_MASK_ELEMENT_INVALID_32U,\r
+                                           outElem,\r
+                                           d_outMask);\r
+    }\r
+}\r
+\r
+\r
+struct applyHaarClassifierAnchorParallelFunctor\r
+{\r
+    dim3 gridConf, blockConf;\r
+    cudaStream_t cuStream;\r
+\r
+    //Kernel arguments are stored as members\r
+    Ncv32u *d_IImg;\r
+    Ncv32u IImgStride;\r
+    Ncv32f *d_weights;\r
+    Ncv32u weightsStride;\r
+    HaarFeature64 *d_Features;\r
+    HaarClassifierNode128 *d_ClassifierNodes;\r
+    HaarStage64 *d_Stages;\r
+    Ncv32u *d_inMask;\r
+    Ncv32u *d_outMask;\r
+    Ncv32u mask1Dlen;\r
+    Ncv32u mask2Dstride;\r
+    NcvSize32u anchorsRoi;\r
+    Ncv32u startStageInc;\r
+    Ncv32u endStageExc;\r
+    Ncv32f scaleArea;\r
+\r
+    //Arguments are passed through the constructor\r
+    applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,\r
+                                             Ncv32u *_d_IImg, Ncv32u _IImgStride,\r
+                                             Ncv32f *_d_weights, Ncv32u _weightsStride,\r
+                                             HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,\r
+                                             Ncv32u *_d_inMask, Ncv32u *_d_outMask,\r
+                                             Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,\r
+                                             NcvSize32u _anchorsRoi, Ncv32u _startStageInc,\r
+                                             Ncv32u _endStageExc, Ncv32f _scaleArea) :\r
+    gridConf(_gridConf),\r
+    blockConf(_blockConf),\r
+    cuStream(_cuStream),\r
+    d_IImg(_d_IImg),\r
+    IImgStride(_IImgStride),\r
+    d_weights(_d_weights),\r
+    weightsStride(_weightsStride),\r
+    d_Features(_d_Features),\r
+    d_ClassifierNodes(_d_ClassifierNodes),\r
+    d_Stages(_d_Stages),\r
+    d_inMask(_d_inMask),\r
+    d_outMask(_d_outMask),\r
+    mask1Dlen(_mask1Dlen),\r
+    mask2Dstride(_mask2Dstride),\r
+    anchorsRoi(_anchorsRoi),\r
+    startStageInc(_startStageInc),\r
+    endStageExc(_endStageExc),\r
+    scaleArea(_scaleArea)\r
+    {}\r
+\r
+    template<class TList>\r
+    void call(TList tl)\r
+    {\r
+        applyHaarClassifierAnchorParallel <\r
+            Loki::TL::TypeAt<TList, 0>::Result::value,\r
+            Loki::TL::TypeAt<TList, 1>::Result::value,\r
+            Loki::TL::TypeAt<TList, 2>::Result::value,\r
+            Loki::TL::TypeAt<TList, 3>::Result::value,\r
+            Loki::TL::TypeAt<TList, 4>::Result::value >\r
+            <<<gridConf, blockConf, 0, cuStream>>>\r
+            (d_IImg, IImgStride,\r
+            d_weights, weightsStride,\r
+            d_Features, d_ClassifierNodes, d_Stages,\r
+            d_inMask, d_outMask,\r
+            mask1Dlen, mask2Dstride,\r
+            anchorsRoi, startStageInc,\r
+            endStageExc, scaleArea);\r
+    }\r
+};\r
+\r
+\r
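+//Translates the five run-time booleans into the corresponding compile-time\r
+//instantiation of applyHaarClassifierAnchorParallel via the Loki-based\r
+//NCVRuntimeTemplateBool::KernelCaller dispatcher.\r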
+void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,\r
+                                                  NcvBool tbCacheTextureIImg,\r
+                                                  NcvBool tbCacheTextureCascade,\r
+                                                  NcvBool tbReadPixelIndexFromVector,\r
+                                                  NcvBool tbDoAtomicCompaction,\r
+\r
+                                                  dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,\r
+\r
+                                                  Ncv32u *d_IImg, Ncv32u IImgStride,\r
+                                                  Ncv32f *d_weights, Ncv32u weightsStride,\r
+                                                  HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,\r
+                                                  Ncv32u *d_inMask, Ncv32u *d_outMask,\r
+                                                  Ncv32u mask1Dlen, Ncv32u mask2Dstride,\r
+                                                  NcvSize32u anchorsRoi, Ncv32u startStageInc,\r
+                                                  Ncv32u endStageExc, Ncv32f scaleArea)\r
+{\r
+    //Second parameter is the number of "dynamic" template parameters\r
+    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>\r
+        ::call( applyHaarClassifierAnchorParallelFunctor(gridConf, blockConf, cuStream,\r
+                                                         d_IImg, IImgStride,\r
+                                                         d_weights, weightsStride,\r
+                                                         d_Features, d_ClassifierNodes, d_Stages,\r
+                                                         d_inMask, d_outMask,\r
+                                                         mask1Dlen, mask2Dstride,\r
+                                                         anchorsRoi, startStageInc,\r
+                                                         endStageExc, scaleArea),\r
+        0xC001C0DE, //this is a dummy int for va_args C compatibility\r
+        tbInitMaskPositively,\r
+        tbCacheTextureIImg,\r
+        tbCacheTextureCascade,\r
+        tbReadPixelIndexFromVector,\r
+        tbDoAtomicCompaction);\r
+}\r
+\r
+\r
+struct applyHaarClassifierClassifierParallelFunctor\r
+{\r
+    dim3 gridConf, blockConf;\r
+    cudaStream_t cuStream;\r
+\r
+    //Kernel arguments are stored as members\r
+    Ncv32u *d_IImg;\r
+    Ncv32u IImgStride;\r
+    Ncv32f *d_weights;\r
+    Ncv32u weightsStride;\r
+    HaarFeature64 *d_Features;\r
+    HaarClassifierNode128 *d_ClassifierNodes;\r
+    HaarStage64 *d_Stages;\r
+    Ncv32u *d_inMask;\r
+    Ncv32u *d_outMask;\r
+    Ncv32u mask1Dlen;\r
+    Ncv32u mask2Dstride;\r
+    NcvSize32u anchorsRoi;\r
+    Ncv32u startStageInc;\r
+    Ncv32u endStageExc;\r
+    Ncv32f scaleArea;\r
+\r
+    //Arguments are passed through the constructor\r
+    applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,\r
+                                                 Ncv32u *_d_IImg, Ncv32u _IImgStride,\r
+                                                 Ncv32f *_d_weights, Ncv32u _weightsStride,\r
+                                                 HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,\r
+                                                 Ncv32u *_d_inMask, Ncv32u *_d_outMask,\r
+                                                 Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,\r
+                                                 NcvSize32u _anchorsRoi, Ncv32u _startStageInc,\r
+                                                 Ncv32u _endStageExc, Ncv32f _scaleArea) :\r
+    gridConf(_gridConf),\r
+    blockConf(_blockConf),\r
+    cuStream(_cuStream),\r
+    d_IImg(_d_IImg),\r
+    IImgStride(_IImgStride),\r
+    d_weights(_d_weights),\r
+    weightsStride(_weightsStride),\r
+    d_Features(_d_Features),\r
+    d_ClassifierNodes(_d_ClassifierNodes),\r
+    d_Stages(_d_Stages),\r
+    d_inMask(_d_inMask),\r
+    d_outMask(_d_outMask),\r
+    mask1Dlen(_mask1Dlen),\r
+    mask2Dstride(_mask2Dstride),\r
+    anchorsRoi(_anchorsRoi),\r
+    startStageInc(_startStageInc),\r
+    endStageExc(_endStageExc),\r
+    scaleArea(_scaleArea)\r
+    {}\r
+\r
+    template<class TList>\r
+    void call(TList tl)\r
+    {\r
+        applyHaarClassifierClassifierParallel <\r
+            Loki::TL::TypeAt<TList, 0>::Result::value,\r
+            Loki::TL::TypeAt<TList, 1>::Result::value,\r
+            Loki::TL::TypeAt<TList, 2>::Result::value >\r
+            <<<gridConf, blockConf, 0, cuStream>>>\r
+            (d_IImg, IImgStride,\r
+            d_weights, weightsStride,\r
+            d_Features, d_ClassifierNodes, d_Stages,\r
+            d_inMask, d_outMask,\r
+            mask1Dlen, mask2Dstride,\r
+            anchorsRoi, startStageInc,\r
+            endStageExc, scaleArea);\r
+    }\r
+};\r
+\r
+\r
+void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg,\r
+                                                      NcvBool tbCacheTextureCascade,\r
+                                                      NcvBool tbDoAtomicCompaction,\r
+\r
+                                                      dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,\r
+\r
+                                                      Ncv32u *d_IImg, Ncv32u IImgStride,\r
+                                                      Ncv32f *d_weights, Ncv32u weightsStride,\r
+                                                      HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,\r
+                                                      Ncv32u *d_inMask, Ncv32u *d_outMask,\r
+                                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,\r
+                                                      NcvSize32u anchorsRoi, Ncv32u startStageInc,\r
+                                                      Ncv32u endStageExc, Ncv32f scaleArea)\r
+{\r
+    //Second parameter is the number of "dynamic" template parameters\r
+    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>\r
+        ::call( applyHaarClassifierClassifierParallelFunctor(gridConf, blockConf, cuStream,\r
+                                                             d_IImg, IImgStride,\r
+                                                             d_weights, weightsStride,\r
+                                                             d_Features, d_ClassifierNodes, d_Stages,\r
+                                                             d_inMask, d_outMask,\r
+                                                             mask1Dlen, mask2Dstride,\r
+                                                             anchorsRoi, startStageInc,\r
+                                                             endStageExc, scaleArea),\r
+        0xC001C0DE, //this is a dummy int for va_args C compatibility\r
+        tbCacheTextureIImg,\r
+        tbCacheTextureCascade,\r
+        tbDoAtomicCompaction);\r
+}\r
+\r
+\r
+struct initializeMaskVectorFunctor\r
+{\r
+    dim3 gridConf, blockConf;\r
+    cudaStream_t cuStream;\r
+\r
+    //Kernel arguments are stored as members\r
+    Ncv32u *d_inMask;\r
+    Ncv32u *d_outMask;\r
+    Ncv32u mask1Dlen;\r
+    Ncv32u mask2Dstride;\r
+    NcvSize32u anchorsRoi;\r
+    Ncv32u step;\r
+\r
+    //Arguments are passed through the constructor\r
+    initializeMaskVectorFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,\r
+                                Ncv32u *_d_inMask, Ncv32u *_d_outMask,\r
+                                Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,\r
+                                NcvSize32u _anchorsRoi, Ncv32u _step) :\r
+    gridConf(_gridConf),\r
+    blockConf(_blockConf),\r
+    cuStream(_cuStream),\r
+    d_inMask(_d_inMask),\r
+    d_outMask(_d_outMask),\r
+    mask1Dlen(_mask1Dlen),\r
+    mask2Dstride(_mask2Dstride),\r
+    anchorsRoi(_anchorsRoi),\r
+    step(_step)\r
+    {}\r
+\r
+    template<class TList>\r
+    void call(TList tl)\r
+    {\r
+        initializeMaskVector <\r
+            Loki::TL::TypeAt<TList, 0>::Result::value,\r
+            Loki::TL::TypeAt<TList, 1>::Result::value >\r
+            <<<gridConf, blockConf, 0, cuStream>>>\r
+            (d_inMask, d_outMask,\r
+             mask1Dlen, mask2Dstride,\r
+             anchorsRoi, step);\r
+    }\r
+};\r
+\r
+\r
+void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask,\r
+                                     NcvBool tbDoAtomicCompaction,\r
+\r
+                                     dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,\r
+\r
+                                     Ncv32u *d_inMask, Ncv32u *d_outMask,\r
+                                     Ncv32u mask1Dlen, Ncv32u mask2Dstride,\r
+                                     NcvSize32u anchorsRoi, Ncv32u step)\r
+{\r
+    //Second parameter is the number of "dynamic" template parameters\r
+    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, initializeMaskVectorFunctor>\r
+        ::call( initializeMaskVectorFunctor(gridConf, blockConf, cuStream,\r
+                                            d_inMask, d_outMask,\r
+                                            mask1Dlen, mask2Dstride,\r
+                                            anchorsRoi, step),\r
+        0xC001C0DE, //this is a dummy int for va_args C compatibility\r
+        tbMaskByInmask,\r
+        tbDoAtomicCompaction);\r
+}\r
+\r
+\r
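+//Returns the index of the first stage containing at least N classifier root nodes\r
+//(or haar.NumStages if there is no such stage); used to choose the switch point\r
+//between the anchor-parallel and classifier-parallel kernels.\r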
+Ncv32u getStageNumWithNotLessThanNclassifiers(Ncv32u N, HaarClassifierCascadeDescriptor &haar,\r
+                                              NCVVector<HaarStage64> &h_HaarStages)\r
+{\r
+    Ncv32u i = 0;\r
+    for (; i<haar.NumStages; i++)\r
+    {\r
+        if (h_HaarStages.ptr()[i].getNumClassifierRootNodes() >= N)\r
+        {\r
+            break;\r
+        }\r
+    }\r
+    return i;\r
+}\r
+\r
+\r
+template <class T>\r
+void swap(T &p1, T &p2)\r
+{\r
+    T tmp = p1;\r
+    p1 = p2;\r
+    p2 = tmp;\r
+}\r
+\r
+\r
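+//Applies the cascade to all anchors inside anchorsRoi. The candidate anchor vector\r
+//is built first (initializeMaskVector or an initial anchor-parallel pass), the early\r
+//stages are then evaluated anchor-parallel with periodic compaction of the survivors,\r
+//and the remaining stages (roughly from the first stage with at least\r
+//NUM_THREADS_CLASSIFIERPARALLEL classifiers) are evaluated by the classifier-parallel\r
+//kernel. Surviving anchor coordinates end up in d_pixelMask, their count in numDetections.\r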
+NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,\r
+                                               NCVMatrix<Ncv32f> &d_weights,\r
+                                               NCVMatrixAlloc<Ncv32u> &d_pixelMask,\r
+                                               Ncv32u &numDetections,\r
+                                               HaarClassifierCascadeDescriptor &haar,\r
+                                               NCVVector<HaarStage64> &h_HaarStages,\r
+                                               NCVVector<HaarStage64> &d_HaarStages,\r
+                                               NCVVector<HaarClassifierNode128> &d_HaarNodes,\r
+                                               NCVVector<HaarFeature64> &d_HaarFeatures,\r
+                                               NcvBool bMaskElements,\r
+                                               NcvSize32u anchorsRoi,\r
+                                               Ncv32u pixelStep,\r
+                                               Ncv32f scaleArea,\r
+                                               INCVMemAllocator &gpuAllocator,\r
+                                               INCVMemAllocator &cpuAllocator,\r
+                                               Ncv32u devPropMajor,\r
+                                               Ncv32u devPropMinor,\r
+                                               cudaStream_t cuStream)\r
+{\r
+    ncvAssertReturn(d_integralImage.memType() == d_weights.memType() &&\r
+                    d_integralImage.memType() == d_pixelMask.memType() &&\r
+                    d_integralImage.memType() == gpuAllocator.memType() &&\r
+                     (d_integralImage.memType() == NCVMemoryTypeDevice ||\r
+                      d_integralImage.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);\r
+    ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&\r
+                    d_HaarStages.memType() == d_HaarFeatures.memType() &&\r
+                     (d_HaarStages.memType() == NCVMemoryTypeDevice ||\r
+                      d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);\r
+    ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);\r
+    ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);\r
+    ncvAssertReturn((d_integralImage.ptr() != NULL && d_weights.ptr() != NULL && d_pixelMask.ptr() != NULL &&\r
+                     h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&\r
+                     d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);\r
+    ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&\r
+                    d_pixelMask.width() >= anchorsRoi.width && d_pixelMask.height() >= anchorsRoi.height &&\r
+                    d_weights.width() >= anchorsRoi.width && d_weights.height() >= anchorsRoi.height &&\r
+                    d_integralImage.width() >= anchorsRoi.width + haar.ClassifierSize.width &&\r
+                    d_integralImage.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);\r
+    ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);\r
+    ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&\r
+                    d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&\r
+                    d_HaarFeatures.length() >= haar.NumFeatures &&\r
+                    d_HaarStages.length() == h_HaarStages.length() &&\r
+                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);\r
+    ncvAssertReturn(haar.bNeedsTiltedII == false || gpuAllocator.isCounting(), NCV_NOIMPL_HAAR_TILTED_FEATURES);\r
+    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);\r
+\r
+    NCV_SET_SKIP_COND(gpuAllocator.isCounting());\r
+\r
+#if defined _SELF_TEST_\r
+\r
+    NCVStatus ncvStat;\r
+\r
+    NCVMatrixAlloc<Ncv32u> h_integralImage(cpuAllocator, d_integralImage.width(), d_integralImage.height(), d_integralImage.pitch());\r
+    ncvAssertReturn(h_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+    NCVMatrixAlloc<Ncv32f> h_weights(cpuAllocator, d_weights.width(), d_weights.height(), d_weights.pitch());\r
+    ncvAssertReturn(h_weights.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+    NCVMatrixAlloc<Ncv32u> h_pixelMask(cpuAllocator, d_pixelMask.width(), d_pixelMask.height(), d_pixelMask.pitch());\r
+    ncvAssertReturn(h_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+    NCVVectorAlloc<HaarClassifierNode128> h_HaarNodes(cpuAllocator, d_HaarNodes.length());\r
+    ncvAssertReturn(h_HaarNodes.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+    NCVVectorAlloc<HaarFeature64> h_HaarFeatures(cpuAllocator, d_HaarFeatures.length());\r
+    ncvAssertReturn(h_HaarFeatures.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+    NCVMatrixAlloc<Ncv32u> h_pixelMask_d(cpuAllocator, d_pixelMask.width(), d_pixelMask.height(), d_pixelMask.pitch());\r
+    ncvAssertReturn(h_pixelMask_d.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+    NCV_SKIP_COND_BEGIN\r
+\r
+    ncvStat = d_pixelMask.copySolid(h_pixelMask, 0);\r
+    ncvAssertReturnNcvStat(ncvStat);\r
+    ncvStat = d_integralImage.copySolid(h_integralImage, 0);\r
+    ncvAssertReturnNcvStat(ncvStat);\r
+    ncvStat = d_weights.copySolid(h_weights, 0);\r
+    ncvAssertReturnNcvStat(ncvStat);\r
+    ncvStat = d_HaarNodes.copySolid(h_HaarNodes, 0);\r
+    ncvAssertReturnNcvStat(ncvStat);\r
+    ncvStat = d_HaarFeatures.copySolid(h_HaarFeatures, 0);\r
+    ncvAssertReturnNcvStat(ncvStat);\r
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);\r
+\r
+    for (Ncv32u i=0; i<(Ncv32u)anchorsRoi.height; i++)\r
+    {\r
+        for (Ncv32u j=0; j<d_pixelMask.stride(); j++)\r
+        {\r
+            if ((i%pixelStep==0) && (j%pixelStep==0) && (j<(Ncv32u)anchorsRoi.width))\r
+            {\r
+                if (!bMaskElements || h_pixelMask.ptr()[i*d_pixelMask.stride()+j] != OBJDET_MASK_ELEMENT_INVALID_32U)\r
+                {\r
+                    h_pixelMask.ptr()[i*d_pixelMask.stride()+j] = (i << 16) | j;\r
+                }\r
+            }\r
+            else\r
+            {\r
+                h_pixelMask.ptr()[i*d_pixelMask.stride()+j] = OBJDET_MASK_ELEMENT_INVALID_32U;\r
+            }\r
+        }\r
+    }\r
+\r
+    NCV_SKIP_COND_END\r
+\r
+#endif\r
+\r
+    NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment(), anchorsRoi.height * d_pixelMask.stride());\r
+    ncvAssertReturn(d_vecPixelMask.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);\r
+\r
+    NCVVectorAlloc<Ncv32u> d_vecPixelMaskTmp(gpuAllocator, d_vecPixelMask.length());\r
+    ncvAssertReturn(d_vecPixelMaskTmp.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+    NCVVectorAlloc<Ncv32u> hp_pool32u(cpuAllocator, 2);\r
+    ncvAssertReturn(hp_pool32u.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+    Ncv32u *hp_zero = &hp_pool32u.ptr()[0];\r
+    Ncv32u *hp_numDet = &hp_pool32u.ptr()[1];\r
+\r
+    NCV_SKIP_COND_BEGIN\r
+    *hp_zero = 0;\r
+    *hp_numDet = 0;\r
+    NCV_SKIP_COND_END\r
+\r
+    Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *\r
+                                          (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));\r
+\r
+    NcvBool bTexCacheCascade = devPropMajor < 2;\r
+    NcvBool bTexCacheIImg = true; //this works better even on Fermi so far\r
+    NcvBool bDoAtomicCompaction = devPropMajor >= 2 || (devPropMajor == 1 && devPropMinor >= 3);\r
+\r
+    NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;\r
+    NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;\r
+\r
+    Ncv32u szNppCompactTmpBuf;\r
+    nppsStCompactGetSize_32u(d_vecPixelMask.length(), &szNppCompactTmpBuf);\r
+    if (bDoAtomicCompaction)\r
+    {\r
+        szNppCompactTmpBuf = 0;\r
+    }\r
+    NCVVectorAlloc<Ncv8u> d_tmpBufCompact(gpuAllocator, szNppCompactTmpBuf);\r
+\r
+    NCV_SKIP_COND_BEGIN\r
+\r
+    if (bTexCacheIImg)\r
+    {\r
+        cudaChannelFormatDesc cfdTexIImage;\r
+        cfdTexIImage = cudaCreateChannelDesc<Ncv32u>();\r
+\r
+        size_t alignmentOffset;\r
+        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, d_integralImage.ptr(), cfdTexIImage,\r
+            (anchorsRoi.height + haar.ClassifierSize.height) * d_integralImage.pitch()), NCV_CUDA_ERROR);\r
+        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);\r
+    }\r
+\r
+    if (bTexCacheCascade)\r
+    {\r
+        cudaChannelFormatDesc cfdTexHaarFeatures;\r
+        cudaChannelFormatDesc cfdTexHaarClassifierNodes;\r
+        cfdTexHaarFeatures = cudaCreateChannelDesc<uint2>();\r
+        cfdTexHaarClassifierNodes = cudaCreateChannelDesc<uint4>();\r
+\r
+        size_t alignmentOffset;\r
+        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarFeatures,\r
+            d_HaarFeatures.ptr(), cfdTexHaarFeatures,sizeof(HaarFeature64) * haar.NumFeatures), NCV_CUDA_ERROR);\r
+        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);\r
+        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarClassifierNodes,\r
+            d_HaarNodes.ptr(), cfdTexHaarClassifierNodes, sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes), NCV_CUDA_ERROR);\r
+        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);\r
+    }\r
+\r
+    Ncv32u stageStartAnchorParallel = 0;\r
+    Ncv32u stageMiddleSwitch = getStageNumWithNotLessThanNclassifiers(NUM_THREADS_CLASSIFIERPARALLEL,\r
+        haar, h_HaarStages);\r
+    Ncv32u stageEndClassifierParallel = haar.NumStages;\r
+    if (stageMiddleSwitch == 0)\r
+    {\r
+        stageMiddleSwitch = 1;\r
+    }\r
+\r
+    //create stages subdivision for pixel-parallel processing\r
+    const Ncv32u compactEveryNstage = bDoAtomicCompaction ? 7 : 1;\r
+    Ncv32u curStop = stageStartAnchorParallel;\r
+    std::vector<Ncv32u> pixParallelStageStops;\r
+    while (curStop < stageMiddleSwitch)\r
+    {\r
+        pixParallelStageStops.push_back(curStop);\r
+        curStop += compactEveryNstage;\r
+    }\r
+    if (curStop > compactEveryNstage && curStop - stageMiddleSwitch > compactEveryNstage / 2)\r
+    {\r
+        pixParallelStageStops[pixParallelStageStops.size()-1] = \r
+            (stageMiddleSwitch - (curStop - 2 * compactEveryNstage)) / 2;\r
+    }\r
+    pixParallelStageStops.push_back(stageMiddleSwitch);\r
+    Ncv32u pixParallelStageStopsIndex = 0;\r
+\r
+    if (pixelStep != 1 || bMaskElements)\r
+    {\r
+        if (bDoAtomicCompaction)\r
+        {\r
+            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),\r
+                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);\r
+            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+        }\r
+\r
+        dim3 gridInit((((anchorsRoi.width + pixelStep - 1) / pixelStep + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),\r
+                        (anchorsRoi.height + pixelStep - 1) / pixelStep);\r
+        dim3 blockInit(NUM_THREADS_ANCHORSPARALLEL);\r
+\r
+        if (gridInit.x == 0 || gridInit.y == 0)\r
+        {\r
+            numDetections = 0;\r
+            return NCV_SUCCESS;\r
+        }\r
+\r
+        initializeMaskVectorDynTemplate(bMaskElements,\r
+                                        bDoAtomicCompaction,\r
+                                        gridInit, blockInit, cuStream,\r
+                                        d_ptrNowData->ptr(),\r
+                                        d_ptrNowTmp->ptr(),\r
+                                        d_vecPixelMask.length(), d_pixelMask.stride(),\r
+                                        anchorsRoi, pixelStep);\r
+        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);\r
+\r
+        if (bDoAtomicCompaction)\r
+        {\r
+            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),\r
+                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);\r
+            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+            swap(d_ptrNowData, d_ptrNowTmp);\r
+        }\r
+        else\r
+        {\r
+            NppStStatus nppSt;\r
+            nppSt = nppsStCompact_32u(d_ptrNowTmp->ptr(), d_vecPixelMask.length(),\r
+                                      d_ptrNowData->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,\r
+                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf);\r
+            ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR);\r
+        }\r
+        numDetections = *hp_numDet;\r
+    }\r
+    else\r
+    {\r
+        //\r
+        // 1. Run the first pixel-input pixel-parallel classifier for few stages\r
+        //\r
+\r
+        if (bDoAtomicCompaction)\r
+        {\r
+            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),\r
+                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);\r
+            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+        }\r
+\r
+        dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),\r
+                   anchorsRoi.height);\r
+        dim3 block1(NUM_THREADS_ANCHORSPARALLEL);\r
+        applyHaarClassifierAnchorParallelDynTemplate(\r
+            true,                         //tbInitMaskPositively\r
+            bTexCacheIImg,                //tbCacheTextureIImg\r
+            bTexCacheCascade,             //tbCacheTextureCascade\r
+            pixParallelStageStops[pixParallelStageStopsIndex] != 0,//tbReadPixelIndexFromVector\r
+            bDoAtomicCompaction,          //tbDoAtomicCompaction\r
+            grid1,\r
+            block1,\r
+            cuStream,\r
+            d_integralImage.ptr(), d_integralImage.stride(),\r
+            d_weights.ptr(), d_weights.stride(),\r
+            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),\r
+            d_ptrNowData->ptr(),\r
+            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),\r
+            0,\r
+            d_pixelMask.stride(),\r
+            anchorsRoi,\r
+            pixParallelStageStops[pixParallelStageStopsIndex],\r
+            pixParallelStageStops[pixParallelStageStopsIndex+1],\r
+            scaleAreaPixels);\r
+        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);\r
+\r
+        if (bDoAtomicCompaction)\r
+        {\r
+            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),\r
+                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);\r
+            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+        }\r
+        else\r
+        {\r
+            NppStStatus nppSt;\r
+            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), d_vecPixelMask.length(),\r
+                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,\r
+                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf);\r
+            ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR);\r
+        }\r
+\r
+        swap(d_ptrNowData, d_ptrNowTmp);\r
+        numDetections = *hp_numDet;\r
+\r
+        pixParallelStageStopsIndex++;\r
+    }\r
+\r
+    //\r
+    // 2. Run pixel-parallel stages\r
+    //\r
+\r
+    for (; pixParallelStageStopsIndex < pixParallelStageStops.size()-1; pixParallelStageStopsIndex++)\r
+    {\r
+        if (numDetections == 0)\r
+        {\r
+            break;\r
+        }\r
+\r
+        if (bDoAtomicCompaction)\r
+        {\r
+            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),\r
+                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);\r
+            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+        }\r
+\r
+        dim3 grid2((numDetections + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL);\r
+        if (numDetections > MAX_GRID_DIM)\r
+        {\r
+            grid2.x = MAX_GRID_DIM;\r
+            grid2.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;\r
+        }\r
+        dim3 block2(NUM_THREADS_ANCHORSPARALLEL);\r
+\r
+        applyHaarClassifierAnchorParallelDynTemplate(\r
+            false,                        //tbInitMaskPositively\r
+            bTexCacheIImg,                //tbCacheTextureIImg\r
+            bTexCacheCascade,             //tbCacheTextureCascade\r
+            pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements,//tbReadPixelIndexFromVector\r
+            bDoAtomicCompaction,          //tbDoAtomicCompaction\r
+            grid2,\r
+            block2,\r
+            cuStream,\r
+            d_integralImage.ptr(), d_integralImage.stride(),\r
+            d_weights.ptr(), d_weights.stride(),\r
+            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),\r
+            d_ptrNowData->ptr(),\r
+            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),\r
+            numDetections,\r
+            d_pixelMask.stride(),\r
+            anchorsRoi,\r
+            pixParallelStageStops[pixParallelStageStopsIndex],\r
+            pixParallelStageStops[pixParallelStageStopsIndex+1],\r
+            scaleAreaPixels);\r
+        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);\r
+\r
+        if (bDoAtomicCompaction)\r
+        {\r
+            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),\r
+                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);\r
+            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+        }\r
+        else\r
+        {\r
+            NppStStatus nppSt;\r
+            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,\r
+                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,\r
+                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf);\r
+            ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR);\r
+        }\r
+\r
+        swap(d_ptrNowData, d_ptrNowTmp);\r
+        numDetections = *hp_numDet;\r
+    }\r
+\r
+    //\r
+    // 3. Run all left stages in one stage-parallel kernel\r
+    //\r
+\r
+    if (numDetections > 0 && stageMiddleSwitch < stageEndClassifierParallel)\r
+    {\r
+        if (bDoAtomicCompaction)\r
+        {\r
+            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),\r
+                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);\r
+            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+        }\r
+\r
+        dim3 grid3(numDetections);\r
+        if (numDetections > MAX_GRID_DIM)\r
+        {\r
+            grid3.x = MAX_GRID_DIM;\r
+            grid3.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;\r
+        }\r
+        dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL);\r
+\r
+        applyHaarClassifierClassifierParallelDynTemplate(\r
+            bTexCacheIImg,                //tbCacheTextureIImg\r
+            bTexCacheCascade,             //tbCacheTextureCascade\r
+            bDoAtomicCompaction,          //tbDoAtomicCompaction\r
+            grid3,\r
+            block3,\r
+            cuStream,\r
+            d_integralImage.ptr(), d_integralImage.stride(),\r
+            d_weights.ptr(), d_weights.stride(),\r
+            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),\r
+            d_ptrNowData->ptr(),\r
+            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),\r
+            numDetections,\r
+            d_pixelMask.stride(),\r
+            anchorsRoi,\r
+            stageMiddleSwitch,\r
+            stageEndClassifierParallel,\r
+            scaleAreaPixels);\r
+        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);\r
+\r
+        if (bDoAtomicCompaction)\r
+        {\r
+            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),\r
+                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);\r
+            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+        }\r
+        else\r
+        {\r
+            NppStStatus nppSt;\r
+            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,\r
+                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,\r
+                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf);\r
+            ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR);\r
+        }\r
+\r
+        swap(d_ptrNowData, d_ptrNowTmp);\r
+        numDetections = *hp_numDet;\r
+    }\r
+\r
+    if (d_ptrNowData != &d_vecPixelMask)\r
+    {\r
+        d_vecPixelMaskTmp.copySolid(d_vecPixelMask, cuStream);\r
+        ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+    }\r
+\r
+#if defined _SELF_TEST_\r
+\r
+    ncvStat = d_pixelMask.copySolid(h_pixelMask_d, 0);\r
+    ncvAssertReturnNcvStat(ncvStat);\r
+    ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+\r
+    if (bDoAtomicCompaction)\r
+    {\r
+        std::sort(h_pixelMask_d.ptr(), h_pixelMask_d.ptr() + numDetections);\r
+    }\r
+\r
+    Ncv32u fpu_oldcw, fpu_cw;\r
+    _controlfp_s(&fpu_cw, 0, 0);\r
+    fpu_oldcw = fpu_cw;\r
+    _controlfp_s(&fpu_cw, _PC_24, _MCW_PC);\r
+    Ncv32u numDetGold;\r
+    ncvStat = ncvApplyHaarClassifierCascade_host(h_integralImage, h_weights, h_pixelMask, numDetGold, haar,\r
+                                                 h_HaarStages, h_HaarNodes, h_HaarFeatures,\r
+                                                 bMaskElements, anchorsRoi, pixelStep, scaleArea);\r
+    ncvAssertReturnNcvStat(ncvStat);\r
+    _controlfp_s(&fpu_cw, fpu_oldcw, _MCW_PC);\r
+\r
+    bool bPass = true;\r
+\r
+    if (numDetGold != numDetections)\r
+    {\r
+        printf("NCVHaarClassifierCascade::applyHaarClassifierCascade numdetections don't match: cpu=%d, gpu=%d\n", numDetGold, numDetections);\r
+        bPass = false;\r
+    }\r
+    else\r
+    {\r
+        for (Ncv32u i=0; i<std::max(numDetGold, numDetections) && bPass; i++)\r
+        {\r
+            if (h_pixelMask.ptr()[i] != h_pixelMask_d.ptr()[i])\r
+            {\r
+                printf("NCVHaarClassifierCascade::applyHaarClassifierCascade self test failed: i=%d, cpu=%d, gpu=%d\n", i, h_pixelMask.ptr()[i], h_pixelMask_d.ptr()[i]);\r
+                bPass = false;\r
+            }\r
+        }\r
+    }\r
+\r
+    printf("NCVHaarClassifierCascade::applyHaarClassifierCascade %s\n", bPass?"PASSED":"FAILED");\r
+#endif\r
+\r
+    NCV_SKIP_COND_END\r
+\r
+    return NCV_SUCCESS;\r
+}\r
+\r
+\r
+//==============================================================================\r
+//\r
+// HypothesesOperations file\r
+//\r
+//==============================================================================\r
+\r
+\r
+const Ncv32u NUM_GROW_THREADS = 128;\r
+\r
+\r
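+//Pixel mask entries encode the anchor position as (y << 16) | x; pixelToRect\r
+//unpacks that value and scales it, together with the classifier window size,\r
+//back into image coordinates for the current detection scale.\r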
+__device__ __host__ NcvRect32u pixelToRect(Ncv32u pixel, Ncv32u width, Ncv32u height, Ncv32f scale)\r
+{\r
+    NcvRect32u res;\r
+    res.x = (Ncv32u)(scale * (pixel & 0xFFFF));\r
+    res.y = (Ncv32u)(scale * (pixel >> 16));\r
+    res.width = (Ncv32u)(scale * width);\r
+    res.height = (Ncv32u)(scale * height);\r
+    return res;\r
+}\r
+\r
+\r
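+//One thread per surviving pixel-mask entry: each thread converts its entry into a\r
+//hypothesis rectangle. The launch below folds grids longer than 65535 blocks into\r
+//a second grid dimension, hence the blockIdx.y * 65535 addressing.\r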
+__global__ void growDetectionsKernel(Ncv32u *pixelMask, Ncv32u numElements,\r
+                                     NcvRect32u *hypotheses,\r
+                                     Ncv32u rectWidth, Ncv32u rectHeight, Ncv32f curScale)\r
+{\r
+    Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;\r
+    Ncv32u elemAddr = blockId * NUM_GROW_THREADS + threadIdx.x;\r
+    if (elemAddr >= numElements)\r
+    {\r
+        return;\r
+    }\r
+    hypotheses[elemAddr] = pixelToRect(pixelMask[elemAddr], rectWidth, rectHeight, curScale);\r
+}\r
+\r
+\r
+NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,\r
+                                         Ncv32u numPixelMaskDetections,\r
+                                         NCVVector<NcvRect32u> &hypotheses,\r
+                                         Ncv32u &totalDetections,\r
+                                         Ncv32u totalMaxDetections,\r
+                                         Ncv32u rectWidth,\r
+                                         Ncv32u rectHeight,\r
+                                         Ncv32f curScale,\r
+                                         cudaStream_t cuStream)\r
+{\r
+    ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);\r
+    ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&\r
+                    pixelMask.memType() == NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);\r
+    ncvAssertReturn(rectWidth > 0 && rectHeight > 0, NCV_INVALID_ROI);\r
+    ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);\r
+    ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&\r
+                    numPixelMaskDetections <= pixelMask.length() &&\r
+                    totalDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);\r
+\r
+    NCVStatus ncvStat = NCV_SUCCESS;\r
+    Ncv32u numDetsToCopy = numPixelMaskDetections;\r
+\r
+    if (numDetsToCopy == 0)\r
+    {\r
+        return ncvStat;\r
+    }\r
+\r
+    if (totalDetections + numPixelMaskDetections > totalMaxDetections)\r
+    {\r
+        ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;\r
+        numDetsToCopy = totalMaxDetections - totalDetections;\r
+    }\r
+\r
+    dim3 block(NUM_GROW_THREADS);\r
+    dim3 grid((numDetsToCopy + NUM_GROW_THREADS - 1) / NUM_GROW_THREADS);\r
+    if (grid.x > 65535)\r
+    {\r
+        grid.y = (grid.x + 65534) / 65535;\r
+        grid.x = 65535;\r
+    }\r
+    growDetectionsKernel<<<grid, block, 0, cuStream>>>(pixelMask.ptr(), numDetsToCopy,\r
+                                                       hypotheses.ptr() + totalDetections,\r
+                                                       rectWidth, rectHeight, curScale);\r
+    ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);\r
+\r
+    totalDetections += numDetsToCopy;\r
+    return ncvStat;\r
+}\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Visualize file\r
+//\r
+//==============================================================================\r
+\r
+\r
+const Ncv32u NUMTHREADS_DRAWRECTS = 32;\r
+const Ncv32u NUMTHREADS_DRAWRECTS_LOG2 = 5;\r
+\r
+\r
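+//Rectangle outlines are drawn with four blocks per rectangle: bit 0 of the block\r
+//index selects a vertical or horizontal edge, bit 1 selects the near or far edge,\r
+//and the threads of a block sweep along that edge in NUMTHREADS_DRAWRECTS chunks.\r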
+template <class T>\r
+__global__ void drawRects(T *d_dst,\r
+                          Ncv32u dstStride,\r
+                          Ncv32u dstWidth,\r
+                          Ncv32u dstHeight,\r
+                          NcvRect32u *d_rects,\r
+                          Ncv32u numRects,\r
+                          T color)\r
+{\r
+    Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;\r
+    if (blockId >= numRects * 4)\r
+    {\r
+        return;\r
+    }\r
+\r
+    NcvRect32u curRect = d_rects[blockId >> 2];\r
+    NcvBool bVertical = blockId & 0x1;\r
+    NcvBool bTopLeft = blockId & 0x2;\r
+\r
+    Ncv32u pt0x, pt0y;\r
+    if (bVertical)\r
+    {\r
+        Ncv32u numChunks = (curRect.height + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;\r
+\r
+        pt0x = bTopLeft ? curRect.x : curRect.x + curRect.width - 1;\r
+        pt0y = curRect.y;\r
+\r
+        if (pt0x < dstWidth)\r
+        {\r
+            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)\r
+            {\r
+                Ncv32u ptY = pt0y + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;\r
+                if (ptY < pt0y + curRect.height && ptY < dstHeight)\r
+                {\r
+                    d_dst[ptY * dstStride + pt0x] = color;\r
+                }\r
+            }\r
+        }\r
+    }\r
+    else\r
+    {\r
+        Ncv32u numChunks = (curRect.width + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;\r
+\r
+        pt0x = curRect.x;\r
+        pt0y = bTopLeft ? curRect.y : curRect.y + curRect.height - 1;\r
+\r
+        if (pt0y < dstHeight)\r
+        {\r
+            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)\r
+            {\r
+                Ncv32u ptX = pt0x + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;\r
+                if (ptX < pt0x + curRect.width && ptX < dstWidth)\r
+                {\r
+                    d_dst[pt0y * dstStride + ptX] = color;\r
+                }\r
+            }\r
+        }\r
+    }\r
+}\r
+\r
+\r
+template <class T>\r
+static NCVStatus drawRectsWrapperDevice(T *d_dst,\r
+                                        Ncv32u dstStride,\r
+                                        Ncv32u dstWidth,\r
+                                        Ncv32u dstHeight,\r
+                                        NcvRect32u *d_rects,\r
+                                        Ncv32u numRects,\r
+                                        T color,\r
+                                        cudaStream_t cuStream)\r
+{\r
+    ncvAssertReturn(d_dst != NULL && d_rects != NULL, NCV_NULL_PTR);\r
+    ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);\r
+    ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);\r
+    ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);\r
+\r
+    if (numRects == 0)\r
+    {\r
+        return NCV_SUCCESS;\r
+    }\r
+\r
+#if defined _SELF_TEST_\r
+    T *h_dst;\r
+    ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR);\r
+    ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);\r
+    NcvRect32u *h_rects;\r
+    ncvAssertCUDAReturn(cudaMallocHost(&h_rects, numRects * sizeof(NcvRect32u)), NCV_CUDA_ERROR);\r
+    ncvAssertCUDAReturn(cudaMemcpy(h_rects, d_rects, numRects * sizeof(NcvRect32u), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);\r
+    ncvAssertReturnNcvStat(drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color));\r
+#endif\r
+\r
+    dim3 grid(numRects * 4);\r
+    dim3 block(NUMTHREADS_DRAWRECTS);\r
+    if (grid.x > 65535)\r
+    {\r
+        grid.y = (grid.x + 65534) / 65535;\r
+        grid.x = 65535;\r
+    }\r
+\r
+    drawRects<T><<<grid, block>>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color);\r
+\r
+    ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);\r
+\r
+#if defined _SELF_TEST_\r
+    T *h_dst_after;\r
+    ncvAssertCUDAReturn(cudaMallocHost(&h_dst_after, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR);\r
+    ncvAssertCUDAReturn(cudaMemcpy(h_dst_after, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);\r
+    bool bPass = true;\r
+    for (Ncv32u i=0; i<dstHeight && bPass; i++)\r
+    {\r
+        for (Ncv32u j=0; j<dstWidth && bPass; j++)\r
+        {\r
+            if (h_dst[i*dstStride+j] != h_dst_after[i*dstStride+j])\r
+            {\r
+                printf("::drawRectsWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, h_dst[i*dstStride+j], h_dst_after[i*dstStride+j]);\r
+                bPass = false;\r
+            }\r
+        }\r
+    }\r
+    ncvAssertCUDAReturn(cudaFreeHost(h_dst_after), NCV_CUDA_ERROR);\r
+    ncvAssertCUDAReturn(cudaFreeHost(h_dst), NCV_CUDA_ERROR);\r
+    ncvAssertCUDAReturn(cudaFreeHost(h_rects), NCV_CUDA_ERROR);\r
+    printf("::drawRectsWrapperDevice %s\n", bPass?"PASSED":"FAILED");\r
+#endif\r
+\r
+    return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,\r
+                                 Ncv32u dstStride,\r
+                                 Ncv32u dstWidth,\r
+                                 Ncv32u dstHeight,\r
+                                 NcvRect32u *d_rects,\r
+                                 Ncv32u numRects,\r
+                                 Ncv8u color,\r
+                                 cudaStream_t cuStream)\r
+{\r
+    return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);\r
+}\r
+\r
+\r
+NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,\r
+                                  Ncv32u dstStride,\r
+                                  Ncv32u dstWidth,\r
+                                  Ncv32u dstHeight,\r
+                                  NcvRect32u *d_rects,\r
+                                  Ncv32u numRects,\r
+                                  Ncv32u color,\r
+                                  cudaStream_t cuStream)\r
+{\r
+    return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);\r
+}\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Pipeline file\r
+//\r
+//==============================================================================\r
+\r
+\r
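+//Full detection pipeline for one frame: build the integral and squared-integral\r
+//images once, then for every scale downsample them, compute the per-window standard\r
+//deviation, run the cascade, and append the surviving anchors to the hypotheses\r
+//vector. Grouping of hypotheses (ncvFilterHypotheses_host) still runs on the CPU.\r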
+NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,\r
+                                            NcvSize32u srcRoi,\r
+                                            NCVVector<NcvRect32u> &d_dstRects,\r
+                                            Ncv32u &dstNumRects,\r
+\r
+                                            HaarClassifierCascadeDescriptor &haar,\r
+                                            NCVVector<HaarStage64> &h_HaarStages,\r
+                                            NCVVector<HaarStage64> &d_HaarStages,\r
+                                            NCVVector<HaarClassifierNode128> &d_HaarNodes,\r
+                                            NCVVector<HaarFeature64> &d_HaarFeatures,\r
+\r
+                                            NcvSize32u minObjSize,\r
+                                            Ncv32u minNeighbors,      //default 4\r
+                                            Ncv32f scaleStep,         //default 1.2f\r
+                                            Ncv32u pixelStep,         //default 1\r
+                                            Ncv32u flags,             //default NCVPipeObjDet_Default\r
+\r
+                                            INCVMemAllocator &gpuAllocator,\r
+                                            INCVMemAllocator &cpuAllocator,\r
+                                            Ncv32u devPropMajor,\r
+                                            Ncv32u devPropMinor,\r
+                                            cudaStream_t cuStream)\r
+{\r
+    ncvAssertReturn(d_srcImg.memType() == d_dstRects.memType() &&\r
+                    d_srcImg.memType() == gpuAllocator.memType() &&\r
+                     (d_srcImg.memType() == NCVMemoryTypeDevice ||\r
+                      d_srcImg.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);\r
+    ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&\r
+                    d_HaarStages.memType() == d_HaarFeatures.memType() &&\r
+                     (d_HaarStages.memType() == NCVMemoryTypeDevice ||\r
+                      d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);\r
+    ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);\r
+    ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);\r
+    ncvAssertReturn((d_srcImg.ptr() != NULL && d_dstRects.ptr() != NULL &&\r
+                     h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&\r
+                     d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);\r
+    ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0 &&\r
+                    d_srcImg.width() >= srcRoi.width && d_srcImg.height() >= srcRoi.height &&\r
+                    srcRoi.width >= minObjSize.width && srcRoi.height >= minObjSize.height &&\r
+                    d_dstRects.length() >= 1, NCV_DIMENSIONS_INVALID);\r
+    ncvAssertReturn(scaleStep > 1.0f, NCV_INVALID_SCALE);\r
+    ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&\r
+                    d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&\r
+                    d_HaarFeatures.length() >= haar.NumFeatures &&\r
+                    d_HaarStages.length() == h_HaarStages.length() &&\r
+                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);\r
+    ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);\r
+    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);\r
+\r
+    //TODO: set NPP active stream to cuStream\r
+\r
+    NCVStatus ncvStat;\r
+    NCV_SET_SKIP_COND(gpuAllocator.isCounting());\r
+\r
+    Ncv32u integralWidth = d_srcImg.width() + 1;\r
+    Ncv32u integralHeight = d_srcImg.height() + 1;\r
+\r
+    NCVMatrixAlloc<Ncv32u> d_integralImage(gpuAllocator, integralWidth, integralHeight);\r
+    ncvAssertReturn(d_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+    NCVMatrixAlloc<Ncv64u> d_sqIntegralImage(gpuAllocator, integralWidth, integralHeight);\r
+    ncvAssertReturn(d_sqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+    NCVMatrixAlloc<Ncv32f> d_rectStdDev(gpuAllocator, d_srcImg.width(), d_srcImg.height());\r
+    ncvAssertReturn(d_rectStdDev.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+    NCVMatrixAlloc<Ncv32u> d_pixelMask(gpuAllocator, d_srcImg.width(), d_srcImg.height());\r
+    ncvAssertReturn(d_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+    NCVMatrixAlloc<Ncv32u> d_scaledIntegralImage(gpuAllocator, integralWidth, integralHeight);\r
+    ncvAssertReturn(d_scaledIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+    NCVMatrixAlloc<Ncv64u> d_scaledSqIntegralImage(gpuAllocator, integralWidth, integralHeight);\r
+    ncvAssertReturn(d_scaledSqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+    NCVVectorAlloc<NcvRect32u> d_hypothesesIntermediate(gpuAllocator, d_srcImg.width() * d_srcImg.height());\r
+    ncvAssertReturn(d_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+    NCVVectorAlloc<NcvRect32u> h_hypothesesIntermediate(cpuAllocator, d_srcImg.width() * d_srcImg.height());\r
+    ncvAssertReturn(h_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+    NppStStatus nppStat;\r
+    Ncv32u szTmpBufIntegral, szTmpBufSqIntegral;\r
+    nppStat = nppiStIntegralGetSize_8u32u(NppStSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral);\r
+    ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);\r
+    nppStat = nppiStSqrIntegralGetSize_8u64u(NppStSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral);\r
+    ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);\r
+    NCVVectorAlloc<Ncv8u> d_tmpIIbuf(gpuAllocator, std::max(szTmpBufIntegral, szTmpBufSqIntegral));\r
+    ncvAssertReturn(d_tmpIIbuf.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);\r
+\r
+    NCV_SKIP_COND_BEGIN\r
+\r
+    nppStat = nppiStIntegral_8u32u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),\r
+                                       d_integralImage.ptr(), d_integralImage.pitch(),\r
+                                       NppStSize32u(d_srcImg.width(), d_srcImg.height()),\r
+                                       d_tmpIIbuf.ptr(), szTmpBufIntegral);\r
+    ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);\r
+\r
+    nppStat = nppiStSqrIntegral_8u64u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),\r
+                                          d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),\r
+                                          NppStSize32u(d_srcImg.width(), d_srcImg.height()),\r
+                                          d_tmpIIbuf.ptr(), szTmpBufSqIntegral);\r
+    ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);\r
+\r
+    NCV_SKIP_COND_END\r
+\r
+    dstNumRects = 0;\r
+\r
+    Ncv32u lastCheckedScale = 0;\r
+    NcvBool bReverseTraverseScale = ((flags & NCVPipeObjDet_FindLargestObject) != 0);\r
+    std::vector<Ncv32u> scalesVector;\r
+\r
+    NcvBool bFoundLargestFace = false;\r
+\r
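+    //Enumerate integer downsampling factors: scaleIter grows geometrically by\r
+    //scaleStep and is truncated to an integer scale; factors that truncate to an\r
+    //already processed value are skipped via lastCheckedScale.\r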
+    for (Ncv32f scaleIter = 1.0f; ; scaleIter *= scaleStep)\r
+    {\r
+        Ncv32u scale = (Ncv32u)scaleIter;\r
+        if (lastCheckedScale == scale)\r
+        {\r
+            continue;\r
+        }\r
+        lastCheckedScale = scale;\r
+\r
+        if (haar.ClassifierSize.width * (Ncv32s)scale < minObjSize.width ||\r
+            haar.ClassifierSize.height * (Ncv32s)scale < minObjSize.height)\r
+        {\r
+            continue;\r
+        }\r
+\r
+        NcvSize32s srcRoi, srcIIRoi, scaledIIRoi, searchRoi;\r
+\r
+        srcRoi.width = d_srcImg.width();\r
+        srcRoi.height = d_srcImg.height();\r
+\r
+        srcIIRoi.width = srcRoi.width + 1;\r
+        srcIIRoi.height = srcRoi.height + 1;\r
+\r
+        scaledIIRoi.width = srcIIRoi.width / scale;\r
+        scaledIIRoi.height = srcIIRoi.height / scale;\r
+\r
+        searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;\r
+        searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;\r
+\r
+        if (searchRoi.width <= 0 || searchRoi.height <= 0)\r
+        {\r
+            break;\r
+        }\r
+\r
+        scalesVector.push_back(scale);\r
+\r
+        if (gpuAllocator.isCounting())\r
+        {\r
+            break;\r
+        }\r
+    }\r
+\r
+    if (bReverseTraverseScale)\r
+    {\r
+        std::reverse(scalesVector.begin(), scalesVector.end());\r
+    }\r
+\r
+    //TODO: handle _fair_scale_ flag\r
+    for (Ncv32u i=0; i<scalesVector.size(); i++)\r
+    {\r
+        Ncv32u scale = scalesVector[i];\r
+\r
+        NcvSize32u srcRoi, scaledIIRoi, searchRoi;\r
+        NppStSize32u srcIIRoi;\r
+        srcRoi.width = d_srcImg.width();\r
+        srcRoi.height = d_srcImg.height();\r
+        srcIIRoi.width = srcRoi.width + 1;\r
+        srcIIRoi.height = srcRoi.height + 1;\r
+        scaledIIRoi.width = srcIIRoi.width / scale;\r
+        scaledIIRoi.height = srcIIRoi.height / scale;\r
+        searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;\r
+        searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;\r
+\r
+        NCV_SKIP_COND_BEGIN\r
+\r
+        nppStat = nppiStDownsampleNearest_32u_C1R(\r
+            d_integralImage.ptr(), d_integralImage.pitch(),\r
+            d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),\r
+            srcIIRoi, scale, true);\r
+        ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);\r
+\r
+        nppStat = nppiStDownsampleNearest_64u_C1R(\r
+            d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),\r
+            d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),\r
+            srcIIRoi, scale, true);\r
+        ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);\r
+\r
+        const NppStRect32u rect(\r
+            HAAR_STDDEV_BORDER,\r
+            HAAR_STDDEV_BORDER,\r
+            haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER,\r
+            haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER);\r
+        nppStat = nppiStRectStdDev_32f_C1R(\r
+            d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),\r
+            d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),\r
+            d_rectStdDev.ptr(), d_rectStdDev.pitch(),\r
+            NppStSize32u(searchRoi.width, searchRoi.height), rect,\r
+            (Ncv32f)scale*scale, true);\r
+        ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);\r
+\r
+        NCV_SKIP_COND_END\r
+\r
+        Ncv32u detectionsOnThisScale;\r
+        ncvStat = ncvApplyHaarClassifierCascade_device(\r
+            d_scaledIntegralImage, d_rectStdDev, d_pixelMask,\r
+            detectionsOnThisScale,\r
+            haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false,\r
+            searchRoi, pixelStep, (Ncv32f)scale*scale,\r
+            gpuAllocator, cpuAllocator, devPropMajor, devPropMinor, cuStream);\r
+        ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);\r
+\r
+        NCV_SKIP_COND_BEGIN\r
+\r
+        NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment());\r
+        ncvStat = ncvGrowDetectionsVector_device(\r
+            d_vecPixelMask,\r
+            detectionsOnThisScale,\r
+            d_hypothesesIntermediate,\r
+            dstNumRects,\r
+            d_hypothesesIntermediate.length(),\r
+            haar.ClassifierSize.width,\r
+            haar.ClassifierSize.height,\r
+            (Ncv32f)scale,\r
+            cuStream);\r
+        ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);\r
+\r
+        if (flags & NCVPipeObjDet_FindLargestObject)\r
+        {\r
+            if (dstNumRects == 0)\r
+            {\r
+                continue;\r
+            }\r
+\r
+            if (dstNumRects != 0)\r
+            {\r
+                ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+                ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,\r
+                                                             dstNumRects * sizeof(NcvRect32u));\r
+                ncvAssertReturnNcvStat(ncvStat);\r
+                ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+            }\r
+\r
+            Ncv32u numStrongHypothesesNow = dstNumRects;\r
+            ncvStat = ncvFilterHypotheses_host(\r
+                h_hypothesesIntermediate,\r
+                numStrongHypothesesNow,\r
+                minNeighbors,\r
+                RECT_SIMILARITY_PROPORTION,\r
+                NULL);\r
+            ncvAssertReturnNcvStat(ncvStat);\r
+\r
+            if (numStrongHypothesesNow > 0)\r
+            {\r
+                NcvRect32u maxRect = h_hypothesesIntermediate.ptr()[0];\r
+                for (Ncv32u j=1; j<numStrongHypothesesNow; j++)\r
+                {\r
+                    if (maxRect.width < h_hypothesesIntermediate.ptr()[j].width)\r
+                    {\r
+                        maxRect = h_hypothesesIntermediate.ptr()[j];\r
+                    }\r
+                }\r
+\r
+                h_hypothesesIntermediate.ptr()[0] = maxRect;\r
+                dstNumRects = 1;\r
+\r
+                ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream, sizeof(NcvRect32u));\r
+                ncvAssertReturnNcvStat(ncvStat);\r
+\r
+                bFoundLargestFace = true;\r
+\r
+                break;\r
+            }\r
+        }\r
+\r
+        NCV_SKIP_COND_END\r
+\r
+        if (gpuAllocator.isCounting())\r
+        {\r
+            break;\r
+        }\r
+    }\r
+\r
+    NCVStatus ncvRetCode = NCV_SUCCESS;\r
+\r
+    NCV_SKIP_COND_BEGIN\r
+\r
+    if (flags & NCVPipeObjDet_FindLargestObject)\r
+    {\r
+        if (!bFoundLargestFace)\r
+        {\r
+            dstNumRects = 0;\r
+        }\r
+    }\r
+    else\r
+    {\r
+        //TODO: move hypotheses filtration to GPU pipeline (the only CPU-resident element of the pipeline left)\r
+        if (dstNumRects != 0)\r
+        {\r
+            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+            ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,\r
+                                                         dstNumRects * sizeof(NcvRect32u));\r
+            ncvAssertReturnNcvStat(ncvStat);\r
+            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+        }\r
+\r
+        ncvStat = ncvFilterHypotheses_host(\r
+            h_hypothesesIntermediate,\r
+            dstNumRects,\r
+            minNeighbors,\r
+            RECT_SIMILARITY_PROPORTION,\r
+            NULL);\r
+        ncvAssertReturnNcvStat(ncvStat);\r
+\r
+        if (dstNumRects > d_dstRects.length())\r
+        {\r
+            ncvRetCode = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;\r
+            dstNumRects = d_dstRects.length();\r
+        }\r
+\r
+        if (dstNumRects != 0)\r
+        {\r
+            ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream,\r
+                                                         dstNumRects * sizeof(NcvRect32u));\r
+            ncvAssertReturnNcvStat(ncvStat);\r
+        }\r
+    }\r
+\r
+    if (flags & NCVPipeObjDet_VisualizeInPlace)\r
+    {\r
+        ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);\r
+        ncvDrawRects_8u_device(d_srcImg.ptr(), d_srcImg.stride(),\r
+                               d_srcImg.width(), d_srcImg.height(),\r
+                               d_dstRects.ptr(), dstNumRects, 255, cuStream);\r
+    }\r
+\r
+    NCV_SKIP_COND_END\r
+\r
+    return ncvRetCode;\r
+}\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Purely Host code: classifier IO, mock-ups\r
+//\r
+//==============================================================================\r
+\r
+\r
+#ifdef _SELF_TEST_\r
+#include <float.h>\r
+#endif\r
+\r
+\r
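+//Reference CPU implementation of the cascade: every anchor is evaluated through all\r
+//stages sequentially. It is also used by the _SELF_TEST_ path of the device version\r
+//to validate GPU results.\r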
+NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,\r
+                                             NCVMatrix<Ncv32f> &h_weights,\r
+                                             NCVMatrixAlloc<Ncv32u> &h_pixelMask,\r
+                                             Ncv32u &numDetections,\r
+                                             HaarClassifierCascadeDescriptor &haar,\r
+                                             NCVVector<HaarStage64> &h_HaarStages,\r
+                                             NCVVector<HaarClassifierNode128> &h_HaarNodes,\r
+                                             NCVVector<HaarFeature64> &h_HaarFeatures,\r
+                                             NcvBool bMaskElements,\r
+                                             NcvSize32u anchorsRoi,\r
+                                             Ncv32u pixelStep,\r
+                                             Ncv32f scaleArea)\r
+{\r
+    ncvAssertReturn(h_integralImage.memType() == h_weights.memType() &&\r
+                    h_integralImage.memType() == h_pixelMask.memType() &&\r
+                     (h_integralImage.memType() == NCVMemoryTypeHostPageable ||\r
+                      h_integralImage.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);\r
+    ncvAssertReturn(h_HaarStages.memType() == h_HaarNodes.memType() &&\r
+                    h_HaarStages.memType() == h_HaarFeatures.memType() &&\r
+                     (h_HaarStages.memType() == NCVMemoryTypeHostPageable ||\r
+                      h_HaarStages.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);\r
+    ncvAssertReturn(h_integralImage.ptr() != NULL && h_weights.ptr() != NULL && h_pixelMask.ptr() != NULL &&\r
+                    h_HaarStages.ptr() != NULL && h_HaarNodes.ptr() != NULL && h_HaarFeatures.ptr() != NULL, NCV_NULL_PTR);\r
+    ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&\r
+                    h_pixelMask.width() >= anchorsRoi.width && h_pixelMask.height() >= anchorsRoi.height &&\r
+                    h_weights.width() >= anchorsRoi.width && h_weights.height() >= anchorsRoi.height &&\r
+                    h_integralImage.width() >= anchorsRoi.width + haar.ClassifierSize.width &&\r
+                    h_integralImage.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);\r
+    ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);\r
+    ncvAssertReturn(h_HaarStages.length() >= haar.NumStages &&\r
+                    h_HaarNodes.length() >= haar.NumClassifierTotalNodes &&\r
+                    h_HaarFeatures.length() >= haar.NumFeatures &&\r
+                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);\r
+    ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);\r
+    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);\r
+\r
+    Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *\r
+                                          (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));\r
+\r
+    for (Ncv32u i=0; i<anchorsRoi.height; i++)\r
+    {\r
+        for (Ncv32u j=0; j<h_pixelMask.stride(); j++)\r
+        {\r
+            if (i % pixelStep != 0 || j % pixelStep != 0 || j >= anchorsRoi.width)\r
+            {\r
+                h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;\r
+            }\r
+            else\r
+            {\r
+                for (Ncv32u iStage = 0; iStage < haar.NumStages; iStage++)\r
+                {\r
+                    Ncv32f curStageSum = 0.0f;\r
+                    Ncv32u numRootNodesInStage = h_HaarStages.ptr()[iStage].getNumClassifierRootNodes();\r
+                    Ncv32u curRootNodeOffset = h_HaarStages.ptr()[iStage].getStartClassifierRootNodeOffset();\r
+\r
+                    if (iStage == 0)\r
+                    {\r
+                        if (bMaskElements && h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)\r
+                        {\r
+                            break;\r
+                        }\r
+                        else\r
+                        {\r
+                            h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = ((i << 16) | j);\r
+                        }\r
+                    }\r
+                    else if (h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)\r
+                    {\r
+                        break;\r
+                    }\r
+\r
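+                    //Evaluate every classifier (tree) of this stage: traverse the\r
+                    //tree, summing weighted rectangle sums from the integral image\r
+                    //at each node, and branch left/right against the node threshold\r
+                    //scaled by the window's normalization weight.\r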
+                    while (numRootNodesInStage--)\r
+                    {\r
+                        NcvBool bMoreNodesToTraverse = true;\r
+                        Ncv32u curNodeOffset = curRootNodeOffset;\r
+\r
+                        while (bMoreNodesToTraverse)\r
+                        {\r
+                            HaarClassifierNode128 curNode = h_HaarNodes.ptr()[curNodeOffset];\r
+                            Ncv32u curNodeFeaturesNum = curNode.getFeatureDesc().getNumFeatures();\r
+                            Ncv32u curNodeFeaturesOffs = curNode.getFeatureDesc().getFeaturesOffset();\r
+\r
+                            Ncv32f curNodeVal = 0.f;\r
+                            for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)\r
+                            {\r
+                                HaarFeature64 feature = h_HaarFeatures.ptr()[curNodeFeaturesOffs + iRect];\r
+                                Ncv32u rectX, rectY, rectWidth, rectHeight;\r
+                                feature.getRect(&rectX, &rectY, &rectWidth, &rectHeight);\r
+                                Ncv32f rectWeight = feature.getWeight();\r
+                                Ncv32u iioffsTL = (i + rectY) * h_integralImage.stride() + (j + rectX);\r
+                                Ncv32u iioffsTR = iioffsTL + rectWidth;\r
+                                Ncv32u iioffsBL = iioffsTL + rectHeight * h_integralImage.stride();\r
+                                Ncv32u iioffsBR = iioffsBL + rectWidth;\r
+\r
+                                Ncv32u iivalTL = h_integralImage.ptr()[iioffsTL];\r
+                                Ncv32u iivalTR = h_integralImage.ptr()[iioffsTR];\r
+                                Ncv32u iivalBL = h_integralImage.ptr()[iioffsBL];\r
+                                Ncv32u iivalBR = h_integralImage.ptr()[iioffsBR];\r
+                                Ncv32u rectSum = iivalBR - iivalBL + iivalTL - iivalTR;\r
+                                curNodeVal += (Ncv32f)rectSum * rectWeight;\r
+                            }\r
+\r
+                            HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();\r
+                            HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();\r
+                            Ncv32f nodeThreshold = curNode.getThreshold();\r
+                            HaarClassifierNodeDescriptor32 nextNodeDescriptor;\r
+\r
+                            if (curNodeVal < scaleAreaPixels * h_weights.ptr()[i * h_weights.stride() + j] * nodeThreshold)\r
+                            {\r
+                                nextNodeDescriptor = nodeLeft;\r
+                            }\r
+                            else\r
+                            {\r
+                                nextNodeDescriptor = nodeRight;\r
+                            }\r
+\r
+                            NcvBool tmpIsLeaf = nextNodeDescriptor.isLeaf();\r
+                            if (tmpIsLeaf)\r
+                            {\r
+                                Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValueHost();\r
+                                curStageSum += tmpLeafValue;\r
+                                bMoreNodesToTraverse = false;\r
+                            }\r
+                            else\r
+                            {\r
+                                curNodeOffset = nextNodeDescriptor.getNextNodeOffset();\r
+                            }\r
+                        }\r
+\r
+                        curRootNodeOffset++;\r
+                    }\r
+\r
+                    Ncv32f tmpStageThreshold = h_HaarStages.ptr()[iStage].getStageThreshold();\r
+                    if (curStageSum < tmpStageThreshold)\r
+                    {\r
+                        //drop\r
+                        h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;\r
+                        break;\r
+                    }\r
+                }\r
+            }\r
+        }\r
+    }\r
+\r
+    std::sort(h_pixelMask.ptr(), h_pixelMask.ptr() + anchorsRoi.height * h_pixelMask.stride());\r
+    Ncv32u i = 0;\r
+    for (; i<anchorsRoi.height * h_pixelMask.stride(); i++)\r
+    {\r
+        if (h_pixelMask.ptr()[i] == OBJDET_MASK_ELEMENT_INVALID_32U)\r
+        {\r
+            break;\r
+        }\r
+    }\r
+    numDetections = i;\r
+\r
+    return NCV_SUCCESS;\r
+}\r
+\r
+\r
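+//Host counterpart of ncvGrowDetectionsVector_device: converts pixel-mask entries\r
+//into hypothesis rectangles with a plain loop instead of a kernel launch.\r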
+NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,\r
+                                       Ncv32u numPixelMaskDetections,\r
+                                       NCVVector<NcvRect32u> &hypotheses,\r
+                                       Ncv32u &totalDetections,\r
+                                       Ncv32u totalMaxDetections,\r
+                                       Ncv32u rectWidth,\r
+                                       Ncv32u rectHeight,\r
+                                       Ncv32f curScale)\r
+{\r
+    ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);\r
+    ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&\r
+                    pixelMask.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);\r
+    ncvAssertReturn(rectWidth > 0 && rectHeight > 0, NCV_INVALID_ROI);\r
+    ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);\r
+    ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&\r
+                    numPixelMaskDetections <= pixelMask.length() &&\r
+                    totalDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);\r
+\r
+    NCVStatus ncvStat = NCV_SUCCESS;\r
+    Ncv32u numDetsToCopy = numPixelMaskDetections;\r
+\r
+    if (numDetsToCopy == 0)\r
+    {\r
+        return ncvStat;\r
+    }\r
+\r
+    if (totalDetections + numPixelMaskDetections > totalMaxDetections)\r
+    {\r
+        ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;\r
+        numDetsToCopy = totalMaxDetections - totalDetections;\r
+    }\r
+\r
+    for (Ncv32u i=0; i<numDetsToCopy; i++)\r
+    {\r
+        hypotheses.ptr()[totalDetections + i] = pixelToRect(pixelMask.ptr()[i], rectWidth, rectHeight, curScale);\r
+    }\r
+\r
+    totalDetections += numDetsToCopy;\r
+    return ncvStat;\r
+}\r
+\r
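+//Groups raw per-scale detections with groupRectangles using the minNeighbors and\r
+//intersectEps parameters and writes the merged rectangles back in place; this is\r
+//the CPU-resident stage referenced by the TODO in the pipeline above.\r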
+NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,\r
+                                   Ncv32u &numHypotheses,\r
+                                   Ncv32u minNeighbors,\r
+                                   Ncv32f intersectEps,\r
+                                   NCVVector<Ncv32u> *hypothesesWeights)\r
+{\r
+    ncvAssertReturn(hypotheses.memType() == NCVMemoryTypeHostPageable ||\r
+                    hypotheses.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);\r
+    if (hypothesesWeights != NULL)\r
+    {\r
+        ncvAssertReturn(hypothesesWeights->memType() == NCVMemoryTypeHostPageable ||\r
+                        hypothesesWeights->memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);\r
+    }\r
+\r
+    if (numHypotheses == 0)\r
+    {\r
+        return NCV_SUCCESS;\r
+    }\r
+\r
+    std::vector<NcvRect32u> rects(numHypotheses);\r
+    memcpy(&rects[0], hypotheses.ptr(), numHypotheses * sizeof(NcvRect32u));\r
+\r
+    std::vector<Ncv32u> weights;\r
+    if (hypothesesWeights != NULL)\r
+    {\r
+        groupRectangles(rects, minNeighbors, intersectEps, &weights);\r
+    }\r
+    else\r
+    {\r
+        groupRectangles(rects, minNeighbors, intersectEps, NULL);\r
+    }\r
+\r
+    numHypotheses = (Ncv32u)rects.size();\r
+    if (numHypotheses > 0)\r
+    {\r
+        memcpy(hypotheses.ptr(), &rects[0], numHypotheses * sizeof(NcvRect32u));\r
+    }\r
+\r
+    if (hypothesesWeights != NULL)\r
+    {\r
+        memcpy(hypothesesWeights->ptr(), &weights[0], numHypotheses * sizeof(Ncv32u));\r
+    }\r
+\r
+    return NCV_SUCCESS;\r
+}\r
+\r
+\r
+template <class T>\r
+static NCVStatus drawRectsWrapperHost(T *h_dst,\r
+                                      Ncv32u dstStride,\r
+                                      Ncv32u dstWidth,\r
+                                      Ncv32u dstHeight,\r
+                                      NcvRect32u *h_rects,\r
+                                      Ncv32u numRects,\r
+                                      T color)\r
+{\r
+    ncvAssertReturn(h_dst != NULL && h_rects != NULL, NCV_NULL_PTR);\r
+    ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);\r
+    ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);\r
+    ncvAssertReturn(numRects != 0, NCV_SUCCESS);\r
+    ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);\r
+\r
+    for (Ncv32u i=0; i<numRects; i++)\r
+    {\r
+        NcvRect32u rect = h_rects[i];\r
+\r
+        if (rect.x < dstWidth)\r
+        {\r
+            for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)\r
+            {\r
+                h_dst[i*dstStride+rect.x] = color;\r
+            }\r
+        }\r
+        if (rect.x+rect.width-1 < dstWidth)\r
+        {\r
+            for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)\r
+            {\r
+                h_dst[i*dstStride+rect.x+rect.width-1] = color;\r
+            }\r
+        }\r
+        if (rect.y < dstHeight)\r
+        {\r
+            for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)\r
+            {\r
+                h_dst[rect.y*dstStride+j] = color;\r
+            }\r
+        }\r
+        if (rect.y + rect.height - 1 < dstHeight)\r
+        {\r
+            for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)\r
+            {\r
+                h_dst[(rect.y+rect.height-1)*dstStride+j] = color;\r
+            }\r
+        }\r
+    }\r
+\r
+    return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,\r
+                               Ncv32u dstStride,\r
+                               Ncv32u dstWidth,\r
+                               Ncv32u dstHeight,\r
+                               NcvRect32u *h_rects,\r
+                               Ncv32u numRects,\r
+                               Ncv8u color)\r
+{\r
+    return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);\r
+}\r
+\r
+\r
+NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,\r
+                                Ncv32u dstStride,\r
+                                Ncv32u dstWidth,\r
+                                Ncv32u dstHeight,\r
+                                NcvRect32u *h_rects,\r
+                                Ncv32u numRects,\r
+                                Ncv32u color)\r
+{\r
+    return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);\r
+}\r
+\r
+\r
+NCVStatus loadFromXML(const std::string &filename,\r
+                      HaarClassifierCascadeDescriptor &haar,\r
+                      std::vector<HaarStage64> &haarStages,\r
+                      std::vector<HaarClassifierNode128> &haarClassifierNodes,\r
+                      std::vector<HaarFeature64> &haarFeatures);\r
+\r
+\r
+#define NVBIN_HAAR_SIZERESERVED     16\r
+#define NVBIN_HAAR_VERSION          0x1\r
+\r
+\r
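+//NVBIN layout: NVBIN_HAAR_SIZERESERVED header bytes (version and total file size),\r
+//followed by the cascade descriptor fields and then the stage, node and feature\r
+//arrays stored back-to-back.\r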
+static NCVStatus loadFromNVBIN(const std::string &filename,\r
+                               HaarClassifierCascadeDescriptor &haar,\r
+                               std::vector<HaarStage64> &haarStages,\r
+                               std::vector<HaarClassifierNode128> &haarClassifierNodes,\r
+                               std::vector<HaarFeature64> &haarFeatures)\r
+{\r
+    FILE *fp;\r
+    fopen_s(&fp, filename.c_str(), "rb");\r
+    ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);\r
+    Ncv32u fileVersion;\r
+    fread_s(&fileVersion, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp);\r
+    ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);\r
+    Ncv32u fsize;\r
+    fread_s(&fsize, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp);\r
+    fseek(fp, 0, SEEK_END);\r
+    Ncv32u fsizeActual = ftell(fp);\r
+    ncvAssertReturn(fsize == fsizeActual, NCV_FILE_ERROR);\r
+\r
+    std::vector<unsigned char> fdata;\r
+    fdata.resize(fsize);\r
+    Ncv32u dataOffset = 0;\r
+    fseek(fp, 0, SEEK_SET);\r
+    fread_s(&fdata[0], fsize, fsize, 1, fp);\r
+    fclose(fp);\r
+\r
+    //data\r
+    dataOffset = NVBIN_HAAR_SIZERESERVED;\r
+    haar.NumStages = *(Ncv32u *)(&fdata[0]+dataOffset);\r
+    dataOffset += sizeof(Ncv32u);\r
+    haar.NumClassifierRootNodes = *(Ncv32u *)(&fdata[0]+dataOffset);\r
+    dataOffset += sizeof(Ncv32u);\r
+    haar.NumClassifierTotalNodes = *(Ncv32u *)(&fdata[0]+dataOffset);\r
+    dataOffset += sizeof(Ncv32u);\r
+    haar.NumFeatures = *(Ncv32u *)(&fdata[0]+dataOffset);\r
+    dataOffset += sizeof(Ncv32u);\r
+    haar.ClassifierSize = *(NcvSize32u *)(&fdata[0]+dataOffset);\r
+    dataOffset += sizeof(NcvSize32u);\r
+    haar.bNeedsTiltedII = *(NcvBool *)(&fdata[0]+dataOffset);\r
+    dataOffset += sizeof(NcvBool);\r
+    haar.bHasStumpsOnly = *(NcvBool *)(&fdata[0]+dataOffset);\r
+    dataOffset += sizeof(NcvBool);\r
+\r
+    haarStages.resize(haar.NumStages);\r
+    haarClassifierNodes.resize(haar.NumClassifierTotalNodes);\r
+    haarFeatures.resize(haar.NumFeatures);\r
+\r
+    Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);\r
+    Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);\r
+    Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);\r
+\r
+    memcpy(&haarStages[0], &fdata[0]+dataOffset, szStages);\r
+    dataOffset += szStages;\r
+    memcpy(&haarClassifierNodes[0], &fdata[0]+dataOffset, szClassifiers);\r
+    dataOffset += szClassifiers;\r
+    memcpy(&haarFeatures[0], &fdata[0]+dataOffset, szFeatures);\r
+    dataOffset += szFeatures;\r
+\r
+    return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,\r
+                                   Ncv32u &numNodes, Ncv32u &numFeatures)\r
+{\r
+    NCVStatus ncvStat;\r
+\r
+    std::string fext = filename.substr(filename.find_last_of(".") + 1);\r
+    std::transform(fext.begin(), fext.end(), fext.begin(), ::tolower);\r
+\r
+    if (fext == "nvbin")\r
+    {\r
+        FILE *fp;\r
+        fopen_s(&fp, filename.c_str(), "rb");\r
+        ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);\r
+        Ncv32u fileVersion;\r
+        fread_s(&fileVersion, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp);\r
+        ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);\r
+        fseek(fp, NVBIN_HAAR_SIZERESERVED, SEEK_SET);\r
+        Ncv32u tmp;\r
+        fread_s(&numStages, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp);\r
+        fread_s(&tmp, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp);\r
+        fread_s(&numNodes, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp);\r
+        fread_s(&numFeatures, sizeof(Ncv32u), sizeof(Ncv32u), 1, fp);\r
+        fclose(fp);\r
+    }\r
+    else if (fext == "xml")\r
+    {\r
+        HaarClassifierCascadeDescriptor haar;\r
+        std::vector<HaarStage64> haarStages;\r
+        std::vector<HaarClassifierNode128> haarNodes;\r
+        std::vector<HaarFeature64> haarFeatures;\r
+\r
+        ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);\r
+        ncvAssertReturnNcvStat(ncvStat);\r
+\r
+        numStages = haar.NumStages;\r
+        numNodes = haar.NumClassifierTotalNodes;\r
+        numFeatures = haar.NumFeatures;\r
+    }\r
+    else\r
+    {\r
+        return NCV_HAAR_XML_LOADING_EXCEPTION;\r
+    }\r
+\r
+    return NCV_SUCCESS;\r
+}\r
+\r
+\r
+NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,\r
+                                   HaarClassifierCascadeDescriptor &haar,\r
+                                   NCVVector<HaarStage64> &h_HaarStages,\r
+                                   NCVVector<HaarClassifierNode128> &h_HaarNodes,\r
+                                   NCVVector<HaarFeature64> &h_HaarFeatures)\r
+{\r
+    ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&\r
+                    h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&\r
+                    h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);\r
+\r
+    NCVStatus ncvStat;\r
+\r
+    std::string fext = filename.substr(filename.find_last_of(".") + 1);\r
+    std::transform(fext.begin(), fext.end(), fext.begin(), ::tolower);\r
+\r
+    std::vector<HaarStage64> haarStages;\r
+    std::vector<HaarClassifierNode128> haarNodes;\r
+    std::vector<HaarFeature64> haarFeatures;\r
+\r
+    if (fext == "nvbin")\r
+    {\r
+        ncvStat = loadFromNVBIN(filename, haar, haarStages, haarNodes, haarFeatures);\r
+        ncvAssertReturnNcvStat(ncvStat);\r
+    }\r
+    else if (fext == "xml")\r
+    {\r
+        ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);\r
+        ncvAssertReturnNcvStat(ncvStat);\r
+    }\r
+    else\r
+    {\r
+        return NCV_HAAR_XML_LOADING_EXCEPTION;\r
+    }\r
+\r
+    ncvAssertReturn(h_HaarStages.length() >= haarStages.size(), NCV_MEM_INSUFFICIENT_CAPACITY);\r
+    ncvAssertReturn(h_HaarNodes.length() >= haarNodes.size(), NCV_MEM_INSUFFICIENT_CAPACITY);\r
+    ncvAssertReturn(h_HaarFeatures.length() >= haarFeatures.size(), NCV_MEM_INSUFFICIENT_CAPACITY);\r
+\r
+    memcpy(h_HaarStages.ptr(), &haarStages[0], haarStages.size()*sizeof(HaarStage64));\r
+    memcpy(h_HaarNodes.ptr(), &haarNodes[0], haarNodes.size()*sizeof(HaarClassifierNode128));\r
+    memcpy(h_HaarFeatures.ptr(), &haarFeatures[0], haarFeatures.size()*sizeof(HaarFeature64));\r
+\r
+    return NCV_SUCCESS;\r
+}\r
+\r
+\r
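+//Serializes a cascade into the NVBIN layout read by loadFromNVBIN; the total file\r
+//size is patched into the header after the payload has been assembled.\r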
+NCVStatus ncvHaarStoreNVBIN_host(std::string &filename,\r
+                                 HaarClassifierCascadeDescriptor haar,\r
+                                 NCVVector<HaarStage64> &h_HaarStages,\r
+                                 NCVVector<HaarClassifierNode128> &h_HaarNodes,\r
+                                 NCVVector<HaarFeature64> &h_HaarFeatures)\r
+{\r
+    ncvAssertReturn(h_HaarStages.length() >= haar.NumStages, NCV_INCONSISTENT_INPUT);\r
+    ncvAssertReturn(h_HaarNodes.length() >= haar.NumClassifierTotalNodes, NCV_INCONSISTENT_INPUT);\r
+    ncvAssertReturn(h_HaarFeatures.length() >= haar.NumFeatures, NCV_INCONSISTENT_INPUT);\r
+    ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&\r
+                    h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&\r
+                    h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);\r
+\r
+    Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);\r
+    Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);\r
+    Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);\r
+\r
+    Ncv32u dataOffset = 0;\r
+    std::vector<unsigned char> fdata;\r
+    fdata.resize(szStages+szClassifiers+szFeatures+1024, 0);\r
+\r
+    //header\r
+    *(Ncv32u *)(&fdata[0]+dataOffset) = NVBIN_HAAR_VERSION;\r
+\r
+    //data\r
+    dataOffset = NVBIN_HAAR_SIZERESERVED;\r
+    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumStages;\r
+    dataOffset += sizeof(Ncv32u);\r
+    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierRootNodes;\r
+    dataOffset += sizeof(Ncv32u);\r
+    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierTotalNodes;\r
+    dataOffset += sizeof(Ncv32u);\r
+    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumFeatures;\r
+    dataOffset += sizeof(Ncv32u);\r
+    *(NcvSize32u *)(&fdata[0]+dataOffset) = haar.ClassifierSize;\r
+    dataOffset += sizeof(NcvSize32u);\r
+    *(NcvBool *)(&fdata[0]+dataOffset) = haar.bNeedsTiltedII;\r
+    dataOffset += sizeof(NcvBool);\r
+    *(NcvBool *)(&fdata[0]+dataOffset) = haar.bHasStumpsOnly;\r
+    dataOffset += sizeof(NcvBool);\r
+\r
+    memcpy(&fdata[0]+dataOffset, h_HaarStages.ptr(), szStages);\r
+    dataOffset += szStages;\r
+    memcpy(&fdata[0]+dataOffset, h_HaarNodes.ptr(), szClassifiers);\r
+    dataOffset += szClassifiers;\r
+    memcpy(&fdata[0]+dataOffset, h_HaarFeatures.ptr(), szFeatures);\r
+    dataOffset += szFeatures;\r
+    Ncv32u fsize = dataOffset;\r
+\r
+    //TODO: CRC32 here\r
+\r
+    //update header\r
+    dataOffset = sizeof(Ncv32u);\r
+    *(Ncv32u *)(&fdata[0]+dataOffset) = fsize;\r
+\r
+    FILE *fp;\r
+    fopen_s(&fp, filename.c_str(), "wb");\r
+    ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);\r
+    fwrite(&fdata[0], fsize, 1, fp);\r
+    fclose(fp);\r
+    return NCV_SUCCESS;\r
+}\r
diff --git a/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp b/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
new file mode 100644 (file)
index 0000000..d9f500d
--- /dev/null
@@ -0,0 +1,501 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. \r
+// \r
+//  By downloading, copying, installing or using the software you agree to this license.\r
+//  If you do not agree to this license, do not download, install,\r
+//  copy or use the software.\r
+//\r
+//\r
+//                           License Agreement\r
+//                For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+//   * Redistribution's of source code must retain the above copyright notice,\r
+//     this list of conditions and the following disclaimer.\r
+//\r
+//   * Redistribution's in binary form must reproduce the above copyright notice,\r
+//     this list of conditions and the following disclaimer in the documentation\r
+//     and/or other materials provided with the distribution.\r
+//\r
+//   * The name of the copyright holders may not be used to endorse or promote products\r
+//     derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// NVIDIA CUDA implementation of Viola-Jones Object Detection Framework\r
+//\r
+// The algorithm and code are explained in the upcoming GPU Computing Gems\r
+// chapter in detail:\r
+//\r
+//   Anton Obukhov, "Haar Classifiers for Object Detection with CUDA"\r
+//   PDF URL placeholder\r
+//   email: aobukhov@nvidia.com, devsupport@nvidia.com\r
+//\r
+// Credits for help with the code to:\r
+// Alexey Mendelenko, Cyril Crassin, and Mikhail Smirnov.\r
+//\r
+////////////////////////////////////////////////////////////////////////////////\r
+\r
+#ifndef _ncvhaarobjectdetection_hpp_\r
+#define _ncvhaarobjectdetection_hpp_\r
+\r
+#include <string>\r
+#include "NCV.hpp"\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Guaranteed size cross-platform classifier structures\r
+//\r
+//==============================================================================\r
+\r
+\r
+struct HaarFeature64\r
+{\r
+    uint2 _ui2;\r
+\r
+#define HaarFeature64_CreateCheck_MaxRectField                  0xFF\r
+\r
+    __host__ NCVStatus setRect(Ncv32u rectX, Ncv32u rectY, Ncv32u rectWidth, Ncv32u rectHeight, Ncv32u clsWidth, Ncv32u clsHeight)\r
+    {\r
+        ncvAssertReturn(rectWidth <= HaarFeature64_CreateCheck_MaxRectField && rectHeight <= HaarFeature64_CreateCheck_MaxRectField, NCV_HAAR_TOO_LARGE_FEATURES);\r
+        ((NcvRect8u*)&(this->_ui2.x))->x = rectX;\r
+        ((NcvRect8u*)&(this->_ui2.x))->y = rectY;\r
+        ((NcvRect8u*)&(this->_ui2.x))->width = rectWidth;\r
+        ((NcvRect8u*)&(this->_ui2.x))->height = rectHeight;\r
+        return NCV_SUCCESS;\r
+    }\r
+\r
+    __host__ NCVStatus setWeight(Ncv32f weight)\r
+    {\r
+        ((Ncv32f*)&(this->_ui2.y))[0] = weight;\r
+        return NCV_SUCCESS;\r
+    }\r
+\r
+    __device__ __host__ void getRect(Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)\r
+    {\r
+        NcvRect8u tmpRect = *(NcvRect8u*)(&this->_ui2.x);\r
+        *rectX = tmpRect.x;\r
+        *rectY = tmpRect.y;\r
+        *rectWidth = tmpRect.width;\r
+        *rectHeight = tmpRect.height;\r
+    }\r
+\r
+    __device__ __host__ Ncv32f getWeight(void)\r
+    {\r
+        return *(Ncv32f*)(&this->_ui2.y);\r
+    }\r
+};\r
+\r
+\r
+struct HaarFeatureDescriptor32\r
+{\r
+private:\r
+\r
+#define HaarFeatureDescriptor32_Interpret_MaskFlagTilted        0x80000000\r
+#define HaarFeatureDescriptor32_CreateCheck_MaxNumFeatures      0x7F\r
+#define HaarFeatureDescriptor32_NumFeatures_Shift               24\r
+#define HaarFeatureDescriptor32_CreateCheck_MaxFeatureOffset    0x00FFFFFF\r
+\r
+    Ncv32u desc;\r
+\r
+public:\r
+\r
+    __host__ NCVStatus create(NcvBool bTilted, Ncv32u numFeatures, Ncv32u offsetFeatures)\r
+    {\r
+        if (numFeatures > HaarFeatureDescriptor32_CreateCheck_MaxNumFeatures)\r
+        {\r
+            return NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER;\r
+        }\r
+        if (offsetFeatures > HaarFeatureDescriptor32_CreateCheck_MaxFeatureOffset)\r
+        {\r
+            return NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE;\r
+        }\r
+        this->desc = 0;\r
+        this->desc |= (bTilted ? HaarFeatureDescriptor32_Interpret_MaskFlagTilted : 0);\r
+        this->desc |= (numFeatures << HaarFeatureDescriptor32_NumFeatures_Shift);\r
+        this->desc |= offsetFeatures;\r
+        return NCV_SUCCESS;\r
+    }\r
+\r
+    __device__ __host__ NcvBool isTilted(void)\r
+    {\r
+        return (this->desc & HaarFeatureDescriptor32_Interpret_MaskFlagTilted) != 0;\r
+    }\r
+\r
+    __device__ __host__ Ncv32u getNumFeatures(void)\r
+    {\r
+        return (this->desc & ~HaarFeatureDescriptor32_Interpret_MaskFlagTilted) >> HaarFeatureDescriptor32_NumFeatures_Shift;\r
+    }\r
+\r
+    __device__ __host__ Ncv32u getFeaturesOffset(void)\r
+    {\r
+        return this->desc & HaarFeatureDescriptor32_CreateCheck_MaxFeatureOffset;\r
+    }\r
+};\r
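
This descriptor packs three fields into one 32-bit word: the tilted flag in bit 31, the feature count in bits 30..24 (hence the 0x7F limit), and the offset of the first feature in bits 23..0. A minimal round-trip, purely for illustration (the numbers are made up):

    HaarFeatureDescriptor32 featDesc;
    // two rectangles for this classifier, starting at index 117 of the features array
    NCVStatus ncvStat = featDesc.create(/*bTilted=*/false, /*numFeatures=*/2, /*offsetFeatures=*/117);
    // on NCV_SUCCESS:
    //   featDesc.isTilted()          == false
    //   featDesc.getNumFeatures()    == 2
    //   featDesc.getFeaturesOffset() == 117
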
+\r
+\r
+struct HaarClassifierNodeDescriptor32\r
+{\r
+    uint1 _ui1;\r
+\r
+#define HaarClassifierNodeDescriptor32_Interpret_MaskSwitch     (1 << 30)\r
+\r
+    __host__ NCVStatus create(Ncv32f leafValue)\r
+    {\r
+        if ((*(Ncv32u *)&leafValue) & HaarClassifierNodeDescriptor32_Interpret_MaskSwitch)\r
+        {\r
+            return NCV_HAAR_XML_LOADING_EXCEPTION;\r
+        }\r
+        *(Ncv32f *)&this->_ui1 = leafValue;\r
+        return NCV_SUCCESS;\r
+    }\r
+\r
+    __host__ NCVStatus create(Ncv32u offsetHaarClassifierNode)\r
+    {\r
+        if (offsetHaarClassifierNode >= HaarClassifierNodeDescriptor32_Interpret_MaskSwitch)\r
+        {\r
+            return NCV_HAAR_XML_LOADING_EXCEPTION;\r
+        }\r
+        this->_ui1.x = offsetHaarClassifierNode;\r
+        this->_ui1.x |= HaarClassifierNodeDescriptor32_Interpret_MaskSwitch;\r
+        return NCV_SUCCESS;\r
+    }\r
+\r
+    __device__ __host__ NcvBool isLeaf(void)\r
+    {\r
+        return !(this->_ui1.x & HaarClassifierNodeDescriptor32_Interpret_MaskSwitch);\r
+    }\r
+\r
+    __host__ Ncv32f getLeafValueHost(void)\r
+    {\r
+        return *(Ncv32f *)&this->_ui1.x;\r
+    }\r
+\r
+#ifdef __CUDACC__\r
+    __device__ Ncv32f getLeafValue(void)\r
+    {\r
+        return __int_as_float(this->_ui1.x);\r
+    }\r
+#endif\r
+\r
+    __device__ __host__ Ncv32u getNextNodeOffset(void)\r
+    {\r
+        return (this->_ui1.x & ~HaarClassifierNodeDescriptor32_Interpret_MaskSwitch);\r
+    }\r
+};\r
+\r
+\r
+struct HaarClassifierNode128\r
+{\r
+    uint4 _ui4;\r
+\r
+    __host__ NCVStatus setFeatureDesc(HaarFeatureDescriptor32 f)\r
+    {\r
+        this->_ui4.x = *(Ncv32u *)&f;\r
+        return NCV_SUCCESS;\r
+    }\r
+\r
+    __host__ NCVStatus setThreshold(Ncv32f t)\r
+    {\r
+        this->_ui4.y = *(Ncv32u *)&t;\r
+        return NCV_SUCCESS;\r
+    }\r
+\r
+    __host__ NCVStatus setLeftNodeDesc(HaarClassifierNodeDescriptor32 nl)\r
+    {\r
+        this->_ui4.z = *(Ncv32u *)&nl;\r
+        return NCV_SUCCESS;\r
+    }\r
+\r
+    __host__ NCVStatus setRightNodeDesc(HaarClassifierNodeDescriptor32 nr)\r
+    {\r
+        this->_ui4.w = *(Ncv32u *)&nr;\r
+        return NCV_SUCCESS;\r
+    }\r
+\r
+    __host__ __device__ HaarFeatureDescriptor32 getFeatureDesc(void)\r
+    {\r
+        return *(HaarFeatureDescriptor32 *)&this->_ui4.x;\r
+    }\r
+\r
+    __host__ __device__ Ncv32f getThreshold(void)\r
+    {\r
+        return *(Ncv32f*)&this->_ui4.y;\r
+    }\r
+\r
+    __host__ __device__ HaarClassifierNodeDescriptor32 getLeftNodeDesc(void)\r
+    {\r
+        return *(HaarClassifierNodeDescriptor32 *)&this->_ui4.z;\r
+    }\r
+\r
+    __host__ __device__ HaarClassifierNodeDescriptor32 getRightNodeDesc(void)\r
+    {\r
+        return *(HaarClassifierNodeDescriptor32 *)&this->_ui4.w;\r
+    }\r
+};\r
+\r
+\r
+struct HaarStage64\r
+{\r
+#define HaarStage64_Interpret_MaskRootNodes         0x0000FFFF\r
+#define HaarStage64_Interpret_MaskRootNodeOffset    0xFFFF0000\r
+#define HaarStage64_Interpret_ShiftRootNodeOffset   16\r
+\r
+    uint2 _ui2;\r
+\r
+    __host__ NCVStatus setStageThreshold(Ncv32f t)\r
+    {\r
+        this->_ui2.x = *(Ncv32u *)&t;\r
+        return NCV_SUCCESS;\r
+    }\r
+\r
+    __host__ NCVStatus setStartClassifierRootNodeOffset(Ncv32u val)\r
+    {\r
+        if (val > (HaarStage64_Interpret_MaskRootNodeOffset >> HaarStage64_Interpret_ShiftRootNodeOffset))\r
+        {\r
+            return NCV_HAAR_XML_LOADING_EXCEPTION;\r
+        }\r
+        this->_ui2.y = (val << HaarStage64_Interpret_ShiftRootNodeOffset) | (this->_ui2.y & HaarStage64_Interpret_MaskRootNodes);\r
+        return NCV_SUCCESS;\r
+    }\r
+\r
+    __host__ NCVStatus setNumClassifierRootNodes(Ncv32u val)\r
+    {\r
+        if (val > HaarStage64_Interpret_MaskRootNodes)\r
+        {\r
+            return NCV_HAAR_XML_LOADING_EXCEPTION;\r
+        }\r
+        this->_ui2.y = val | (this->_ui2.y & HaarStage64_Interpret_MaskRootNodeOffset);\r
+        return NCV_SUCCESS;\r
+    }\r
+\r
+    __host__ __device__ Ncv32f getStageThreshold(void)\r
+    {\r
+        return *(Ncv32f*)&this->_ui2.x;\r
+    }\r
+\r
+    __host__ __device__ Ncv32u getStartClassifierRootNodeOffset(void)\r
+    {\r
+        return (this->_ui2.y >> HaarStage64_Interpret_ShiftRootNodeOffset);\r
+    }\r
+\r
+    __host__ __device__ Ncv32u getNumClassifierRootNodes(void)\r
+    {\r
+        return (this->_ui2.y & HaarStage64_Interpret_MaskRootNodes);\r
+    }\r
+};\r
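
HaarStage64 uses a similar trick: the stage threshold is stored bit-exactly as a float in the first word, while the second word holds the offset of the stage's first root node in its upper 16 bits and the number of root nodes in its lower 16 bits. For illustration (the values are arbitrary):

    HaarStage64 stage;
    stage.setStageThreshold(0.82f);               // float bits go into _ui2.x
    stage.setStartClassifierRootNodeOffset(256);  // upper 16 bits of _ui2.y
    stage.setNumClassifierRootNodes(22);          // lower 16 bits of _ui2.y
    // stage.getStageThreshold()                == 0.82f
    // stage.getStartClassifierRootNodeOffset() == 256
    // stage.getNumClassifierRootNodes()        == 22
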
+\r
+\r
+NPPST_CT_ASSERT(sizeof(HaarFeature64) == 8);\r
+NPPST_CT_ASSERT(sizeof(HaarFeatureDescriptor32) == 4);\r
+NPPST_CT_ASSERT(sizeof(HaarClassifierNodeDescriptor32) == 4);\r
+NPPST_CT_ASSERT(sizeof(HaarClassifierNode128) == 16);\r
+NPPST_CT_ASSERT(sizeof(HaarStage64) == 8);\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Classifier cascade descriptor\r
+//\r
+//==============================================================================\r
+\r
+\r
+struct HaarClassifierCascadeDescriptor\r
+{\r
+    Ncv32u NumStages;\r
+    Ncv32u NumClassifierRootNodes;\r
+    Ncv32u NumClassifierTotalNodes;\r
+    Ncv32u NumFeatures;\r
+    NcvSize32u ClassifierSize;\r
+    NcvBool bNeedsTiltedII;\r
+    NcvBool bHasStumpsOnly;\r
+};\r
+\r
+\r
+//==============================================================================\r
+//\r
+// Functional interface\r
+//\r
+//==============================================================================\r
+\r
+\r
+enum\r
+{\r
+    NCVPipeObjDet_Default               = 0x000,\r
+    NCVPipeObjDet_UseFairImageScaling   = 0x001,\r
+    NCVPipeObjDet_FindLargestObject     = 0x002,\r
+    NCVPipeObjDet_VisualizeInPlace      = 0x004,\r
+};\r
+\r
+\r
+NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,\r
+                                            NcvSize32u srcRoi,\r
+                                            NCVVector<NcvRect32u> &d_dstRects,\r
+                                            Ncv32u &dstNumRects,\r
+\r
+                                            HaarClassifierCascadeDescriptor &haar,\r
+                                            NCVVector<HaarStage64> &h_HaarStages,\r
+                                            NCVVector<HaarStage64> &d_HaarStages,\r
+                                            NCVVector<HaarClassifierNode128> &d_HaarNodes,\r
+                                            NCVVector<HaarFeature64> &d_HaarFeatures,\r
+\r
+                                            NcvSize32u minObjSize,\r
+                                            Ncv32u minNeighbors,      //default 4\r
+                                            Ncv32f scaleStep,         //default 1.2f\r
+                                            Ncv32u pixelStep,         //default 1\r
+                                            Ncv32u flags,             //default NCVPipeObjDet_Default\r
+\r
+                                            INCVMemAllocator &gpuAllocator,\r
+                                            INCVMemAllocator &cpuAllocator,\r
+                                            Ncv32u devPropMajor,\r
+                                            Ncv32u devPropMinor,\r
+                                            cudaStream_t cuStream);\r
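
A call to this entry point might look roughly as follows. This is a sketch under assumptions: the buffers are presumed to have been created with the NCVMatrixAlloc / NCVVectorAlloc helpers from NCV.hpp in this patch, gpuAllocator / cpuAllocator are any INCVMemAllocator implementations from the same header, and devProp comes from cudaGetDeviceProperties.

    NcvSize32u srcRoi;
    srcRoi.width  = d_srcImg.width();             // d_srcImg: NCVMatrix<Ncv8u> holding the grayscale frame
    srcRoi.height = d_srcImg.height();

    Ncv32u numDetections = 0;
    NCVStatus ncvStat = ncvDetectObjectsMultiScale_device(
        d_srcImg, srcRoi,
        d_rects, numDetections,                   // d_rects: NCVVector<NcvRect32u> for the results
        haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures,
        haar.ClassifierSize,                      // do not search below the training window size
        4,                                        // minNeighbors
        1.2f,                                     // scaleStep
        1,                                        // pixelStep
        NCVPipeObjDet_Default,
        gpuAllocator, cpuAllocator,
        devProp.major, devProp.minor,
        0);                                       // default CUDA stream
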
+\r
+\r
+#define OBJDET_MASK_ELEMENT_INVALID_32U     0xFFFFFFFF\r
+#define HAAR_STDDEV_BORDER                  1\r
+\r
+\r
+NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,\r
+                                               NCVMatrix<Ncv32f> &d_weights,\r
+                                               NCVMatrixAlloc<Ncv32u> &d_pixelMask,\r
+                                               Ncv32u &numDetections,\r
+                                               HaarClassifierCascadeDescriptor &haar,\r
+                                               NCVVector<HaarStage64> &h_HaarStages,\r
+                                               NCVVector<HaarStage64> &d_HaarStages,\r
+                                               NCVVector<HaarClassifierNode128> &d_HaarNodes,\r
+                                               NCVVector<HaarFeature64> &d_HaarFeatures,\r
+                                               NcvBool bMaskElements,\r
+                                               NcvSize32u anchorsRoi,\r
+                                               Ncv32u pixelStep,\r
+                                               Ncv32f scaleArea,\r
+                                               INCVMemAllocator &gpuAllocator,\r
+                                               INCVMemAllocator &cpuAllocator,\r
+                                               Ncv32u devPropMajor,\r
+                                               Ncv32u devPropMinor,\r
+                                               cudaStream_t cuStream);\r
+\r
+\r
+NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,\r
+                                             NCVMatrix<Ncv32f> &h_weights,\r
+                                             NCVMatrixAlloc<Ncv32u> &h_pixelMask,\r
+                                             Ncv32u &numDetections,\r
+                                             HaarClassifierCascadeDescriptor &haar,\r
+                                             NCVVector<HaarStage64> &h_HaarStages,\r
+                                             NCVVector<HaarClassifierNode128> &h_HaarNodes,\r
+                                             NCVVector<HaarFeature64> &h_HaarFeatures,\r
+                                             NcvBool bMaskElements,\r
+                                             NcvSize32u anchorsRoi,\r
+                                             Ncv32u pixelStep,\r
+                                             Ncv32f scaleArea);\r
+\r
+\r
+NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,\r
+                                 Ncv32u dstStride,\r
+                                 Ncv32u dstWidth,\r
+                                 Ncv32u dstHeight,\r
+                                 NcvRect32u *d_rects,\r
+                                 Ncv32u numRects,\r
+                                 Ncv8u color,\r
+                                 cudaStream_t cuStream);\r
+\r
+\r
+NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,\r
+                                  Ncv32u dstStride,\r
+                                  Ncv32u dstWidth,\r
+                                  Ncv32u dstHeight,\r
+                                  NcvRect32u *d_rects,\r
+                                  Ncv32u numRects,\r
+                                  Ncv32u color,\r
+                                  cudaStream_t cuStream);\r
+\r
+\r
+NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,\r
+                               Ncv32u dstStride,\r
+                               Ncv32u dstWidth,\r
+                               Ncv32u dstHeight,\r
+                               NcvRect32u *h_rects,\r
+                               Ncv32u numRects,\r
+                               Ncv8u color);\r
+\r
+\r
+NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,\r
+                                Ncv32u dstStride,\r
+                                Ncv32u dstWidth,\r
+                                Ncv32u dstHeight,\r
+                                NcvRect32u *h_rects,\r
+                                Ncv32u numRects,\r
+                                Ncv32u color);\r
+\r
+\r
+#define RECT_SIMILARITY_PROPORTION      0.2f\r
+\r
+\r
+NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,\r
+                                         Ncv32u numPixelMaskDetections,\r
+                                         NCVVector<NcvRect32u> &hypotheses,\r
+                                         Ncv32u &totalDetections,\r
+                                         Ncv32u totalMaxDetections,\r
+                                         Ncv32u rectWidth,\r
+                                         Ncv32u rectHeight,\r
+                                         Ncv32f curScale,\r
+                                         cudaStream_t cuStream);\r
+\r
+\r
+NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,\r
+                                       Ncv32u numPixelMaskDetections,\r
+                                       NCVVector<NcvRect32u> &hypotheses,\r
+                                       Ncv32u &totalDetections,\r
+                                       Ncv32u totalMaxDetections,\r
+                                       Ncv32u rectWidth,\r
+                                       Ncv32u rectHeight,\r
+                                       Ncv32f curScale);\r
+\r
+\r
+NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,\r
+                                   Ncv32u &numHypotheses,\r
+                                   Ncv32u minNeighbors,\r
+                                   Ncv32f intersectEps,\r
+                                   NCVVector<Ncv32u> *hypothesesWeights);\r
+\r
+\r
+NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,\r
+                                   Ncv32u &numNodes, Ncv32u &numFeatures);\r
+\r
+\r
+NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,\r
+                                   HaarClassifierCascadeDescriptor &haar,\r
+                                   NCVVector<HaarStage64> &h_HaarStages,\r
+                                   NCVVector<HaarClassifierNode128> &h_HaarNodes,\r
+                                   NCVVector<HaarFeature64> &h_HaarFeatures);\r
+\r
+\r
+NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,\r
+                                 HaarClassifierCascadeDescriptor haar,\r
+                                 NCVVector<HaarStage64> &h_HaarStages,\r
+                                 NCVVector<HaarClassifierNode128> &h_HaarNodes,\r
+                                 NCVVector<HaarFeature64> &h_HaarFeatures);\r
+\r
+\r
+\r
+#endif // _ncvhaarobjectdetection_hpp_\r
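
Tying the host-side entry points together: query the cascade dimensions, allocate pinned host vectors, load the classifier, and optionally cache it as NVBIN for faster subsequent startups. A sketch only, assuming the NCVMemNativeAllocator and NCVVectorAlloc helpers declared in NCV.hpp within this patch:

    #include "NCV.hpp"
    #include "NCVHaarObjectDetection.hpp"

    NCVStatus loadCascade(const std::string &filename)
    {
        Ncv32u numStages = 0, numNodes = 0, numFeatures = 0;
        NCVStatus ncvStat = ncvHaarGetClassifierSize(filename, numStages, numNodes, numFeatures);
        ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);

        // pinned host memory, as ncvHaarStoreNVBIN_host requires
        NCVMemNativeAllocator cpuAllocator(NCVMemoryTypeHostPinned);
        NCVVectorAlloc<HaarStage64>           h_stages(cpuAllocator, numStages);
        NCVVectorAlloc<HaarClassifierNode128> h_nodes(cpuAllocator, numNodes);
        NCVVectorAlloc<HaarFeature64>         h_features(cpuAllocator, numFeatures);

        HaarClassifierCascadeDescriptor haar;
        ncvStat = ncvHaarLoadFromFile_host(filename, haar, h_stages, h_nodes, h_features);
        ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);

        // optional: store a binary copy so later runs can skip the XML parse
        return ncvHaarStoreNVBIN_host(filename + ".nvbin", haar, h_stages, h_nodes, h_features);
    }
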
diff --git a/modules/gpu/src/nvidia/NCVRuntimeTemplates.hpp b/modules/gpu/src/nvidia/NCVRuntimeTemplates.hpp
new file mode 100644 (file)
index 0000000..14d16bb
--- /dev/null
@@ -0,0 +1,174 @@
+////////////////////////////////////////////////////////////////////////////////\r
+// The Loki Library\r
+// Copyright (c) 2001 by Andrei Alexandrescu\r
+// This code accompanies the book:\r
+// Alexandrescu, Andrei. "Modern C++ Design: Generic Programming and Design \r
+//     Patterns Applied". Copyright (c) 2001. Addison-Wesley.\r
+// Permission to use, copy, modify, distribute and sell this software for any \r
+//     purpose is hereby granted without fee, provided that the above copyright \r
+//     notice appear in all copies and that both that copyright notice and this \r
+//     permission notice appear in supporting documentation.\r
+// The author or Addison-Wesley Longman make no representations about the \r
+//     suitability of this software for any purpose. It is provided "as is" \r
+//     without express or implied warranty.\r
+// http://loki-lib.sourceforge.net/index.php?n=Main.License\r
+////////////////////////////////////////////////////////////////////////////////\r
+\r
+#ifndef _ncvruntimetemplates_hpp_\r
+#define _ncvruntimetemplates_hpp_\r
+\r
+#include <stdarg.h>\r
+#include <vector>\r
+\r
+\r
+namespace Loki\r
+{\r
+    //==============================================================================\r
+    // class NullType\r
+    // Used as a placeholder for "no type here"\r
+    // Useful as an end marker in typelists \r
+    //==============================================================================\r
+\r
+    class NullType {};\r
+\r
+    //==============================================================================\r
+    // class template Typelist\r
+    // The building block of typelists of any length\r
+    // Use it through the LOKI_TYPELIST_NN macros\r
+    // Defines nested types:\r
+    //     Head (first element, a non-typelist type by convention)\r
+    //     Tail (second element, can be another typelist)\r
+    //==============================================================================\r
+\r
+    template <class T, class U>\r
+    struct Typelist\r
+    {\r
+        typedef T Head;\r
+        typedef U Tail;\r
+    };\r
+\r
+    //==============================================================================\r
+    // class template Int2Type\r
+    // Converts each integral constant into a unique type\r
+    // Invocation: Int2Type<v> where v is a compile-time constant integral\r
+    // Defines 'value', an enum that evaluates to v\r
+    //==============================================================================\r
+\r
+    template <int v>\r
+    struct Int2Type\r
+    {\r
+        enum { value = v };\r
+    };\r
+\r
+    namespace TL\r
+    {\r
+        //==============================================================================\r
+        // class template TypeAt\r
+        // Finds the type at a given index in a typelist\r
+        // Invocation (TList is a typelist and index is a compile-time integral \r
+        //     constant):\r
+        // TypeAt<TList, index>::Result\r
+        // returns the type in position 'index' in TList\r
+        // If you pass an out-of-bounds index, the result is a compile-time error\r
+        //==============================================================================\r
+\r
+        template <class TList, unsigned int index> struct TypeAt;\r
+\r
+        template <class Head, class Tail>\r
+        struct TypeAt<Typelist<Head, Tail>, 0>\r
+        {\r
+            typedef Head Result;\r
+        };\r
+\r
+        template <class Head, class Tail, unsigned int i>\r
+        struct TypeAt<Typelist<Head, Tail>, i>\r
+        {\r
+            typedef typename TypeAt<Tail, i - 1>::Result Result;\r
+        };\r
+    }\r
+}\r
+\r
+\r
+////////////////////////////////////////////////////////////////////////////////\r
+// Runtime boolean template instance dispatcher\r
+// Cyril Crassin <cyril.crassin@icare3d.org>\r
+// NVIDIA, 2010\r
+////////////////////////////////////////////////////////////////////////////////\r
+\r
+namespace NCVRuntimeTemplateBool\r
+{\r
+    //This struct is used to transform a list of parameters into template arguments\r
+    //The idea is to build a typelist containing the arguments\r
+    //and to pass this typelist to a user defined functor\r
+    template<typename TList, int NumArguments, class Func>\r
+    struct KernelCaller\r
+    {\r
+        //Convenience function used by the user\r
+        //Takes a variable argument list, transforms it into a list\r
+        static void call(Func &functor, int dummy, ...)\r
+        {\r
+            //Vector used to collect arguments\r
+            std::vector<int> templateParamList;\r
+\r
+            //Variable argument list manipulation\r
+            va_list listPointer;\r
+            va_start(listPointer, dummy);\r
+            //Collect parameters into the list\r
+            for(int i=0; i<NumArguments; i++)\r
+            {\r
+                int val = va_arg(listPointer, int);\r
+                templateParamList.push_back(val);\r
+            }\r
+            va_end(listPointer);\r
+\r
+            //Call the actual typelist building function\r
+            call(functor, templateParamList);\r
+        }\r
+\r
+        //Actual function called recursively to build a typelist based\r
+        //on a list of values\r
+        static void call( Func &functor, std::vector<int> &templateParamList)\r
+        {\r
+            //Get current parameter value in the list\r
+            int val = templateParamList[templateParamList.size() - 1];\r
+            templateParamList.pop_back();\r
+\r
+            //Select the compile time value to add into the typelist\r
+            //depending on the runtime variable and make recursive call. \r
+            //Both versions are really instantiated\r
+            if(val)\r
+            {\r
+                KernelCaller<\r
+                    Loki::Typelist<Loki::Int2Type<true>, TList >,\r
+                    NumArguments-1, Func >\r
+                    ::call(functor, templateParamList);\r
+            }\r
+            else\r
+            {\r
+                KernelCaller< \r
+                    Loki::Typelist<Loki::Int2Type<false>, TList >,\r
+                    NumArguments-1, Func >\r
+                    ::call(functor, templateParamList);\r
+            }\r
+        }\r
+    };\r
+\r
+    //Specialization for 0 value left in the list\r
+    //-> actual kernel functor call\r
+    template<class TList, class Func>\r
+    struct KernelCaller<TList, 0, Func>\r
+    {\r
+        static void call(Func &functor)\r
+        {\r
+            //Call to the functor's kernel call method\r
+            functor.call(TList()); //TList instantiated to get the method template parameter resolved\r
+        }\r
+\r
+        static void call(Func &functor, std::vector<int> &templateParams)\r
+        {\r
+            functor.call(TList());\r
+        }\r
+    };\r
+}\r
+\r
+#endif //_ncvruntimetemplates_hpp_\r
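
The dispatcher above converts N runtime booleans into N compile-time template parameters: KernelCaller pops the flags one by one, prepends the matching Int2Type to the typelist, and finally hands the completed list to the user functor, which reads the flags back with TypeAt. A minimal usage sketch (the functor and flag names are invented for illustration):

    #include <cstdio>
    #include "NCVRuntimeTemplates.hpp"

    struct printFlagsFunctor
    {
        // receives the assembled typelist; element i is Int2Type<flag_i>
        template <class TList>
        void call(TList)
        {
            bool flag0 = Loki::TL::TypeAt<TList, 0>::Result::value != 0;
            bool flag1 = Loki::TL::TypeAt<TList, 1>::Result::value != 0;
            // a real functor would launch a kernel templated on these compile-time constants
            printf("instantiated with <%d, %d>\n", (int)flag0, (int)flag1);
        }
    };

    void dispatchExample(bool bFirstFlag, bool bSecondFlag)
    {
        printFlagsFunctor functor;
        // all 2^2 instantiations are compiled; the matching one is selected at runtime
        NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, printFlagsFunctor>
            ::call(functor, /*dummy*/ 0, bFirstFlag ? 1 : 0, bSecondFlag ? 1 : 0);
    }
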
index eb906e3..a7ba6ff 100644 (file)
@@ -71,6 +71,9 @@
     #include "npp_staging.h"\r
     #include "surf_key_point.h"\r
 \r
+       #include "nvidia/NCV.hpp"\r
+       #include "nvidia/NCVHaarObjectDetection.hpp"\r
+\r
 #define CUDART_MINIMUM_REQUIRED_VERSION 3020\r
 #define NPP_MINIMUM_REQUIRED_VERSION 3216\r
 \r
diff --git a/samples/gpu/cascadeclassifier.cpp b/samples/gpu/cascadeclassifier.cpp
new file mode 100644 (file)
index 0000000..7f0a0e7
--- /dev/null
@@ -0,0 +1,193 @@
+// WARNING: this sample is under construction! Use it at your own risk.\r
+\r
+#include <opencv2/contrib/contrib.hpp>\r
+#include <opencv2/objdetect/objdetect.hpp>\r
+#include <opencv2/highgui/highgui.hpp>\r
+#include <opencv2/imgproc/imgproc.hpp>\r
+#include <opencv2/gpu/gpu.hpp>\r
+\r
+#include <iostream>\r
+#include <iomanip>\r
+#include <stdio.h>\r
+\r
+using namespace std;\r
+using namespace cv;\r
+using namespace cv::gpu;\r
+\r
+void help()\r
+{\r
+    cout << "Usage: ./cascadeclassifier <cascade_file> <image_or_video_or_cameraid>\n"               \r
+            "Using OpenCV version " << CV_VERSION << endl << endl;\r
+}\r
+\r
+void DetectAndDraw(Mat& img, CascadeClassifier_GPU& cascade);\r
+\r
+String cascadeName = "../../data/haarcascades/haarcascade_frontalface_alt.xml";\r
+String nestedCascadeName = "../../data/haarcascades/haarcascade_eye_tree_eyeglasses.xml";\r
+\r
+\r
+\r
+template<class T> void convertAndResize(const T& src, T& gray, T& resized, double scale = 2.0)\r
+{\r
+    if (src.channels() == 3)\r
+        cvtColor( src, gray, CV_BGR2GRAY );\r
+    else\r
+        gray = src;\r
+\r
+    Size sz(cvRound(gray.cols * scale), cvRound(gray.rows * scale));\r
+    if (scale != 1)\r
+        resize(gray, resized, sz);\r
+    else\r
+        resized = gray;\r
+}\r
+\r
+\r
+\r
+int main( int argc, const char** argv )\r
+{        \r
+    if (argc != 3)\r
+        return help(), -1;\r
+\r
+    if (cv::gpu::getCudaEnabledDeviceCount() == 0)\r
+        return cerr << "No GPU found or the library is compiled without GPU support" << endl, -1;\r
+\r
+    VideoCapture capture;\r
+     \r
+    string cascadeName = argv[1];\r
+    string inputName = argv[2];\r
+\r
+    cv::gpu::CascadeClassifier_GPU cascade_gpu;\r
+    if( !cascade_gpu.load( cascadeName ) )\r
+        return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1;\r
+\r
+    cv::CascadeClassifier cascade_cpu;\r
+    if( !cascade_cpu.load( cascadeName ) )\r
+        return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1;\r
+    \r
+    Mat image = imread( inputName);\r
+    if( image.empty() )\r
+        if (!capture.open(inputName))\r
+        {\r
+            int camid = 0;\r
+            sscanf(inputName.c_str(), "%d", &camid);\r
+            if(!capture.open(camid))\r
+                cout << "Can't open source" << endl;\r
+        }\r
+    \r
+    namedWindow( "result", 1 );        \r
+    Size fontSz = cv::getTextSize("T[]", FONT_HERSHEY_SIMPLEX, 1.0, 2, 0);\r
+\r
+    Mat frame, frame_cpu, gray_cpu, resized_cpu, faces_downloaded, frameDisp;\r
+    vector<Rect> facesBuf_cpu;\r
+\r
+    GpuMat frame_gpu, gray_gpu, resized_gpu, facesBuf_gpu;    \r
+    \r
+    /* parameters */\r
+    bool useGPU = true;\r
+    double scale_factor = 2;\r
+\r
+    bool visualizeInPlace = false;   \r
+    bool findLargestObject = false;    \r
+\r
+    printf("\t<space> - toggle GPU/CPU\n");\r
+    printf("\tL       - toggle lagest faces\n");\r
+    printf("\tV       - toggle visualisation in-place (for GPU only)\n");\r
+    printf("\t1/q     - inc/dec scale\n");\r
+        \r
+    int detections_num;\r
+    for(;;)\r
+    {               \r
+        if( capture.isOpened() )\r
+        {\r
+            capture >> frame;                            \r
+            if( frame.empty())\r
+                break;\r
+        }\r
+\r
+        (image.empty() ? frame : image).copyTo(frame_cpu);\r
+        frame_gpu.upload( image.empty() ? frame : image);\r
+        \r
+        convertAndResize(frame_gpu, gray_gpu, resized_gpu, scale_factor);\r
+        convertAndResize(frame_cpu, gray_cpu, resized_cpu, scale_factor);\r
+\r
+        cv::TickMeter tm;\r
+        tm.start();      \r
+\r
+        if (useGPU)\r
+        {\r
+            cascade_gpu.visualizeInPlace = visualizeInPlace;   \r
+            cascade_gpu.findLargestObject = findLargestObject;    \r
+\r
+            detections_num = cascade_gpu.detectMultiScale( resized_gpu, facesBuf_gpu ); \r
+            facesBuf_gpu.colRange(0, detections_num).download(faces_downloaded);\r
+        \r
+        }\r
+        else /* so use CPU */\r
+        {   \r
+            Size minSize = cascade_gpu.getClassifierSize();\r
+            if (findLargestObject)\r
+            {                \r
+                float ratio = (float)std::min(frame.cols / minSize.width, frame.rows / minSize.height);\r
+                ratio = std::max(ratio / 2.5f, 1.f);\r
+                minSize = Size(cvRound(minSize.width * ratio), cvRound(minSize.height * ratio));                \r
+            }\r
+            \r
+            cascade_cpu.detectMultiScale(resized_cpu, facesBuf_cpu, 1.2, 4, (findLargestObject ? CV_HAAR_FIND_BIGGEST_OBJECT : 0) | CV_HAAR_SCALE_IMAGE, minSize);                            \r
+            detections_num = (int)facesBuf_cpu.size();\r
+        }\r
+\r
+        tm.stop();\r
+        printf( "detection time = %g ms\n", tm.getTimeMilli() );\r
+\r
+        if (useGPU)\r
+            resized_gpu.download(resized_cpu);\r
+\r
+        if (!visualizeInPlace || !useGPU)\r
+            if (detections_num)\r
+            {\r
+                Rect* faces = useGPU ? faces_downloaded.ptr<Rect>() : &facesBuf_cpu[0];                \r
+                for(int i = 0; i < detections_num; ++i)                \r
+                    cv::rectangle(resized_cpu, faces[i], Scalar(255));            \r
+            }\r
+        \r
+        Point text_pos(5, 25);\r
+        int offs = fontSz.height + 5;\r
+        Scalar color = CV_RGB(255, 0, 0);\r
+\r
+\r
+        cv::cvtColor(resized_cpu, frameDisp, CV_GRAY2BGR);\r
+\r
+        char buf[4096];\r
+        sprintf(buf, "%s, FPS = %0.3g", useGPU ? "GPU" : "CPU", 1.0/tm.getTimeSec());                       \r
+        putText(frameDisp, buf, text_pos, FONT_HERSHEY_SIMPLEX, 1.0, color, 2);\r
+        sprintf(buf, "scale = %0.3g, [%d*scale x %d*scale]", scale_factor, frame.cols, frame.rows);                       \r
+        putText(frameDisp, buf, text_pos+=Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2);\r
+        putText(frameDisp, "Hotkeys: space, 1, Q, L, V, Esc", text_pos+=Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2);\r
+\r
+        if (findLargestObject)\r
+            putText(frameDisp, "FindLargestObject", text_pos+=Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2);\r
+\r
+        if (visualizeInPlace && useGPU)\r
+            putText(frameDisp, "VisualizeInPlace", text_pos+Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2);\r
+\r
+        cv::imshow( "result", frameDisp);\r
+\r
+        int key = waitKey( 5 );\r
+        if( key == 27)\r
+            break;\r
+\r
+        switch (key)\r
+        {\r
+        case (int)' ':  useGPU = !useGPU;  printf("Using %s\n", useGPU ? "GPU" : "CPU");break;\r
+        case (int)'v':  case (int)'V': visualizeInPlace = !visualizeInPlace; printf("VisualizeInPlace = %d\n", visualizeInPlace); break;\r
+        case (int)'l':  case (int)'L': findLargestObject = !findLargestObject;  printf("FindLargestObject = %d\n", findLargestObject); break;\r
+        case (int)'1':  scale_factor*=1.05; printf("Scale factor = %g\n", scale_factor); break;\r
+        case (int)'q':  case (int)'Q':scale_factor/=1.05; printf("Scale factor = %g\n", scale_factor); break;\r
+        }\r
+       \r
+    }    \r
+    return 0;\r
+}\r
+\r
+\r
+\r