From 2aacff4c39538a421338080158f00871f573aafa Mon Sep 17 00:00:00 2001 From: Marina Kolpakova Date: Thu, 12 Jul 2012 07:11:26 +0000 Subject: [PATCH] switched to the fixed size thread block --- modules/gpu/src/cascadeclassifier.cpp | 40 ++--------------------------------- modules/gpu/src/cuda/lbp.cu | 21 ++++++++---------- 2 files changed, 11 insertions(+), 50 deletions(-) diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index bf13a98..f8e585b 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -97,7 +97,6 @@ void cv::gpu::CascadeClassifier_GPU_LBP::initializeBuffers(cv::Size frame) Ncv32u bufSize; ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) ); - // printf("HERE!!!!!!!%d\n", bufSize); integralBuffer.create(1, bufSize, CV_8UC1); } } @@ -329,8 +328,7 @@ namespace cv { namespace gpu { namespace device int step, int subsetSize, DevMem2D_ objects, - unsigned int* classified, - const int maxX); + unsigned int* classified); int connectedConmonents(DevMem2D_ candidates, int ncandidates, DevMem2D_ objects,int groupThreshold, float grouping_eps, unsigned int* nclasses); void bindIntegral(DevMem2Di integral); @@ -375,10 +373,8 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp double factor = 1; - for (; processingRectSize.width / step >= 256;) + for (; ;) { - // std::cout << "IN FIXED: factor " << factor << " size " << processingRectSize.width << " " << processingRectSize.height << std::endl; - // if (factor > 2.0) break; if (processingRectSize.width <= 0 || processingRectSize.height <= 0 ) break; @@ -398,37 +394,6 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp step = (factor <= 2.) 
+ 1; cv::gpu::device::lbp::classifyStumpFixed(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat, - processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified, processingRectSize.width); - - factor *= scaleFactor; - windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor)); - scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor )); - processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 ); - } - - for (; /*processingRectSize.width / step >= 128*/;) - { - // std::cout << "In FLOATING: factor " << factor << " size " << processingRectSize.width << " " << processingRectSize.height << std::endl; - // if (factor > 2.0) break; - if (processingRectSize.width <= 0 || processingRectSize.height <= 0 ) - break; - - if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height ) - break; - - // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height ) - // continue; - - GpuMat scaledImg(resuzeBuffer, cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height)); - GpuMat scaledIntegral(integral, cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1)); - GpuMat currBuff = integralBuffer; - - cv::gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR); - cv::gpu::integralBuffered(scaledImg, scaledIntegral, currBuff); - - step = (factor <= 2.) 
+ 1; - - cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified); factor *= scaleFactor; @@ -441,7 +406,6 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp if (groupThreshold <= 0 || objects.empty()) return 0; cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost); - // std::cout << "!!! CLASSIFIED " << *classified << std::endl; cv::gpu::device::lbp::connectedConmonents(candidates, *classified, objects, groupThreshold, grouping_eps, dclassified); cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost); cudaSafeCall( cudaDeviceSynchronize() ); diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu index eab41b5..b8a6df8 100644 --- a/modules/gpu/src/cuda/lbp.cu +++ b/modules/gpu/src/cuda/lbp.cu @@ -212,14 +212,13 @@ namespace cv { namespace gpu { namespace device classifier(y, x, objects, maxN, n); } - __global__ void lbp_classify_stump(const Classifier classifier, DevMem2D_ objects, const unsigned int maxN, unsigned int* n, int lines, int maxX) + __global__ void lbp_classify_stump(const Classifier classifier, DevMem2D_ objects, const unsigned int maxN, unsigned int* n, int maxX) { - int x = threadIdx.x * lines * classifier.step; - if (x >= maxX) return; + int ftid = blockIdx.x * blockDim.x + threadIdx.x; + int y = ftid / maxX; + int x = ftid - y * maxX; - int y = blockIdx.x * classifier.step / lines; - - classifier(y, x, objects, maxN, n); + classifier(y * classifier.step, x * classifier.step, objects, maxN, n); } template @@ -304,16 +303,14 @@ namespace cv { namespace gpu { namespace device } void classifyStumpFixed(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures, - const 
int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_ objects, unsigned int* classified, - int maxX) + const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_ objects, unsigned int* classified) { const int THREADS_BLOCK = 256; - int blocks = ceilf(workHeight / (float)step); - int threads = ceilf(workWidth / (float)step); + int work_amount = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step); + int blocks = divUp(work_amount, THREADS_BLOCK); Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize); - int lines = divUp(threads, THREADS_BLOCK); - lbp_classify_stump<<>>(clr, objects, objects.cols, classified, lines, maxX); + lbp_classify_stump<<>>(clr, objects, objects.cols, classified, workWidth >> 1); } int connectedConmonents(DevMem2D_ candidates, int ncandidates, DevMem2D_ objects, int groupThreshold, float grouping_eps, unsigned int* nclasses) -- 2.7.4