From 965109228dbc526bb41e5446b8b110379c96e3c7 Mon Sep 17 00:00:00 2001 From: Marina Kolpakova Date: Thu, 12 Jul 2012 08:50:36 +0000 Subject: [PATCH] added delobal memory version --- modules/gpu/src/cascadeclassifier.cpp | 70 ++++++++++++----------- modules/gpu/src/cuda/lbp.cu | 104 +++++++++++++++++++++++++++++----- 2 files changed, 127 insertions(+), 47 deletions(-) diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index f8e585b..f58a2e3 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -298,37 +298,39 @@ namespace cv { namespace gpu { namespace device { namespace lbp { - void classifyStump(const DevMem2Db& mstages, - const int nstages, - const DevMem2Di& mnodes, - const DevMem2Df& mleaves, - const DevMem2Di& msubsets, - const DevMem2Db& mfeatures, - const int workWidth, - const int workHeight, - const int clWidth, - const int clHeight, - float scale, - int step, - int subsetSize, - DevMem2D_ objects, - unsigned int* classified); - - void classifyStumpFixed(const DevMem2Db& mstages, - const int nstages, - const DevMem2Di& mnodes, - const DevMem2Df& mleaves, - const DevMem2Di& msubsets, - const DevMem2Db& mfeatures, - const int workWidth, - const int workHeight, - const int clWidth, - const int clHeight, - float scale, - int step, - int subsetSize, - DevMem2D_ objects, - unsigned int* classified); + // void classifyStump(const DevMem2Db& mstages, + // const int nstages, + // const DevMem2Di& mnodes, + // const DevMem2Df& mleaves, + // const DevMem2Di& msubsets, + // const DevMem2Db& mfeatures, + // const int workWidth, + // const int workHeight, + // const int clWidth, + // const int clHeight, + // float scale, + // int step, + // int subsetSize, + // DevMem2D_ objects, + // unsigned int* classified); + + void classifyStumpFixed(const DevMem2Di& integral, + const int integralPitch, + const DevMem2Db& mstages, + const int nstages, + const DevMem2Di& mnodes, + const DevMem2Df& mleaves, + const DevMem2Di& msubsets, + const DevMem2Db& mfeatures, + const int workWidth, + const int workHeight, + const int clWidth, + const int clHeight, + float scale, + int step, + int subsetSize, + DevMem2D_ objects, + unsigned int* classified); int connectedConmonents(DevMem2D_ candidates, int ncandidates, DevMem2D_ objects,int groupThreshold, float grouping_eps, unsigned int* nclasses); void bindIntegral(DevMem2Di integral); @@ -365,7 +367,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp cudaMalloc(&dclassified, sizeof(int)); cudaMemcpy(dclassified, classified, sizeof(int), cudaMemcpyHostToDevice); int step = 2; - cv::gpu::device::lbp::bindIntegral(integral); + // cv::gpu::device::lbp::bindIntegral(integral); cv::Size scaledImageSize(image.cols, image.rows); cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 ); @@ -393,7 +395,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp step = (factor <= 2.) + 1; - cv::gpu::device::lbp::classifyStumpFixed(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat, + cv::gpu::device::lbp::classifyStumpFixed(integral, integral.step1(), stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified); factor *= scaleFactor; @@ -402,7 +404,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 ); } - cv::gpu::device::lbp::unbindIntegral(); + // cv::gpu::device::lbp::unbindIntegral(); if (groupThreshold <= 0 || objects.empty()) return 0; cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost); diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu index b8a6df8..030cde3 100644 --- a/modules/gpu/src/cuda/lbp.cu +++ b/modules/gpu/src/cuda/lbp.cu @@ -56,6 +56,80 @@ namespace cv { namespace gpu { namespace device __host__ __device__ __forceinline__ LBP(const LBP& other) {(void)other;} __host__ __device__ __forceinline__ LBP() {} + // for integral matrix stored in the global memory + __device__ __forceinline__ int operator() (const int* integral, const int pitch, int ty, int tx, int fh, int fw, int& shift) const + { + int anchors[9]; + + anchors[0] = integral[ty * pitch + tx]; + anchors[1] = integral[ty * pitch + tx + fw]; + anchors[0] -= anchors[1]; + anchors[2] = integral[ty * pitch + tx + fw * 2]; + anchors[1] -= anchors[2]; + anchors[2] -= integral[ty * pitch + tx + fw * 3]; + + ty += fh; + anchors[3] = integral[ty * pitch + tx]; + anchors[4] = integral[ty * pitch + tx + fw]; + anchors[3] -= anchors[4]; + anchors[5] = integral[ty * pitch + tx + fw * 2]; + anchors[4] -= anchors[5]; + anchors[5] -= integral[ty * pitch + tx + fw * 3]; + + anchors[0] -= anchors[3]; + anchors[1] -= anchors[4]; + anchors[2] -= anchors[5]; + // 0 - 2 contains s0 - s2 + + ty += fh; + anchors[6] = integral[ty * pitch + tx]; + anchors[7] = integral[ty * pitch + tx + fw]; + anchors[6] -= anchors[7]; + anchors[8] = integral[ty * pitch + tx + fw * 2]; + anchors[7] -= anchors[8]; + anchors[8] -= integral[ty * pitch + tx + fw * 3]; + + anchors[3] -= anchors[6]; + anchors[4] -= anchors[7]; + anchors[5] -= anchors[8]; + // 3 - 5 contains s3 - s5 + + anchors[0] -= anchors[4]; + anchors[1] -= anchors[4]; + anchors[2] -= anchors[4]; + anchors[3] -= anchors[4]; + anchors[5] -= anchors[4]; + + int response = (~(anchors[0] >> 31)) & 4; + response |= (~(anchors[1] >> 31)) & 2;; + response |= (~(anchors[2] >> 31)) & 1; + + shift = (~(anchors[5] >> 31)) & 16; + shift |= (~(anchors[3] >> 31)) & 1; + + ty += fh; + anchors[0] = integral[ty * pitch + tx]; + anchors[1] = integral[ty * pitch + tx + fw]; + anchors[0] -= anchors[1]; + anchors[2] = integral[ty * pitch + tx + fw * 2]; + anchors[1] -= anchors[2]; + anchors[2] -= integral[ty * pitch + tx + fw * 3]; + + anchors[6] -= anchors[0]; + anchors[7] -= anchors[1]; + anchors[8] -= anchors[2]; + // 0 -2 contains s6 - s8 + + anchors[6] -= anchors[4]; + anchors[7] -= anchors[4]; + anchors[8] -= anchors[4]; + + shift |= (~(anchors[6] >> 31)) & 2; + shift |= (~(anchors[7] >> 31)) & 4; + shift |= (~(anchors[8] >> 31)) & 8; + return response; + } + // for texture fetchrd integral matrix __device__ __forceinline__ int operator() (int ty, int tx, int fh, int fw, int& shift) const { int anchors[9]; @@ -143,9 +217,9 @@ namespace cv { namespace gpu { namespace device struct Classifier { - __host__ __device__ __forceinline__ Classifier(const Stage* _stages, const ClNode* _nodes, const float* _leaves, const int* _subsets, const uchar4* _features, + __host__ __device__ __forceinline__ Classifier(const int* _integral, const int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves, const int* _subsets, const uchar4* _features, const int _nstages, const int _clWidth, const int _clHeight, const float _scale, const int _step, const int _subsetSize) - : stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), clWidth(_clWidth), clHeight(_clHeight), + : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), clWidth(_clWidth), clHeight(_clHeight), scale(_scale), step(_step), subsetSize(_subsetSize){} __device__ __forceinline__ void operator() (int y, int x, DevMem2D_ objects, const unsigned int maxN, unsigned int* n) const @@ -163,7 +237,8 @@ namespace cv { namespace gpu { namespace device uchar4 feature = features[node.featureIdx]; int shift; - int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift); + // int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift); + int c = evaluator(integral, pitch, (y + feature.y), x + feature.x, feature.w, feature.z, shift); int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1; sum += leaves[idx]; @@ -189,6 +264,9 @@ namespace cv { namespace gpu { namespace device objects(0, res) = rect; } + const int* integral; + const int pitch; + const Stage* stages; const ClNode* nodes; const float* leaves; @@ -292,24 +370,24 @@ namespace cv { namespace gpu { namespace device } } - void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures, - const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_ objects, unsigned int* classified) - { - int blocks = ceilf(workHeight / (float)step); - int threads = ceilf(workWidth / (float)step); + // void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures, + // const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_ objects, unsigned int* classified) + // { + // int blocks = ceilf(workHeight / (float)step); + // int threads = ceilf(workWidth / (float)step); - Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize); - lbp_classify_stump<<>>(clr, objects, objects.cols, classified); - } + // Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize); + // lbp_classify_stump<<>>(clr, objects, objects.cols, classified); + // } - void classifyStumpFixed(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures, + void classifyStumpFixed(const DevMem2Di& integral, const int pitch, const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures, const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_ objects, unsigned int* classified) { const int THREADS_BLOCK = 256; int work_amount = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step); int blocks = divUp(work_amount, THREADS_BLOCK); - Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize); + Classifier clr(integral.ptr(), pitch, (Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize); lbp_classify_stump<<>>(clr, objects, objects.cols, classified, workWidth >> 1); } -- 2.7.4