DevMem2D_<int4> objects,
unsigned int* classified);

- int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses);
+ void classifyStumpFixed(const DevMem2Db& mstages,
+ const int nstages,
+ const DevMem2Di& mnodes,
+ const DevMem2Df& mleaves,
+ const DevMem2Di& msubsets,
+ const DevMem2Db& mfeatures,
+ const int workWidth,
+ const int workHeight,
+ const int clWidth,
+ const int clHeight,
+ float scale,
+ int step,
+ int subsetSize,
+ DevMem2D_<int4> objects,
+ unsigned int* classified,
+ const int maxX);
+
+ int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses);
void bindIntegral(DevMem2Di integral);
void unbindIntegral();
}
GpuMat candidates(1, image.cols >> 1, CV_32SC4);
// GpuMat candidates(1, defaultObjSearchNum, CV_32SC4);
// used for debug
- // candidates.setTo(cv::Scalar::all(0));
- // objects.setTo(cv::Scalar::all(0));
+ candidates.setTo(cv::Scalar::all(0));
+ objects.setTo(cv::Scalar::all(0));
if (maxObjectSize == cv::Size())
maxObjectSize = image.size();

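// device-side counter of accepted windows; the detection kernels advance it with atomicInc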
unsigned int* dclassified;
cudaMalloc(&dclassified, sizeof(int));
cudaMemcpy(dclassified, classified, sizeof(int), cudaMemcpyHostToDevice);
- int step;
+ int step = 2;
cv::gpu::device::lbp::bindIntegral(integral);

- for( double factor = 1; ; factor *= scaleFactor )
+ cv::Size scaledImageSize(image.cols, image.rows);
+ cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
+ cv::Size windowSize(NxM.width, NxM.height);
+
+ double factor = 1;
+
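+ // while a row of scan positions needs at least 256 threads, run the fixed-block kernel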
+ for (; processingRectSize.width / step >= 256;)
{
+ // std::cout << "IN FIXED: factor " << factor << " size " << processingRectSize.width << " " << processingRectSize.height << std::endl;
// if (factor > 2.0) break;
- cv::Size windowSize(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
- cv::Size scaledImageSize(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
- cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
+ if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
+ break;
+
+ if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height )
+ break;
+
+ // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
+ // continue;
+
+ GpuMat scaledImg(resuzeBuffer, cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
+ GpuMat scaledIntegral(integral, cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));
+ GpuMat currBuff = integralBuffer;
+
+ cv::gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);
+ cv::gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);
+
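+ // shift the search window by 2 pixels at fine scales (factor <= 2) and by 1 pixel afterwards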
+ step = (factor <= 2.) + 1;
+
+ cv::gpu::device::lbp::classifyStumpFixed(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
+ processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified, processingRectSize.width);

+ factor *= scaleFactor;
+ windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
+ scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
+ processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
+ }
+
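+ // remaining scales need fewer than 256 threads per row; fall back to the original one-block-per-row kernel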
+ for (; /*processingRectSize.width / step >= 128*/;)
+ {
+ // std::cout << "In FLOATING: factor " << factor << " size " << processingRectSize.width << " " << processingRectSize.height << std::endl;
+ // if (factor > 2.0) break;
if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
break;


cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);
+
+ factor *= scaleFactor;
+ windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
+ scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
+ processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
}

cv::gpu::device::lbp::unbindIntegral();
if (groupThreshold <= 0 || objects.empty())
return 0;
- cv::gpu::device::lbp::connectedConmonents(candidates, objects, groupThreshold, grouping_eps, dclassified);
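+ // read back the candidate count first: disjoin is launched with one thread per candidate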
+ cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
+ // std::cout << "!!! CLASSIFIED " << *classified << std::endl;
+ cv::gpu::device::lbp::connectedConmonents(candidates, *classified, objects, groupThreshold, grouping_eps, dclassified);
cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
cudaSafeCall( cudaDeviceSynchronize() );
step = *classified;
struct LBP
{
- __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
- __device__ __forceinline__ LBP() {}
+ __host__ __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
+ __host__ __device__ __forceinline__ LBP() {}
- //feature as uchar x, y - left top, z,w - right bottom
- __device__ __forceinline__ int operator() (int ty, int tx, int fh, int featurez, int& shift) const
+ __device__ __forceinline__ int operator() (int ty, int tx, int fh, int fw, int& shift) const
{
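// sample a 4x4 lattice of integral-image corners around (tx, ty); running
// differences reduce it to the 3x3 grid of cell sums behind the LBP code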
int anchors[9];
anchors[0] = tex2D(tintegral, tx, ty);
- anchors[1] = tex2D(tintegral, tx + featurez, ty);
+ anchors[1] = tex2D(tintegral, tx + fw, ty);
anchors[0] -= anchors[1];
- anchors[2] = tex2D(tintegral, tx + featurez * 2, ty);
+ anchors[2] = tex2D(tintegral, tx + fw * 2, ty);
anchors[1] -= anchors[2];
- anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
+ anchors[2] -= tex2D(tintegral, tx + fw * 3, ty);
ty += fh;
anchors[3] = tex2D(tintegral, tx, ty);
- anchors[4] = tex2D(tintegral, tx + featurez, ty);
+ anchors[4] = tex2D(tintegral, tx + fw, ty);
anchors[3] -= anchors[4];
- anchors[5] = tex2D(tintegral, tx + featurez * 2, ty);
+ anchors[5] = tex2D(tintegral, tx + fw * 2, ty);
anchors[4] -= anchors[5];
- anchors[5] -= tex2D(tintegral, tx + featurez * 3, ty);
+ anchors[5] -= tex2D(tintegral, tx + fw * 3, ty);
anchors[0] -= anchors[3];
anchors[1] -= anchors[4];
ty += fh;
anchors[6] = tex2D(tintegral, tx, ty);
- anchors[7] = tex2D(tintegral, tx + featurez, ty);
+ anchors[7] = tex2D(tintegral, tx + fw, ty);
anchors[6] -= anchors[7];
- anchors[8] = tex2D(tintegral, tx + featurez * 2, ty);
+ anchors[8] = tex2D(tintegral, tx + fw * 2, ty);
anchors[7] -= anchors[8];
- anchors[8] -= tex2D(tintegral, tx + featurez * 3, ty);
+ anchors[8] -= tex2D(tintegral, tx + fw * 3, ty);
anchors[3] -= anchors[6];
anchors[4] -= anchors[7];
ty += fh;
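// the bottom row of samples reuses anchors[0..2] as scratch registers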
anchors[0] = tex2D(tintegral, tx, ty);
- anchors[1] = tex2D(tintegral, tx + featurez, ty);
+ anchors[1] = tex2D(tintegral, tx + fw, ty);
anchors[0] -= anchors[1];
- anchors[2] = tex2D(tintegral, tx + featurez * 2, ty);
+ anchors[2] = tex2D(tintegral, tx + fw * 2, ty);
anchors[1] -= anchors[2];
- anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
+ anchors[2] -= tex2D(tintegral, tx + fw * 3, ty);
anchors[6] -= anchors[0];
anchors[7] -= anchors[1];
cudaSafeCall( cudaUnbindTexture(&tintegral));
}
- __global__ void lbp_classify_stump(const Stage* stages, const int nstages, const ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features,
- /* const int* integral,const int istep, const int workWidth,const int workHeight,*/ const int clWidth, const int clHeight, const float scale, const int step,
- const int subsetSize, DevMem2D_<int4> objects, unsigned int* n)
+ struct Classifier
{
- int x = threadIdx.x * step;
- int y = blockIdx.x * step;
+ __host__ __device__ __forceinline__ Classifier(const Stage* _stages, const ClNode* _nodes, const float* _leaves, const int* _subsets, const uchar4* _features,
+ const int _nstages, const int _clWidth, const int _clHeight, const float _scale, const int _step, const int _subsetSize)
+ : stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), clWidth(_clWidth), clHeight(_clHeight),
+ scale(_scale), step(_step), subsetSize(_subsetSize){}
- int current_node = 0;
- int current_leave = 0;
-
- LBP evaluator;
- for (int s = 0; s < nstages; s++ )
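+ // evaluate the cascade at window origin (x, y); reject on the first stage whose boosted sum misses its threshold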
+ __device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
{
- float sum = 0;
- Stage stage = stages[s];
- for (int t = 0; t < stage.ntrees; t++)
+ int current_node = 0;
+ int current_leave = 0;
+
+ for (int s = 0; s < nstages; ++s)
{
- ClNode node = nodes[current_node];
-
- uchar4 feature = features[node.featureIdx];
- int shift;
- int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
- int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
- sum += leaves[idx];
- current_node += 1;
- current_leave += 2;
+ float sum = 0;
+ Stage stage = stages[s];
+ for (int t = 0; t < stage.ntrees; t++)
+ {
+ ClNode node = nodes[current_node];
+ uchar4 feature = features[node.featureIdx];
+
+ int shift;
+ int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
+ int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
+ sum += leaves[idx];
+
+ current_node += 1;
+ current_leave += 2;
+ }
+
+ if (sum < stage.threshold)
+ return;
}
- if (sum < stage.threshold)
- return;
- }
- int4 rect;
- rect.x = roundf(x * scale);
- rect.y = roundf(y * scale);
- rect.z = clWidth;
- rect.w = clHeight;
+ int4 rect;
+ rect.x = roundf(x * scale);
+ rect.y = roundf(y * scale);
+ rect.z = clWidth;
+ rect.w = clHeight;
+
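+ // atomically reserve the next slot in the candidate array; the counter wraps at maxN (the array width)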
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
- int res = __atomicInc(n, 100U);
+ int res = __atomicInc(n, maxN);
#else
- int res = atomicInc(n, 100U);
+ int res = atomicInc(n, maxN);
#endif
- objects(0, res) = rect;
+ objects(0, res) = rect;
+ }
+
+ const Stage* stages;
+ const ClNode* nodes;
+ const float* leaves;
+ const int* subsets;
+ const uchar4* features;
+
+ const int nstages;
+ const int clWidth;
+ const int clHeight;
+ const float scale;
+ const int step;
+ const int subsetSize;
+ const LBP evaluator;
+ };
+
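+ // one thread per scan position: the column comes from threadIdx, the row from blockIdx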
+ __global__ void lbp_classify_stump(const Classifier classifier, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n)
+ {
+ int x = threadIdx.x * classifier.step;
+ int y = blockIdx.x * classifier.step;
+
+ classifier(y, x, objects, maxN, n);
+ }
+
+ __global__ void lbp_classify_stump(const Classifier classifier, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n, int lines, int maxX)
+ {
+ // each row of scan positions is covered by `lines` consecutive blocks of blockDim.x threads
+ int x = ((blockIdx.x % lines) * blockDim.x + threadIdx.x) * classifier.step;
+ if (x >= maxX) return;
+
+ int y = (blockIdx.x / lines) * classifier.step;
+
+ classifier(y, x, objects, maxN, n);
}
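
// disjoin groups the raw candidate rectangles in shared memory, averages each
// group, and emits only groups with at least groupThreshold members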
template<typename Pr>
__global__ void disjoin(int4* candidates, int4* objects, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
{
- using cv::gpu::device::VecTraits;
unsigned int tid = threadIdx.x;
extern __shared__ int sbuff[];
int cls = labels[tid];
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
- __atomicAdd((int*)(rrects + cls * 4 + 0), candidates[tid].x);
- __atomicAdd((int*)(rrects + cls * 4 + 1), candidates[tid].y);
- __atomicAdd((int*)(rrects + cls * 4 + 2), candidates[tid].z);
- __atomicAdd((int*)(rrects + cls * 4 + 3), candidates[tid].w);
+ __atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
+ __atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
+ __atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
+ __atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
#else
- atomicAdd((int*)(rrects + cls * 4 + 0), candidates[tid].x);
- atomicAdd((int*)(rrects + cls * 4 + 1), candidates[tid].y);
- atomicAdd((int*)(rrects + cls * 4 + 2), candidates[tid].z);
- atomicAdd((int*)(rrects + cls * 4 + 3), candidates[tid].w);
+ atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
+ atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
+ atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
+ atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
#endif
+ __syncthreads();
labels[tid] = 0;
+
__syncthreads();
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
__atomicInc((unsigned int*)labels + cls, n);
#else
atomicInc((unsigned int*)labels + cls, n);
#endif
+ __syncthreads();
*nclasses = 0;
int active = labels[tid];
r1[1] = saturate_cast<int>(r1[1] * s);
r1[2] = saturate_cast<int>(r1[2] * s);
r1[3] = saturate_cast<int>(r1[3] * s);
+ }
+ __syncthreads();
- int n1 = active;
- __syncthreads();
- unsigned int j = 0;
- if( active > groupThreshold )
- {
- for (j = 0; j < n; j++)
- {
- int n2 = labels[j];
- if(!n2 || j == tid || n2 <= groupThreshold )
- continue;
-
- int* r2 = rrects + j * 4;
-
- int dx = saturate_cast<int>( r2[2] * grouping_eps );
- int dy = saturate_cast<int>( r2[3] * grouping_eps );
-
- if( tid != j && r1[0] >= r2[0] - dx && r1[1] >= r2[1] - dy &&
- r1[0] + r1[2] <= r2[0] + r2[2] + dx && r1[1] + r1[3] <= r2[1] + r2[3] + dy &&
- (n2 > max(3, n1) || n1 < 3) )
- break;
- }
- if( j == n)
- {
+ if (active && active >= groupThreshold)
+ {
+ int* r1 = rrects + tid * 4;
+ int4 r_out;
+ r_out.x = r1[0];
+ r_out.y = r1[1];
+ r_out.z = r1[2];
+ r_out.w = r1[3];
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
- objects[__atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
+ objects[__atomicInc(nclasses, n)] = r_out;
#else
- objects[atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
+ int aidx = atomicInc(nclasses, n);
+ objects[aidx] = r_out;
#endif
- }
- }
}
}
void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
- /*const DevMem2Di& integral,*/ const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
- DevMem2D_<int4> objects, unsigned int* classified)
+ const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
+ {
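+ // one block per row of scan positions, one thread per column; valid only while workWidth / step fits in a single block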
+ int blocks = ceilf(workHeight / (float)step);
+ int threads = ceilf(workWidth / (float)step);
+
+ Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
+ lbp_classify_stump<<<blocks, threads>>>(clr, objects, objects.cols, classified);
+ }
+
+ void classifyStumpFixed(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
+ const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified,
+ int maxX)
{
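+ // same mapping, but each row is split across divUp(threads, THREADS_BLOCK) blocks of 256 threads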
+ const int THREADS_BLOCK = 256;
int blocks = ceilf(workHeight / (float)step);
int threads = ceilf(workWidth / (float)step);
- Stage* stages = (Stage*)(mstages.ptr());
- ClNode* nodes = (ClNode*)(mnodes.ptr());
- const float* leaves = mleaves.ptr();
- const int* subsets = msubsets.ptr();
- const uchar4* features = (uchar4*)(mfeatures.ptr());
- lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, /*integ, istep,
- workWidth, workHeight,*/ clWidth, clHeight, scale, step, subsetSize, objects, classified);
+ Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
+ int lines = divUp(threads, THREADS_BLOCK);
+ lbp_classify_stump<<<blocks * lines, THREADS_BLOCK>>>(clr, objects, objects.cols, classified, lines, maxX);
}
- int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+ int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
{
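// single-block launch: shared memory holds one label and one accumulated rect per candidate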
- int threads = candidates.cols;
+ int threads = ncandidates;
int smem_amount = threads * sizeof(int) + threads * sizeof(int4);
- disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), candidates.cols, groupThreshold, grouping_eps, nclasses);
+ disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), ncandidates, groupThreshold, grouping_eps, nclasses);
return 0;
}
}