GpuMat resuzeBuffer;\r
\r
GpuMat candidates;\r
+ static const int integralFactor = 4;\r
};\r
\r
////////////////////////////////// SURF //////////////////////////////////////////\r
bool cv::gpu::CascadeClassifier_GPU_LBP::empty() const { throw_nogpu(); return true; }\r
bool cv::gpu::CascadeClassifier_GPU_LBP::load(const string&) { throw_nogpu(); return true; }\r
Size cv::gpu::CascadeClassifier_GPU_LBP::getClassifierSize() const { throw_nogpu(); return Size(); }\r
-void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size /*frame*/) { throw_nogpu();}\r
+void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size /*frame*/) { throw_nogpu();}\r
\r
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*objectsBuf*/,\r
double /*scaleFactor*/, int /*minNeighbors*/, cv::Size /*maxObjectSize*/){ throw_nogpu(); return 0;}\r
{\r
resuzeBuffer.create(frame, CV_8UC1);\r
\r
- integral.create(frame.height + 1, frame.width + 1, CV_32SC1);\r
+ integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);\r
NcvSize32u roiSize;\r
roiSize.width = frame.width;\r
roiSize.height = frame.height;\r
DevMem2D_<int4> objects,\r
unsigned int* classified);\r
\r
+ void classifyPyramid(int frameW,\r
+ int frameH,\r
+ int windowW,\r
+ int windowH,\r
+ float initialScale,
+ float factor,\r
+ int total,\r
+ const DevMem2Db& mstages,\r
+ const int nstages,\r
+ const DevMem2Di& mnodes,\r
+ const DevMem2Df& mleaves,\r
+ const DevMem2Di& msubsets,\r
+ const DevMem2Db& mfeatures,\r
+ const int subsetSize,\r
+ DevMem2D_<int4> objects,\r
+ unsigned int* classified,\r
+ DevMem2Di integral);\r
+\r
void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);\r
void bindIntegral(DevMem2Di integral);\r
void unbindIntegral();\r
}\r
}}}\r
\r
-int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects,\r
- double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/)\r
+cv::Size operator -(const cv::Size& a, const cv::Size& b)\r
+{\r
+ return cv::Size(a.width - b.width, a.height - b.height);\r
+}\r
+\r
+cv::Size operator +(const cv::Size& a, const int& i)\r
+{\r
+ return cv::Size(a.width + i, a.height + i);\r
+}\r
+\r
+cv::Size operator *(const cv::Size& a, const float& f)\r
+{\r
+ return cv::Size(cvRound(a.width * f), cvRound(a.height * f));\r
+}\r
+\r
+cv::Size operator /(const cv::Size& a, const float& f)\r
+{\r
+ return cv::Size(cvRound(a.width / f), cvRound(a.height / f));\r
+}\r
+\r
+bool operator <=(const cv::Size& a, const cv::Size& b)\r
+{\r
+ return a.width <= b.width && a.height <= b.height;
+}\r
+\r
+struct PyrLavel\r
+{\r
+ PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window) : order(_order)\r
+ {\r
+ scale = pow(_scale, order);\r
+ sFrame = frame / scale;\r
+ workArea = sFrame - window + 1;\r
+ sWindow = window * scale;\r
+ }\r
+\r
+ bool isFeasible(cv::Size maxObj)\r
+ {\r
+ return workArea.width > 0 && workArea.height > 0 && sWindow <= maxObj;\r
+ }\r
+\r
+ PyrLavel next(float factor, cv::Size frame, cv::Size window)\r
+ {\r
+ return PyrLavel(order + 1, factor, frame, window);\r
+ }\r
+\r
+ int order;\r
+ float scale;\r
+ cv::Size sFrame;\r
+ cv::Size workArea;\r
+ cv::Size sWindow;\r
+};\r
+\r
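For reference, a standalone host-side sketch of how successive PyrLavel instances shrink the scaled frame and work area as the order grows. Frame size, window size and scale factor are assumed example values, and the maxObjectSize feasibility check is left out:

#include <cmath>
#include <cstdio>

int main()
{
    const float factor = 1.2f;               // scaleFactor passed to detectMultiScale
    const int frameW = 640, frameH = 480;    // assumed input frame
    const int winW = 24, winH = 24;          // assumed NxM classifier window

    for (int order = 0; ; ++order)
    {
        float scale = std::pow(factor, (float)order);
        int sW = (int)std::lround(frameW / scale);            // sFrame
        int sH = (int)std::lround(frameH / scale);
        int workW = sW - winW + 1;                            // workArea
        int workH = sH - winH + 1;
        if (workW <= 0 || workH <= 0)                         // level no longer feasible
            break;

        int step = 1 + (scale <= 2.f);                        // same step rule as the host loop
        int windows = (int)std::ceil(workW / (float)step) * (int)std::ceil(workH / (float)step);
        std::printf("level %2d: scale %.3f, scaled frame %dx%d, candidate windows %d\n",
                    order, scale, sW, sH, windows);
    }
    return 0;
}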
+int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects, double scaleFactor, int groupThreshold, cv::Size maxObjectSize)\r
{\r
CV_Assert(!empty() && scaleFactor > 1 && image.depth() == CV_8U);\r
\r
// used for debug\r
// candidates.setTo(cv::Scalar::all(0));\r
// objects.setTo(cv::Scalar::all(0));\r
+\r
if (maxObjectSize == cv::Size())\r
maxObjectSize = image.size();\r
\r
GpuMat dclassified(1, 1, CV_32S);\r
cudaSafeCall( cudaMemcpy(dclassified.ptr(), &classified, sizeof(int), cudaMemcpyHostToDevice) );\r
\r
- // cv::gpu::device::lbp::bindIntegral(integral);\r
+ PyrLavel level(0, 1.0f, image.size(), NxM);\r
\r
- Size scaledImageSize(image.cols, image.rows);\r
- Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );\r
- Size windowSize(NxM.width, NxM.height);\r
-\r
- float factor = 1;\r
-\r
- for (;;)\r
+ while (level.isFeasible(maxObjectSize))\r
{\r
- if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )\r
- break;\r
+ int acc = level.sFrame.width + 1;\r
+ float iniScale = level.scale;\r
+ cv::Size area = level.workArea;\r
+ float step = (float)(1 + (level.scale <= 2.f));\r
\r
- if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height )\r
- break;\r
+ int total = 0, prev = 0;\r
\r
- // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )\r
- // continue;\r
+ while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxObjectSize))\r
+ {\r
+ // create suitable matrix headers
+ GpuMat src = resuzeBuffer(cv::Rect(0, 0, level.sFrame.width, level.sFrame.height));\r
+ GpuMat sint = integral(cv::Rect(prev, 0, level.sFrame.width + 1, level.sFrame.height + 1));\r
+ GpuMat buff = integralBuffer;\r
\r
- GpuMat scaledImg = resuzeBuffer(cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));\r
- GpuMat scaledIntegral = integral(cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));\r
- GpuMat currBuff = integralBuffer;\r
+ // generate integral for scale\r
+ gpu::resize(image, src, level.sFrame, 0, 0, CV_INTER_LINEAR);\r
+ gpu::integralBuffered(src, sint, buff);\r
\r
- gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);\r
- gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);\r
+ total += cvCeil(area.width / step) * cvCeil(area.height / step);\r
+ // std::cout << "Total for scale: " << total << " this step contribution " << cvCeil(area.width / step) * cvCeil(area.height / step) << " previous width shift " << prev << " acc " << acc << " scales: " << cvCeil(area.width / step) << std::endl;\r
\r
- int step = factor <= 2.f ? 2 : 1;\r
+ // advance to the next pyramid level
+ level = level.next(scaleFactor, image.size(), NxM);\r
+ area = level.workArea;\r
\r
- device::lbp::classifyStumpFixed(integral, integral.step1(), stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,\r
- processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified.ptr<unsigned int>());\r
+ step = (float)(1 + (level.scale <= 2.f));\r
+ prev = acc;\r
+ acc += level.sFrame.width + 1;\r
+ }\r
\r
- factor *= scaleFactor;\r
- windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));\r
- scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));\r
- processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );\r
+ device::lbp::classifyPyramid(image.cols, image.rows, NxM.width, NxM.height, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,\r
+ leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr<unsigned int>(), integral);\r
}\r
\r
- // cv::gpu::device::lbp::unbindIntegral();\r
if (groupThreshold <= 0 || objects.empty())\r
return 0;\r
\r
cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );\r
device::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr<unsigned int>());\r
\r
+ // candidates.copyTo(objects);\r
cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );\r
cudaSafeCall( cudaDeviceSynchronize() );\r
+ // std::cout << classified << " !!!!!!!!!!" << std::endl;\r
\r
return classified;\r
}\r
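A minimal host-side usage sketch, not taken from this patch: running the LBP cascade on a grayscale frame and downloading the grouped detections. The cascade file name, the size of the pre-allocated objects buffer and the drawing step are assumptions for illustration.

#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>

void detectAndDraw(cv::Mat& frame_gray)                 // expects a CV_8UC1 image
{
    cv::gpu::CascadeClassifier_GPU_LBP classifier;
    if (!classifier.load("lbpcascade_frontalface.xml")) // assumed cascade path
        return;

    cv::gpu::GpuMat d_frame(frame_gray);
    cv::gpu::GpuMat d_objects(1, 128, CV_32SC4);        // pre-allocated result buffer (assumed size)

    int ndetections = classifier.detectMultiScale(d_frame, d_objects, 1.2, 4, cv::Size());
    if (ndetections == 0)
        return;

    cv::Mat h_objects;
    d_objects.colRange(0, ndetections).download(h_objects);
    const cv::Rect* rects = h_objects.ptr<cv::Rect>();  // each int4 (x, y, w, h) maps onto a cv::Rect
    for (int i = 0; i < ndetections; ++i)
        cv::rectangle(frame_gray, rects[i], cv::Scalar(255));
}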
struct Classifier
{
- __host__ __device__ __forceinline__ Classifier(const int* _integral, int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves,
- const int* _subsets, const uchar4* _features, int _nstages, int _clWidth, int _clHeight, float _scale, int _step, int _subsetSize)
- : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages),
- clWidth(_clWidth), clHeight(_clHeight), scale(_scale), step(_step), subsetSize(_subsetSize){}
+ __host__ __device__ __forceinline__ Classifier(const int* _integral, int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves,
+ const int* _subsets, const uchar4* _features, int _nstages, int _clWidth, int _clHeight, float _scale, int _step, int _subsetSize)
+ : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages),
+ clWidth(_clWidth), clHeight(_clHeight), scale(_scale), step(_step), subsetSize(_subsetSize){}
__device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
{
rect.z = clWidth;
rect.w = clHeight;
-#if (__CUDA_ARCH__ < 120)
- int res = __atomicInc(n, maxN);
-#else
- int res = atomicInc(n, maxN);
-#endif
+ int res = Emulation::smem::atomicInc(n, maxN);
objects(0, res) = rect;
}
__syncthreads();
int cls = labels[tid];
-#if (__CUDA_ARCH__ < 120)
- __atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
- __atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
- __atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
- __atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
-#else
- atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
- atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
- atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
- atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
-#endif
+ Emulation::smem::atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
+ Emulation::smem::atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
+ Emulation::smem::atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
+ Emulation::smem::atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
+
__syncthreads();
labels[tid] = 0;
__syncthreads();
-#if (__CUDA_ARCH__ < 120)
- __atomicInc((unsigned int*)labels + cls, n);
-#else
- atomicInc((unsigned int*)labels + cls, n);
-#endif
+ Emulation::smem::atomicInc((unsigned int*)labels + cls, n);
+
__syncthreads();
*nclasses = 0;
if (active && active >= groupThreshold)
{
- int* r1 = rrects + tid * 4;
- int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);
+ int* r1 = rrects + tid * 4;
+ int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);
-#if (__CUDA_ARCH__ < 120)
- objects[__atomicInc(nclasses, n)] = r_out;
-#else
- int aidx = atomicInc(nclasses, n);
+ int aidx = Emulation::smem::atomicInc(nclasses, n);
objects[aidx] = r_out;
-#endif
}
}
void classifyStumpFixed(const DevMem2Di& integral, const int pitch, const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
- {
- Classifier clr(integral, pitch, (Stage*)mstages.ptr(), (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets,
- (uchar4*)mfeatures.ptr(), nstages, clWidth, clHeight, scale, step, subsetSize);
+ {
+ Classifier clr(integral, pitch, (Stage*)mstages.ptr(), (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets,
+ (uchar4*)mfeatures.ptr(), nstages, clWidth, clHeight, scale, step, subsetSize);
- int total = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
+ int total = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
- int block = 256;
+ int block = 256;
int grid = divUp(total, block);
lbp_classify_stump<<<grid, block>>>(clr, objects, objects.cols, classified, workWidth >> 1);
- cudaSafeCall( cudaGetLastError() );
+ cudaSafeCall( cudaGetLastError() );
}
void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
int block = ncandidates;
int smem = block * ( sizeof(int) + sizeof(int4) );
disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
- cudaSafeCall( cudaGetLastError() );
+ cudaSafeCall( cudaGetLastError() );
+ }
+
+ struct Cascade
+ {
+ __host__ __device__ __forceinline__ Cascade(const Stage* _stages, int _nstages, const ClNode* _nodes, const float* _leaves,
+ const int* _subsets, const uchar4* _features, int _subsetSize)
+
+ : stages(_stages), nstages(_nstages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), subsetSize(_subsetSize){}
+
+ __device__ __forceinline__ bool operator() (int y, int x, int* integral, const int pitch/*, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n*/) const
+ {
+ int current_node = 0;
+ int current_leave = 0;
+
+ for (int s = 0; s < nstages; ++s)
+ {
+ float sum = 0;
+ Stage stage = stages[s];
+ for (int t = 0; t < stage.ntrees; t++)
+ {
+ ClNode node = nodes[current_node];
+ uchar4 feature = features[node.featureIdx];
+
+ int shift;
+ int c = evaluator(integral, (y + feature.y) * pitch + x + feature.x, feature.w * pitch, feature.z, shift);
+ int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
+ sum += leaves[idx];
+
+ current_node += 1;
+ current_leave += 2;
+ }
+
+ if (sum < stage.threshold)
+ return false;
+ }
+
+ return true;
+ }
+
+ const Stage* stages;
+ const int nstages;
+
+ const ClNode* nodes;
+ const float* leaves;
+ const int* subsets;
+ const uchar4* features;
+
+ const int subsetSize;
+ const LBP evaluator;
+ };
+
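The idx computation inside Cascade::operator() splits an 8-bit LBP code into a 32-bit word index ('c') and a bit position ('shift') and tests it against the node's subset bitmask. A standalone host sketch of that lookup; the concrete code value, the subset contents and subsetSize == 8 are assumptions:

#include <cstdio>

int main()
{
    // 256 possible LBP codes -> 8 words of 32 bits per node (assuming subsetSize == 8)
    unsigned int subset[8] = { 0 };
    subset[3] = 1u << 7;                 // mark code 103 (3 * 32 + 7) as "take the left leaf"

    int code = 103;                      // example LBP code produced by the evaluator
    int word  = code >> 5;               // corresponds to 'c' in the device code
    int shift = code & 31;               // corresponds to 'shift'

    bool left = (subset[word] & (1u << shift)) != 0;
    std::printf("code %d -> %s leaf\n", code, left ? "left" : "right");
    return 0;
}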
+ // per level k: y = (tid_k / width_k) * stepShift, x = (tid_k % width_k) * stepShift + sum_prev
+ __global__ void lbp_cascade(const Cascade cascade, int frameW, int frameH, int windowW, int windowH, float scale, const float factor,
+ const int workAmount, int* integral, const int pitch, DevMem2D_<int4> objects, unsigned int* classified)
+ {
+ int ftid = blockIdx.x * blockDim.x + threadIdx.x;
+ if (ftid >= workAmount ) return;
+
+ int sum = 0;
+ // float scale = 1.0f;
+ float stepShift = (scale <= 2.f) ? 2.0 : 1.0;
+ int w = ceilf( ( __float2int_rn(frameW / scale) - windowW + 1) / stepShift);
+ int h = ceilf( ( __float2int_rn(frameH / scale) - windowH + 1) / stepShift);
+
+ // if (!ftid)
+ // printf("!!!!: %d %d", w, h);
+
+ int framTid = ftid;
+ int i = 0;
+
+ while (1)
+ {
+     // walk down the pyramid until this thread's id falls inside the current level
+     if (framTid < w * h) break;
+     i++;
+     sum += __float2int_rn(frameW / scale) + 1;
+     framTid -= w * h;
+     scale *= factor;
+     stepShift = (scale <= 2.f) ? 2.0 : 1.0;
+     w = ceilf( ( __float2int_rn(frameW / scale) - windowW + 1) / stepShift);
+     h = ceilf( ( __float2int_rn(frameH / scale) - windowH + 1) / stepShift);
+ }
+
+ int y = (framTid / w);
+ int x = (framTid - y * w) * stepShift;
+ y *= stepShift;
+ x += sum;
+
+ // if (i == 2)
+ // printf("!!!!!!!!!!!!!! %f %d %d %d\n", windowW * scale, sum, y, x);
+
+ if (cascade(y, x, integral, pitch))
+ {
+ int4 rect;
+ rect.x = roundf( (x - sum) * scale);
+ rect.y = roundf(y * scale);
+ rect.z = roundf(windowW * scale);
+ rect.w = roundf(windowH * scale);
+
+ if (rect.x > frameW || rect.y > frameH) return;
+ // printf("OUTLAUER %d %d %d %d %d %d %d %d %d %f %f\n", x, y, ftid, framTid, rect.x, rect.y, sum, w, h, stepShift, scale);
+
+ // printf("passed: %d %d ---- %d %d %d %d %d\n", y, x, rect.x, rect.y, rect.z, rect.w, sum);
+
+ int res = Emulation::smem::atomicInc(classified, (unsigned int)objects.cols);
+ objects(0, res) = rect;
+
+ }
+ }
+
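To make the indexing in lbp_cascade easier to follow, here is a host-side re-implementation of the same flat-thread-id to (level, x, y) mapping. The frame, window, scale factor and thread id are assumed example values; 'sum' tracks the column offset of the level inside the packed integral image, which the kernel adds to x:

#include <cmath>
#include <cstdio>

int main()
{
    const int frameW = 640, frameH = 480;    // assumed original frame
    const int windowW = 24, windowH = 24;    // assumed classifier window
    const float factor = 1.2f;

    int ftid = 100000;                       // example flat thread id
    float scale = 1.0f;
    int sum = 0;                             // column offset of the current level in the packed integral

    int step = (scale <= 2.f) ? 2 : 1;
    int w = (int)std::ceil((std::lround(frameW / scale) - windowW + 1) / (float)step);
    int h = (int)std::ceil((std::lround(frameH / scale) - windowH + 1) / (float)step);

    while (ftid >= w * h)                    // walk down until the id falls inside a level
    {
        ftid -= w * h;
        sum  += (int)std::lround(frameW / scale) + 1;
        scale *= factor;
        step = (scale <= 2.f) ? 2 : 1;
        w = (int)std::ceil((std::lround(frameW / scale) - windowW + 1) / (float)step);
        h = (int)std::ceil((std::lround(frameH / scale) - windowH + 1) / (float)step);
    }

    int y = (ftid / w) * step;
    int x = (ftid % w) * step;               // column inside the scaled frame; the kernel adds 'sum'
    std::printf("scale %.3f, packed-integral offset %d, window (%d, %d) scaled / (%ld, %ld) original\n",
                scale, sum, x, y, std::lround(x * scale), std::lround(y * scale));
    return 0;
}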
+ void classifyPyramid(int frameW, int frameH, int windowW, int windowH, float initialScale, float factor, int workAmount,
+ const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
+ const int subsetSize, DevMem2D_<int4> objects, unsigned int* classified, DevMem2Di integral)
+ {
+ const int block = 256;
+ int grid = divUp(workAmount, block);
+ Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize);
+ lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), integral.step / sizeof(int), objects, classified);
}
}
}}}
\ No newline at end of file
#define OPENCV_GPU_EMULATION_HPP_\r
\r
#include "warp_reduce.hpp"\r
+#include <stdio.h>\r
\r
namespace cv { namespace gpu { namespace device\r
{\r
struct Emulation\r
{\r
- template<int CTA_SIZE>\r
+ template<int CTA_SIZE>\r
static __forceinline__ __device__ int Ballot(int predicate)\r
{\r
-#if (__CUDA_ARCH__ >= 200) \r
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)\r
return __ballot(predicate);\r
#else\r
- __shared__ volatile int cta_buffer[CTA_SIZE]\r
+ __shared__ volatile int cta_buffer[CTA_SIZE];\r
\r
int tid = threadIdx.x;\r
cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
return warp_reduce(cta_buffer); // per-thread bits are disjoint, so the warp sum equals the ballot mask
#endif\r
}\r
\r
- struct smem\r
- {\r
- enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };\r
-
- template<typename T>
- static __device__ __forceinline__ T atomicInc(T* address, T val)
- {
-#if (__CUDA_ARCH__ < 120)
-
-#else
-
-#endif
-
- }
-
- template<typename T>
- static __device__ __forceinline__ void atomicAdd(T* address, T val)
- {
-#if (__CUDA_ARCH__ < 120)
-
-#else
-
-#endif
- }
-
- template<typename T>
- __device__ __forceinline__ T __atomicMin(T* address, T val)
- {
-#if (__CUDA_ARCH__ < 120)
-
-#else
-
-#endif
- }\r
- };\r
+ struct smem\r
+ {\r
+ enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };\r
+\r
+ template<typename T>\r
+ static __device__ __forceinline__ T atomicInc(T* address, T val)\r
+ {\r
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)\r
+ T count;\r
+ unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);\r
+ do\r
+ {\r
+ count = *address & TAG_MASK;\r
+ count = tag | (count + 1);\r
+ *address = count;\r
+ } while (*address != count);\r
+\r
+ return (count & TAG_MASK) - 1;\r
+#else\r
+ return ::atomicInc(address, val);\r
+#endif\r
+ }\r
+\r
+ template<typename T>\r
+ static __device__ __forceinline__ void atomicAdd(T* address, T val)\r
+ {\r
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)\r
+ T count;\r
+ unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);\r
+ do\r
+ {\r
+ count = *address & TAG_MASK;\r
+ count = tag | (count + val);\r
+ *address = count;\r
+ } while (*address != count);\r
+#else\r
+ ::atomicAdd(address, val);\r
+#endif\r
+ }\r
+\r
+ template<typename T>\r
+ static __device__ __forceinline__ T atomicMin(T* address, T val)\r
+ {\r
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)\r
+ T count = min(*address, val);\r
+ do\r
+ {\r
+ *address = count;\r
+ } while (*address > count);\r
+\r
+ return count;\r
+#else\r
+ return ::atomicMin(address, val);\r
+#endif\r
+ }\r
+ };\r
};\r
}}} // namespace cv { namespace gpu { namespace device\r
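A small CUDA sketch of how the smem helpers can be used, assuming it is compiled against the OpenCV gpu device headers so the include below resolves. A shared-memory counter is bumped with Emulation::smem::atomicInc, which falls back to the tag-based emulation below compute capability 1.2:

#include <cstdio>
#include <opencv2/gpu/device/emulation.hpp>

using cv::gpu::device::Emulation;

__global__ void countPositive(const int* data, int n, unsigned int* result)
{
    __shared__ unsigned int counter;
    if (threadIdx.x == 0) counter = 0;
    __syncthreads();

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n && data[idx] > 0)
        Emulation::smem::atomicInc(&counter, (unsigned int)n);   // emulated below sm_12, native otherwise

    __syncthreads();
    if (threadIdx.x == 0)
        ::atomicAdd(result, counter);                            // one global update per block
}

int main()
{
    const int n = 256;
    int h_data[n];
    for (int i = 0; i < n; ++i) h_data[i] = (i % 3) - 1;         // mix of negative, zero and positive values

    int* d_data; unsigned int* d_result; unsigned int h_result = 0;
    cudaMalloc(&d_data, n * sizeof(int));
    cudaMalloc(&d_result, sizeof(unsigned int));
    cudaMemcpy(d_data, h_data, n * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_result, &h_result, sizeof(unsigned int), cudaMemcpyHostToDevice);

    countPositive<<<1, n>>>(d_data, n, d_result);
    cudaMemcpy(&h_result, d_result, sizeof(unsigned int), cudaMemcpyDeviceToHost);

    std::printf("positives: %u\n", h_result);
    cudaFree(d_data); cudaFree(d_result);
    return 0;
}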
\r
#define __OPENCV_GPU_DEVICE_LBP_HPP_
#include "internal_shared.hpp"
+#include <opencv2/gpu/device/emulation.hpp>
namespace cv { namespace gpu { namespace device {
-namespace lbp{
-
- #define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )
-
- template<typename T>
- __device__ __forceinline__ T __atomicInc(T* address, T val)
- {
- T count;
- unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
- do
- {
- count = *address & TAG_MASK;
- count = tag | (count + 1);
- *address = count;
- } while (*address != count);
-
- return (count & TAG_MASK) - 1;
- }
-
- template<typename T>
- __device__ __forceinline__ void __atomicAdd(T* address, T val)
- {
- T count;
- unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
- do
- {
- count = *address & TAG_MASK;
- count = tag | (count + val);
- *address = count;
- } while (*address != count);
- }
-
- template<typename T>
- __device__ __forceinline__ T __atomicMin(T* address, T val)
- {
- T count = min(*address, val);
- do
- {
- *address = count;
- } while (*address > count);
-
- return count;
- }
+namespace lbp {
struct Stage
{
unsigned tid = threadIdx.x;
labels[tid] = tid;
__syncthreads();
-
for (unsigned int id = 0; id < n; id++)
{
if (tid != id && predicate(vec[tid], vec[id]))
{
int p = labels[tid];
int q = labels[id];
-
- if (p != q)
- {
- int m = min(p, q);
-#if (__CUDA_ARCH__ < 120)
- __atomicMin(labels + id, m);
-#else
- atomicMin(labels + id, m);
-#endif
- }
+ if (p < q)
+ {
+ Emulation::smem::atomicMin(labels + id, p);
+ }
+ else if (p > q)
+ {
+ Emulation::smem::atomicMin(labels + tid, q);
+ }
}
}
__syncthreads();
}
+
} // lbp
} } }// namespaces
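Finally, the grouping step is easier to reason about on the host. This standalone sketch reproduces the min-label propagation idea used by the device code above, with a simplified similarity predicate standing in for InSameComponint; the candidate rectangles and eps are made-up values:

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <vector>

struct Rect4 { int x, y, w, h; };

static bool similar(const Rect4& a, const Rect4& b, float eps = 0.2f)
{
    // simplified stand-in for the device-side rectangle-similarity predicate
    float delta = eps * (std::min(a.w, b.w) + std::min(a.h, b.h)) * 0.5f;
    return std::abs(a.x - b.x) <= delta && std::abs(a.y - b.y) <= delta &&
           std::abs(a.x + a.w - b.x - b.w) <= delta && std::abs(a.y + a.h - b.y - b.h) <= delta;
}

int main()
{
    std::vector<Rect4> c = { {10, 10, 24, 24}, {12, 11, 24, 24}, {200, 50, 30, 30} };
    std::vector<int> labels(c.size());
    for (size_t i = 0; i < c.size(); ++i) labels[i] = (int)i;

    bool changed = true;                       // iterate until the labels stabilise
    while (changed)
    {
        changed = false;
        for (size_t i = 0; i < c.size(); ++i)
            for (size_t j = 0; j < c.size(); ++j)
                if (i != j && similar(c[i], c[j]) && labels[j] > labels[i])
                {
                    labels[j] = labels[i];     // same effect as atomicMin on the labels array
                    changed = true;
                }
    }

    for (size_t i = 0; i < c.size(); ++i)
        std::printf("candidate %zu -> group %d\n", i, labels[i]);
    return 0;
}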