#include <opencv2/gpu/device/common.hpp>
#include <icf.hpp>
-// #include <opencv2/gpu/device/saturate_cast.hpp>
#include <stdio.h>
-// #include <float.h>
+#include <float.h>
-// //#define LOG_CUDA_CASCADE
+// #define LOG_CUDA_CASCADE
-// #if defined LOG_CUDA_CASCADE
-// # define dprintf(format, ...) \
-// do { printf(format, __VA_ARGS__); } while (0)
-// #else
-// # define dprintf(format, ...)
-// #endif
+#if defined LOG_CUDA_CASCADE
+# define dprintf(format, ...) \
+ do { printf(format, __VA_ARGS__); } while (0)
+#else
+# define dprintf(format, ...)
+#endif
namespace cv { namespace gpu { namespace device {
namespace icf {
cudaSafeCall( cudaDeviceSynchronize() );
}
- texture<float2, cudaTextureType1D, cudaReadModeElementType> tnode;
+ texture<int, cudaTextureType2D, cudaReadModeElementType> thogluv;
+ // ToDo: do it in load time
+ // __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node)
+ // {
+ // scaledRect = node.rect;
+ // return (float)(node.threshold & 0x0FFFFFFFU);
+ // }
+
+ __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node)
+ {
+ float relScale = level.relScale;
+ float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
+
+ dprintf("feature %d box %d %d %d %d\n", (node.threshold >> 28), scaledRect.x, scaledRect.y,
+ scaledRect.z, scaledRect.w);
+ dprintf("rescale: %f [%f %f] selected %f\n",level.relScale, level.scaling[0], level.scaling[1],
+ level.scaling[(node.threshold >> 28) > 6]);
+
+ // rescale
+ scaledRect.x = __float2int_rn(relScale * scaledRect.x);
+ scaledRect.y = __float2int_rn(relScale * scaledRect.y);
+ scaledRect.z = __float2int_rn(relScale * scaledRect.z);
+ scaledRect.w = __float2int_rn(relScale * scaledRect.w);
+
+ float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
+
+ float approx = 1.f;
+ // if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON)
+ {
+ const float expected_new_area = farea * relScale * relScale;
+ approx = sarea / expected_new_area;
+ }
+
+ dprintf("new rect: %d box %d %d %d %d rel areas %f %f\n", (node.threshold >> 28),
+ scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
+
+
+ float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx;
+ rootThreshold *= level.scaling[(node.threshold >> 28) > 6];
+
+ dprintf("approximation %f %d -> %f %f\n", approx, (node.threshold & 0x0FFFFFFFU), rootThreshold,
+ level.scaling[(node.threshold >> 28) > 6]);
+
+ return rootThreshold;
+ }
+
+ __device__ __forceinline__ int get(const int x, int y, int channel, uchar4 area)
+ {
+
+ dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
+ dprintf("get for channel %d\n", channel);
+ dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",
+ x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w,
+ x + area.x, y + area.w);
+ dprintf("at point %d %d with offset %d\n", x, y, 0);
+
+ int offset = channel * 121;
+ y += offset;
+
+ int a = tex2D(thogluv, x + area.x, y + area.y);
+ int b = tex2D(thogluv, x + area.z, y + area.y);
+ int c = tex2D(thogluv, x + area.z, y + area.w);
+ int d = tex2D(thogluv, x + area.x, y + area.w);
+
+ dprintf(" retruved integral values: %d %d %d %d\n", a, b, c, d);
+
+ return (a - b + c - d);
+ }
+
__global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages,
- const Node* nodes,
- PtrStepSz<uchar4> objects)
+ const Node* nodes, const float* leaves, PtrStepSz<uchar4> objects)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
Level level = levels[blockIdx.z];
+
+ // if (x > 0 || y > 0 || blockIdx.z > 0) return;
if(x >= level.workRect.x || y >= level.workRect.y) return;
Octave octave = octaves[level.octave];
+
int st = octave.index * octave.stages;
const int stEnd = st + 1000;//octave.stages;
float confidence = 0.f;
-#pragma unroll 8
+// #pragma unroll 8
for(; st < stEnd; ++st)
{
+ dprintf("\n\nstage: %d\n", st);
const int nId = st * 3;
- const Node node = nodes[nId];
+ Node node = nodes[nId];
+
+ dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w,
+ node.threshold >> 28, node.threshold & 0x0FFFFFFFU);
+
+ float threshold = rescale(level, node.rect, node);
+ int sum = get(x, y, (node.threshold >> 28), node.rect);
+
+ dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z,
+ node.rect.w, threshold);
+
+ int next = 1 + (int)(sum >= threshold);
+ dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold);
+
+ node = nodes[nId + next];
+ threshold = rescale(level, node.rect, node);
+ sum = get(x, y, (node.threshold >> 28), node.rect);
- const float stage = stages[st];
- confidence += node.rect.x * stage;
+ const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
+ float impact = leaves[st * 4 + lShift];
+ confidence += impact;
+
+ if (confidence <= stages[st]) st = stEnd + 1;
+ dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
+ dprintf("extracted stage: %f\n", stages[st]);
+ dprintf("computed score: %f\n\n", confidence);
}
+ // if (st == stEnd)
+ // printf("%d %d %d\n", x, y, st);
+
uchar4 val;
val.x = (int)confidence;
if (x == y) objects(0, threadIdx.x) = val;
}
void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
- const PtrStepSzb& nodes, const PtrStepSzb& features,
- PtrStepSz<uchar4> objects)
+ const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects)
{
int fw = 160;
int fh = 120;
+
dim3 block(32, 8);
dim3 grid(fw / 32, fh / 8, 47);
+
const Level* l = (const Level*)levels.ptr();
const Octave* oct = ((const Octave*)octaves.ptr());
const float* st = (const float*)stages.ptr();
const Node* nd = (const Node*)nodes.ptr();
- // cudaSafeCall( cudaBindTexture(0, tnode, nodes.data, rgb.cols / size) );
+ const float* lf = (const float*)leaves.ptr();
+
+ cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
+ cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
- test_kernel<<<grid, block>>>(l, oct, st, nd, objects);
+ test_kernel<<<grid, block>>>(l, oct, st, nd, lf, objects);
cudaSafeCall( cudaGetLastError());
cudaSafeCall( cudaDeviceSynchronize());
}
}
-}}}
-
-// __global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restrict__ hogluv, const int pitch,
-// PtrStepSz<uchar4> objects)
-// {
-// cascade.detectAt(hogluv, pitch, objects);
-// }
-
-// }
-
-// float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect,
-// const int channel, const float threshold) const
-// {
-// dprintf("feature %d box %d %d %d %d\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w);
-// dprintf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]);
-
-// float relScale = level.relScale;
-// float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
-
-// // rescale
-// scaledRect.x = __float2int_rn(relScale * scaledRect.x);
-// scaledRect.y = __float2int_rn(relScale * scaledRect.y);
-// scaledRect.z = __float2int_rn(relScale * scaledRect.z);
-// scaledRect.w = __float2int_rn(relScale * scaledRect.w);
-
-// float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
-
-
-// float approx = 1.f;
-// if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON)
-// {
-// const float expected_new_area = farea * relScale * relScale;
-// approx = expected_new_area / sarea;
-// }
-
-// dprintf("new rect: %d box %d %d %d %d rel areas %f %f\n", channel,
-// scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
-
-// // compensation areas rounding
-// float rootThreshold = threshold / approx;
-// // printf(" approx %f\n", rootThreshold);
-// rootThreshold *= level.scaling[(int)(channel > 6)];
-
-// dprintf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]);
-
-// return rootThreshold;
-// }
-
-// typedef unsigned char uchar;
-// float __device get(const int* __restrict__ hogluv, const int pitch,
-// const int x, const int y, int channel, uchar4 area)
-// {
-// dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
-// dprintf("get for channel %d\n", channel);
-// dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",
-// x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w,
-// x + area.x, y + area.w);
-// dprintf("at point %d %d with offset %d\n", x, y, 0);
-
-// const int* curr = hogluv + ((channel * 121) + y) * pitch;
-
-// int a = curr[area.y * pitch + x + area.x];
-// int b = curr[area.y * pitch + x + area.z];
-// int c = curr[area.w * pitch + x + area.z];
-// int d = curr[area.w * pitch + x + area.x];
-
-// dprintf(" retruved integral values: %d %d %d %d\n", a, b, c, d);
-
-// return (a - b + c - d);
-// }
-
-
-// void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int pitch,
-// PtrStepSz<uchar4>& objects) const
-// {
-// const icf::Level* lls = (const icf::Level*)levels.ptr();
-
-// const int y = blockIdx.y * blockDim.y + threadIdx.y;
-// const int x = blockIdx.x * blockDim.x + threadIdx.x;
-// // if (x > 0 || y > 0) return;
-
-// Level level = lls[blockIdx.z];
-// if (x >= level.workRect.x || y >= level.workRect.y) return;
-
-// dprintf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
-// level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y);
-
-// const Octave octave = ((const Octave*)octaves.ptr())[level.octave];
-// // printf("Octave: %d %d %d (%d %d) %f\n", octave.index, octave.stages,
-// // octave.shrinkage, octave.size.x, octave.size.y, octave.scale);
-
-// const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages;
-
-// float detectionScore = 0.f;
-
-// int st = stBegin;
-// for(; st < stEnd; ++st)
-// {
-// const float stage = stages(0, st);
-// dprintf("Stage: %f\n", stage);
-// {
-// const int nId = st * 3;
-
-// // work with root node
-// const Node node = ((const Node*)nodes.ptr())[nId];
-
-// dprintf("Node: %d %f\n", node.feature, node.threshold);
-
-// const Feature feature = ((const Feature*)features.ptr())[node.feature];
-
-// uchar4 scaledRect = feature.rect;
-// float threshold = rescale(level, scaledRect, feature.channel, node.threshold);
-
-// float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect);
-
-// dprintf("root feature %d %f\n",feature.channel, sum);
-
-// int next = 1 + (int)(sum >= threshold);
-
-// dprintf("go: %d (%f >= %f)\n\n" ,next, sum, threshold);
-
-// // leaves
-// const Node leaf = ((const Node*)nodes.ptr())[nId + next];
-// const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature];
-
-// scaledRect = fLeaf.rect;
-// threshold = rescale(level, scaledRect, fLeaf.channel, leaf.threshold);
-// sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect);
-
-// const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
-// float impact = leaves(0, (st * 4) + lShift);
-
-// detectionScore += impact;
-
-// dprintf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
-// dprintf("extracted stage:\n");
-// dprintf("ct %f\n", stage);
-// dprintf("computed score %f\n\n", detectionScore);
-// dprintf("\n\n");
-// }
-
-// if (detectionScore <= stage || st - stBegin == 100) break;
-// }
-
-// dprintf("x %d y %d: %d\n", x, y, st - stBegin);
-
-// if (st == stEnd)
-// {
-// uchar4 a;
-// a.x = level.workRect.x;
-// a.y = level.workRect.y;
-// objects(0, threadIdx.x) = a;
-// }
-// }
-
-// void icf::Cascade::detect(const cv::gpu::PtrStepSzi& hogluv, PtrStepSz<uchar4> objects, cudaStream_t stream) const
-// {
-// dim3 block(32, 8, 1);
-// dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 47);
-// device::detect<<<grid, block, 0, stream>>>(*this, hogluv, hogluv.step / sizeof(int), objects);
-// cudaSafeCall( cudaGetLastError() );
-// if (!stream)
-// cudaSafeCall( cudaDeviceSynchronize() );
-// }
-
-// }}
\ No newline at end of file
+}}}
\ No newline at end of file
void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
const int fw, const int fh, const int bins);
void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
- const PtrStepSzb& nodes, const PtrStepSzb& features,
- PtrStepSz<uchar4> objects);
+ const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects);
}
}}}
-// namespace {
-// char *itoa(long i, char* s, int /*dummy_radix*/)
-// {
-// sprintf(s, "%ld", i);
-// return s;
-// }
-// }
-
struct cv::gpu::SoftCascade::Filds
{
GpuMat stages;
GpuMat nodes;
GpuMat leaves;
- GpuMat features;
GpuMat levels;
// preallocated buffer 640x480x10 for hogluv + 640x480 got gray
bool fill(const FileNode &root, const float mins, const float maxs);
void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
{
- device::icf::detect(levels, octaves, stages, nodes, features, objects);
+ device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects);
}
private:
if (fn.empty()) return false;
std::vector<Octave> voctaves;
- std::vector<float> vstages;
+ std::vector<float> vstages;
std::vector<Node> vnodes;
- std::vector<float> vleaves;
- std::vector<Feature> vfeatures;
+ std::vector<float> vleaves;
scales.clear();
FileNodeIterator it = fn.begin(), it_end = fn.end();
FileNode ffs = fns[SC_FEATURES];
if (ffs.empty()) return false;
+ FileNodeIterator ftrs = ffs.begin();
+
fns = fns[SC_STAGES];
if (fn.empty()) return false;
FileNodeIterator inIt = fns.begin(), inIt_end = fns.end();
for (; inIt != inIt_end;)
{
- int feature = (int)(*(inIt +=2)++) + feature_offset;
- float th = (float)(*(inIt++));
+ // int feature = (int)(*(inIt +=2)) + feature_offset;
+ inIt +=3;
+ // extract feature, Todo:check it
+ uint th = saturate_cast<uint>((float)(*(inIt++)));
+ cv::FileNode ftn = (*ftrs)[SC_F_RECT];
+ cv::FileNodeIterator r_it = ftn.begin();
uchar4 rect;
- vnodes.push_back(Node(rect, th));
+ rect.x = saturate_cast<uchar>((int)*(r_it++));
+ rect.y = saturate_cast<uchar>((int)*(r_it++));
+ rect.z = saturate_cast<uchar>((int)*(r_it++));
+ rect.w = saturate_cast<uchar>((int)*(r_it++));
+
+ uint channel = saturate_cast<uint>((int)(*ftrs)[SC_F_CHANNEL]);
+ vnodes.push_back(Node(rect, channel, th));
+ ++ftrs;
}
fns = (*ftr)[SC_LEAF];
}
}
- st = ffs.begin(), st_end = ffs.end();
- for (; st != st_end; ++st )
- {
- cv::FileNode rn = (*st)[SC_F_RECT];
- cv::FileNodeIterator r_it = rn.begin();
- uchar4 rect;
- rect.x = saturate_cast<uchar>((int)*(r_it++));
- rect.y = saturate_cast<uchar>((int)*(r_it++));
- rect.z = saturate_cast<uchar>((int)*(r_it++));
- rect.w = saturate_cast<uchar>((int)*(r_it++));
- vfeatures.push_back(Feature((int)(*st)[SC_F_CHANNEL], rect));
- }
-
feature_offset += octave.stages * 3;
++octIndex;
}
leaves.upload(cv::Mat(vleaves).reshape(1,1));
CV_Assert(!leaves.empty());
- features.upload(cv::Mat(1, vfeatures.size() * sizeof(Feature), CV_8UC1, (uchar*)&(vfeatures[0]) ));
- CV_Assert(!features.empty());
-
// compute levels
calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
CV_Assert(!levels.empty());
return true;
}
-// #define USE_REFERENCE_VALUES
+#define USE_REFERENCE_VALUES
+namespace {
+ char *itoa(long i, char* s, int /*dummy_radix*/)
+ {
+ sprintf(s, "%ld", i);
+ return s;
+ }
+}
void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/,
GpuMat& objects, const int /*rejectfactor*/, Stream s)
{
Filds& flds = *filds;
#if defined USE_REFERENCE_VALUES
-// cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
-// cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ);
-// char buff[33];
-
-// for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
-// {
-// cv::Mat channel;
-// imgs[std::string("channel") + itoa(i, buff, 10)] >> channel;
-// GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121));
-// gchannel.upload(channel);
-// }
+ cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
+
+ cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ);
+ char buff[33];
+
+ for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
+ {
+ cv::Mat channel;
+ imgs[std::string("channel") + itoa(i, buff, 10)] >> channel;
+
+ // std::cout << "channel " << i << std::endl << channel << std::endl;
+ GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121));
+ gchannel.upload(channel);
+ }
#else
GpuMat& plane = flds.plane;
GpuMat& shrunk = flds.shrunk;