From 672cf1f44576bb82f1202c0c15c947b47469304b Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 1 Oct 2012 14:50:28 +0400 Subject: [PATCH] implement different behaviour for up- and down-scaling --- modules/gpu/src/cuda/isf-sc.cu | 98 +++++++++++++++++++++++++++++------ modules/gpu/src/softcascade.cpp | 13 ++++- modules/gpu/test/test_softcascade.cpp | 2 +- 3 files changed, 95 insertions(+), 18 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index cc45024..c9a92e3 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -86,8 +86,11 @@ namespace icf { } texture thogluv; - __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node) + + template + __device__ __forceinline__ float rescale(const Level& level, Node& node) { + uchar4& scaledRect = node.rect; float relScale = level.relScale; float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); @@ -119,7 +122,44 @@ namespace icf { return rootThreshold; } - __device__ __forceinline__ int get(const int x, int y, uchar4 area) + template<> + __device__ __forceinline__ float rescale(const Level& level, Node& node) + { + uchar4& scaledRect = node.rect; + float relScale = level.relScale; + float farea = scaledRect.z * scaledRect.w; + + dprintf("%d: feature %d box %d %d %d %d\n",threadIdx.x, (node.threshold >> 28), scaledRect.x, scaledRect.y, + scaledRect.z, scaledRect.w); + dprintf("%d: rescale: %f [%f %f] selected %f\n",threadIdx.x, level.relScale, level.scaling[0], level.scaling[1], + level.scaling[(node.threshold >> 28) > 6]); + + // rescale + scaledRect.x = __float2int_rn(relScale * scaledRect.x); + scaledRect.y = __float2int_rn(relScale * scaledRect.y); + scaledRect.z = __float2int_rn(relScale * scaledRect.z); + scaledRect.w = __float2int_rn(relScale * scaledRect.w); + + float sarea = scaledRect.z * scaledRect.w; + + const float expected_new_area = farea * relScale * relScale; + float approx = __fdividef(sarea, expected_new_area); + + dprintf("%d: new rect: %d box %d %d %d %d rel areas %f %f\n",threadIdx.x, (node.threshold >> 28), + scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); + + float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx; + + rootThreshold *= level.scaling[(node.threshold >> 28) > 6]; + + dprintf("%d: approximation %f %d -> %f %f\n",threadIdx.x, approx, (node.threshold & 0x0FFFFFFFU), rootThreshold, + level.scaling[(node.threshold >> 28) > 6]); + + return rootThreshold; + } + + template + __device__ __forceinline__ int get(int x, int y, uchar4 area) { dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w); @@ -138,7 +178,30 @@ namespace icf { return (a - b + c - d); } + template<> + __device__ __forceinline__ int get(int x, int y, uchar4 area) + { + + dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w); + dprintf("%d: extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",threadIdx.x, + x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w, + x + area.x, y + area.w); + dprintf("%d: at point %d %d with offset %d\n", x, y, 0); + + x += area.x; + y += area.y; + int a = tex2D(thogluv, x, y); + int b = tex2D(thogluv, x + area.z, y); + int c = tex2D(thogluv, x + area.z, y + area.w); + int d = tex2D(thogluv, x, y + area.w); + + dprintf("%d retruved integral values: %d %d %d %d\n",threadIdx.x, a, b, c, d); + + return (a - b + c - d); + } + #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 + template __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages, const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr, const int downscales) @@ -163,15 +226,15 @@ namespace icf { dprintf("\n\n%d: stage: %d %d\n",threadIdx.x, st, nId); Node node = nodes[nId]; - float threshold = rescale(level, node.rect, node); - int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); + float threshold = rescale(level, node); + int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); int next = 1 + (int)(sum >= threshold); dprintf("%d: go: %d (%d >= %f)\n\n" ,threadIdx.x, next, sum, threshold); node = nodes[nId + next]; - threshold = rescale(level, node.rect, node); - sum = get(x, y + (node.threshold >> 28) * 121, node.rect); + threshold = rescale(level, node); + sum = get(x, y + (node.threshold >> 28) * 121, node.rect); const int lShift = (next - 1) * 2 + (int)(sum >= threshold); float impact = leaves[(st + threadIdx.x) * 4 + lShift]; @@ -192,7 +255,7 @@ namespace icf { dprintf("%d: impact scaned %f\n" ,threadIdx.x, impact); confidence += impact; - if(__any((confidence <= stages[(st + threadIdx.x)]))) break; + if(__any((confidence <= stages[(st + threadIdx.x)]))) st += stEnd; } if(st == stEnd && !threadIdx.x) @@ -204,6 +267,7 @@ namespace icf { } } #else + template __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages, const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr, const int downscales) @@ -231,8 +295,8 @@ namespace icf { dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w, node.threshold >> 28, node.threshold & 0x0FFFFFFFU); - float threshold = rescale(level, node.rect, node); - int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); + float threshold = rescale(level, node); + int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w, threshold); @@ -241,8 +305,8 @@ namespace icf { dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold); node = nodes[nId + next]; - threshold = rescale(level, node.rect, node); - sum = get(x, y + (node.threshold >> 28) * 121, node.rect); + threshold = rescale(level, node); + sum = get(x, y + (node.threshold >> 28) * 121, node.rect); const int lShift = (next - 1) * 2 + (int)(sum >= threshold); float impact = leaves[st * 4 + lShift]; @@ -286,18 +350,18 @@ namespace icf { cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, 0); + test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, 0); cudaSafeCall( cudaGetLastError()); grid = dim3(fw, fh / 8, 47 - downscales); - test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, downscales); + test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, downscales); cudaSafeCall( cudaGetLastError()); cudaSafeCall( cudaDeviceSynchronize()); } void detectAtScale(const int scale, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects, - PtrStepSzi counter) + PtrStepSzi counter, const int downscales) { int fw = 160; int fh = 120; @@ -317,7 +381,11 @@ namespace icf { cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, scale); + if (scale >= downscales) + test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, scale); + else + test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, scale); + cudaSafeCall( cudaGetLastError()); cudaSafeCall( cudaDeviceSynchronize()); } diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index fd94909..8868aa5 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -68,7 +68,7 @@ namespace icf { PtrStepSzi counter, const int downscales); void detectAtScale(const int scale, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects, - PtrStepSzi counter); + PtrStepSzi counter, const int downscales); } }}} @@ -147,7 +147,7 @@ struct cv::gpu::SoftCascade::Filds { cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int)); device::icf::detectAtScale(scale, levels, octaves, stages, nodes, leaves, hogluv, objects, - detCounter); + detCounter, downscales); } private: @@ -240,6 +240,9 @@ bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, c { FileNode fns = *it; float scale = (float)fns[SC_OCT_SCALE]; + + bool isUPOctave = scale >= 1; + scales.push_back(scale); ushort nstages = saturate_cast((int)fns[SC_OCT_STAGES]); ushort2 size; @@ -286,6 +289,12 @@ bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, c rect.z = saturate_cast((int)*(r_it++)); rect.w = saturate_cast((int)*(r_it++)); + if (isUPOctave) + { + rect.z -= rect.x; + rect.w -= rect.y; + } + uint channel = saturate_cast((int)(*ftrs)[SC_F_CHANNEL]); vnodes.push_back(Node(rect, channel, th)); ++ftrs; diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp index 0da0729..4d1a4b7 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/gpu/test/test_softcascade.cpp @@ -63,7 +63,7 @@ TEST(SoftCascade, detect) cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/bahnhof/image_00000000_0.png"); ASSERT_FALSE(coloredCpu.empty()); - GpuMat colored(coloredCpu), objectBoxes(1, 1000, CV_8UC1), rois; + GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois; // ASSERT_NO_THROW( // { -- 2.7.4