optimize roi loads

author marina.kolpakova <marina.kolpakova@itseez.com>

Mon, 8 Oct 2012 11:37:28 +0000 (15:37 +0400)

committer marina.kolpakova <marina.kolpakova@itseez.com>

Sat, 10 Nov 2012 01:08:56 +0000 (05:08 +0400)
author marina.kolpakova <marina.kolpakova@itseez.com>
Mon, 8 Oct 2012 11:37:28 +0000 (15:37 +0400)
committer marina.kolpakova <marina.kolpakova@itseez.com>
Sat, 10 Nov 2012 01:08:56 +0000 (05:08 +0400)
diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp

index 2224194..e6efcc2 100644 (file)
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -176,33 +176,35 @@ PERF_TEST_P(SoftCascadeTest, detect,
  {
      if (runOnGpu)
      {
-        cv::Mat cpu = readImage (GetParam().second);
+        cv::Mat cpu = readImage (GET_PARAM(1));
          ASSERT_FALSE(cpu.empty());
          cv::gpu::GpuMat colored(cpu);
  
          cv::gpu::SoftCascade cascade;
-        ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first)));
+        ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
  
-        cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
-
-        rois.setTo(0);
-        cv::gpu::GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
-        sub.setTo(cv::Scalar::all(1));
-        cascade.detectMultiScale(colored, rois, objectBoxes);
+        cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois;
+        rois.setTo(1);
+        cv::gpu::transpose(rois, trois);
+        cascade.detectMultiScale(colored, trois, objectBoxes);
  
          TEST_CYCLE()
          {
-            cascade.detectMultiScale(colored, rois, objectBoxes);
+            cascade.detectMultiScale(colored, trois, objectBoxes);
          }
-    } else
+    }
+    else
      {
-        cv::Mat colored = readImage(GetParam().second);
+        cv::Mat colored = readImage(GET_PARAM(1));
          ASSERT_FALSE(colored.empty());
  
          cv::SoftCascade cascade;
-        ASSERT_TRUE(cascade.load(getDataPath(GetParam().first)));
+        ASSERT_TRUE(cascade.load(getDataPath(GET_PARAM(0))));
+
+        std::vector<cv::Rect> rois;
  
-        std::vector<cv::Rect> rois, objectBoxes;
+        typedef cv::SoftCascade::Detection Detection;
+        std::vector<Detection>objectBoxes;
          cascade.detectMultiScale(colored, rois, objectBoxes);
  
          TEST_CYCLE()
@@ -262,13 +264,16 @@ PERF_TEST_P(SoftCascadeTestRoi, detectInRoi,
              sub.setTo(1);
          }
  
+        cv::gpu::GpuMat trois;
+        cv::gpu::transpose(rois, trois);
+
          cv::gpu::GpuMat curr = objectBoxes;
-        cascade.detectMultiScale(colored, rois, curr);
+        cascade.detectMultiScale(colored, trois, curr);
  
          TEST_CYCLE()
          {
              curr = objectBoxes;
-            cascade.detectMultiScale(colored, rois, curr);
+            cascade.detectMultiScale(colored, trois, curr);
          }
      }
      else
@@ -301,7 +306,10 @@ PERF_TEST_P(SoftCascadeTestRoi, detectEachRoi,
          sub.setTo(1);
  
          cv::gpu::GpuMat curr = objectBoxes;
-        cascade.detectMultiScale(colored, rois, curr);
+        cv::gpu::GpuMat trois;
+        cv::gpu::transpose(rois, trois);
+
+        cascade.detectMultiScale(colored, trois, curr);
  
          TEST_CYCLE()
          {
@@ -372,7 +380,7 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
      cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
      ASSERT_FALSE(img.empty());
  
-    if (PERF_RUN_GPU())
+    if (runOnGpu)
      {
          cv::gpu::CascadeClassifier_GPU d_cascade;
          ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu

index 4bde7f7..8df6907 100644 (file)
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -86,7 +86,6 @@ namespace icf {
      }
  
      texture<int,  cudaTextureType2D, cudaReadModeElementType> thogluv;
-    texture<char,  cudaTextureType2D, cudaReadModeElementType> troi;
  
      template<bool isUp>
      __device__ __forceinline__ float rescale(const Level& level, Node& node)
@@ -130,11 +129,6 @@ namespace icf {
          float relScale = level.relScale;
          float farea = scaledRect.z * scaledRect.w;
  
-        dprintf("%d: feature %d box %d %d %d %d\n",threadIdx.x, (node.threshold >> 28), scaledRect.x, scaledRect.y,
-            scaledRect.z, scaledRect.w);
-        dprintf("%d: rescale: %f [%f %f] selected %f\n",threadIdx.x, level.relScale, level.scaling[0], level.scaling[1],
-            level.scaling[(node.threshold >> 28) > 6]);
-
          // rescale
          scaledRect.x = __float2int_rn(relScale * scaledRect.x);
          scaledRect.y = __float2int_rn(relScale * scaledRect.y);
@@ -146,15 +140,7 @@ namespace icf {
          const float expected_new_area = farea * relScale * relScale;
          float approx = __fdividef(sarea, expected_new_area);
  
-        dprintf("%d: new rect: %d box %d %d %d %d  rel areas %f %f\n",threadIdx.x, (node.threshold >> 28),
-        scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
-
-        float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx;
-
-        rootThreshold *= level.scaling[(node.threshold >> 28) > 6];
-
-        dprintf("%d: approximation %f %d -> %f %f\n",threadIdx.x, approx, (node.threshold & 0x0FFFFFFFU), rootThreshold,
-            level.scaling[(node.threshold >> 28) > 6]);
+        float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx * level.scaling[(node.threshold >> 28) > 6];
  
          return rootThreshold;
      }
@@ -162,33 +148,17 @@ namespace icf {
      template<bool isUp>
      __device__ __forceinline__ int get(int x, int y, uchar4 area)
      {
-
-        dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w);
-        dprintf("%d: extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",threadIdx.x,
-            x + area.x, y + area.y,  x + area.z, y + area.y,  x + area.z,y + area.w,
-            x + area.x, y + area.w);
-        dprintf("%d: at point %d %d with offset %d\n", x, y, 0);
-
          int a = tex2D(thogluv, x + area.x, y + area.y);
          int b = tex2D(thogluv, x + area.z, y + area.y);
          int c = tex2D(thogluv, x + area.z, y + area.w);
          int d = tex2D(thogluv, x + area.x, y + area.w);
  
-        dprintf("%d   retruved integral values: %d %d %d %d\n",threadIdx.x, a, b, c, d);
-
          return (a - b + c - d);
      }
  
      template<>
      __device__ __forceinline__ int get<true>(int x, int y, uchar4 area)
      {
-
-        dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w);
-        dprintf("%d: extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",threadIdx.x,
-            x + area.x, y + area.y,  x + area.z, y + area.y,  x + area.z,y + area.w,
-            x + area.x, y + area.w);
-        dprintf("%d: at point %d %d with offset %d\n", x, y, 0);
-
          x += area.x;
          y += area.y;
          int a = tex2D(thogluv, x, y);
@@ -196,11 +166,10 @@ namespace icf {
          int c = tex2D(thogluv, x + area.z, y + area.w);
          int d = tex2D(thogluv, x, y + area.w);
  
-        dprintf("%d   retruved integral values: %d %d %d %d\n",threadIdx.x, a, b, c, d);
-
          return (a - b + c - d);
      }
  
+    texture<float2,  cudaTextureType2D, cudaReadModeElementType> troi;
  #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
      template<bool isUp>
      __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
@@ -210,12 +179,21 @@ namespace icf {
          const int y = blockIdx.y * blockDim.y + threadIdx.y;
          const int x = blockIdx.x;
  
+        __shared__ volatile char roiCache[8];
+
+        if (!threadIdx.y && !threadIdx.x)
+        {
+            ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x);
+        }
+
+        __syncthreads();
+
+        if (!roiCache[threadIdx.y]) return;
+
          Level level = levels[downscales + blockIdx.z];
  
          if(x >= level.workRect.x || y >= level.workRect.y) return;
  
-        if (!tex2D(troi, x, y)) return;
-
          Octave octave = octaves[level.octave];
          int st = octave.index * octave.stages;
          const int stEnd = st + 1024;
@@ -282,9 +260,9 @@ namespace icf {
          // if (blockIdx.z != 31) return;
          if(x >= level.workRect.x || y >= level.workRect.y) return;
  
-        int roi = tex2D(troi, x, y);
-        printf("%d\n", roi);
-        if (!roi) return;
+        // int roi = tex2D(troi, x, y);
+        // printf("%d\n", roi);
+        // if (!roi) return;
  
          Octave octave = octaves[level.octave];
  
@@ -357,8 +335,8 @@ namespace icf {
          cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
          cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
  
-        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<char>();
-        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols, roi.rows, roi.step));
+        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
+        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
  
          test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, 0);
          cudaSafeCall( cudaGetLastError());
@@ -391,8 +369,8 @@ namespace icf {
          cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
          cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
  
-        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<char>();
-        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols, roi.rows, roi.step));
+        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
+        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
  
          if (scale >= downscales)
              test_kernel_warp<true><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp

index 8b73ae6..e7fcfff 100644 (file)
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -481,7 +481,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
      CV_Assert(colored.type() == CV_8UC3);
  
      // we guess user knows about shrincage
-    CV_Assert((rois.size() == getRoiSize()) && (rois.type() == CV_8UC1));
+    CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
  
      // only this window size allowed
      CV_Assert(colored.cols == Filds::FRAME_WIDTH && colored.rows == Filds::FRAME_HEIGHT);
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp

index 0b266f8..04fa9b1 100644 (file)
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -47,7 +47,7 @@
  using cv::gpu::GpuMat;
  
  // show detection results on input image with cv::imshow
-//#define SHOW_DETECTIONS
+#define SHOW_DETECTIONS
  
  #if defined SHOW_DETECTIONS
  # define SHOW(res)           \
@@ -154,26 +154,30 @@ GPU_TEST_P(SoftCascadeTest, detectInROI,
      cv::gpu::SoftCascade cascade;
      ASSERT_TRUE(cascade.load(cvtest::TS::ptr()->get_data_path() + GET_PARAM(0)));
  
-    GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+    GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois;
      rois.setTo(0);
  
      int nroi = GET_PARAM(2);
+    cv::Mat result(coloredCpu);
      cv::RNG rng;
      for (int i = 0; i < nroi; ++i)
      {
          cv::Rect r = getFromTable(rng(10));
          GpuMat sub(rois, r);
          sub.setTo(1);
+        r.x *= 4; r.y *= 4; r.width *= 4; r.height *= 4;
+        cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1);
      }
  
-    cascade.detectMultiScale(colored, rois, objectBoxes);
+    cv::gpu::transpose(rois, trois);
+
+    cascade.detectMultiScale(colored, trois, objectBoxes);
  
      ///
      cv::Mat dt(objectBoxes);
      typedef cv::gpu::SoftCascade::Detection detection_t;
  
      detection_t* dts = (detection_t*)dt.data;
-    cv::Mat result(coloredCpu);
  
      printTotal(std::cout, dt.cols);
      for (int i = 0; i  < (int)(dt.cols / sizeof(detection_t)); ++i)
@@ -204,8 +208,11 @@ GPU_TEST_P(SoftCascadeTest, detectInLevel,
      GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(detection_t), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
      rois.setTo(1);
  
+    cv::gpu::GpuMat trois;
+    cv::gpu::transpose(rois, trois);
+
      int level = GET_PARAM(2);
-    cascade.detectMultiScale(colored, rois, objectBoxes, 1, level);
+    cascade.detectMultiScale(colored, trois, objectBoxes, 1, level);
  
      cv::Mat dt(objectBoxes);
  
@@ -246,6 +253,9 @@ TEST(SoftCascadeTest, detect)
      GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
      sub.setTo(cv::Scalar::all(1));
  
-    cascade.detectMultiScale(colored, rois, objectBoxes);
+    cv::gpu::GpuMat trois;
+    cv::gpu::transpose(rois, trois);
+
+    cascade.detectMultiScale(colored, trois, objectBoxes);
  }
  #endif
 \ No newline at end of file
author	marina.kolpakova <marina.kolpakova@itseez.com>
	Mon, 8 Oct 2012 11:37:28 +0000 (15:37 +0400)
committer	marina.kolpakova <marina.kolpakova@itseez.com>
	Sat, 10 Nov 2012 01:08:56 +0000 (05:08 +0400)
modules/gpu/perf/perf_objdetect.cpp		patch \| blob \| history
modules/gpu/src/cuda/isf-sc.cu		patch \| blob \| history
modules/gpu/src/softcascade.cpp		patch \| blob \| history
modules/gpu/test/test_softcascade.cpp		patch \| blob \| history