optimized gpu pyrDown and sepFilter2D
authorVladislav Vinogradov <vlad.vinogradov@itseez.com>
Thu, 13 Sep 2012 06:21:43 +0000 (10:21 +0400)
committerVladislav Vinogradov <vlad.vinogradov@itseez.com>
Thu, 13 Sep 2012 06:21:43 +0000 (10:21 +0400)
modules/gpu/perf/perf_imgproc.cpp
modules/gpu/src/cuda/column_filter.cu
modules/gpu/src/cuda/pyr_down.cu
modules/gpu/src/cuda/row_filter.cu

index 9769bfa..4c1190a 100644 (file)
@@ -1720,7 +1720,7 @@ CV_FLAGS(GHMethod, cv::GHT_POSITION, cv::GHT_SCALE, cv::GHT_ROTATION);
 \r
 DEF_PARAM_TEST(Method_Sz, GHMethod, cv::Size);\r
 \r
-PERF_TEST_P(Method_Sz, GeneralizedHough, Combine(\r
+PERF_TEST_P(Method_Sz, ImgProc_GeneralizedHough, Combine(\r
             Values(GHMethod(cv::GHT_POSITION), GHMethod(cv::GHT_POSITION | cv::GHT_SCALE), GHMethod(cv::GHT_POSITION | cv::GHT_ROTATION), GHMethod(cv::GHT_POSITION | cv::GHT_SCALE | cv::GHT_ROTATION)),\r
             GPU_TYPICAL_MAT_SIZES))\r
 {\r
index 307e87a..f283bf6 100644 (file)
@@ -89,20 +89,45 @@ namespace cv { namespace gpu { namespace device
 \r
             const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;\r
 \r
-            //Upper halo\r
-            #pragma unroll\r
-            for (int j = 0; j < HALO_SIZE; ++j)\r
-                smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));\r
+            if (blockIdx.y > 0)\r
+            {\r
+                //Upper halo\r
+                #pragma unroll\r
+                for (int j = 0; j < HALO_SIZE; ++j)\r
+                    smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x));\r
+            }\r
+            else\r
+            {\r
+                //Upper halo\r
+                #pragma unroll\r
+                for (int j = 0; j < HALO_SIZE; ++j)\r
+                    smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));\r
+            }\r
 \r
-            //Main data\r
-            #pragma unroll\r
-            for (int j = 0; j < PATCH_PER_BLOCK; ++j)\r
-                smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));\r
+            if (blockIdx.y + 2 < gridDim.y)\r
+            {\r
+                //Main data\r
+                #pragma unroll\r
+                for (int j = 0; j < PATCH_PER_BLOCK; ++j)\r
+                    smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x));\r
 \r
-            //Lower halo\r
-            #pragma unroll\r
-            for (int j = 0; j < HALO_SIZE; ++j)\r
-                smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));\r
+                //Lower halo\r
+                #pragma unroll\r
+                for (int j = 0; j < HALO_SIZE; ++j)\r
+                    smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x));\r
+            }\r
+            else\r
+            {\r
+                //Main data\r
+                #pragma unroll\r
+                for (int j = 0; j < PATCH_PER_BLOCK; ++j)\r
+                    smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));\r
+\r
+                //Lower halo\r
+                #pragma unroll\r
+                for (int j = 0; j < HALO_SIZE; ++j)\r
+                    smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));\r
+            }\r
 \r
             __syncthreads();\r
 \r
index 12c485f..ca9a598 100644 (file)
@@ -40,7 +40,7 @@
 //\r
 //M*/\r
 \r
-#include "internal_shared.hpp"\r
+#include "opencv2/gpu/device/common.hpp"\r
 #include "opencv2/gpu/device/border_interpolate.hpp"\r
 #include "opencv2/gpu/device/vec_traits.hpp"\r
 #include "opencv2/gpu/device/vec_math.hpp"\r
@@ -50,57 +50,104 @@ namespace cv { namespace gpu { namespace device
 {\r
     namespace imgproc\r
     {\r
-        template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)\r
+        template <typename T, typename B> __global__ void pyrDown(const PtrStepSz<T> src, PtrStep<T> dst, const B b, int dst_cols)\r
         {\r
-            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;\r
+            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_t;\r
+\r
+            __shared__ work_t smem[256 + 4];\r
 \r
             const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
             const int y = blockIdx.y;\r
 \r
-            __shared__ value_type smem[256 + 4];\r
+            const int src_y = 2 * y;\r
 \r
-            value_type sum;\r
+            if (src_y >= 2 && src_y < src.rows - 2 && x >= 2 && x < src.cols - 2)\r
+            {\r
+                {\r
+                    work_t sum;\r
 \r
-            const int src_y = 2*y;\r
+                    sum =       0.0625f * src(src_y - 2, x);\r
+                    sum = sum + 0.25f   * src(src_y - 1, x);\r
+                    sum = sum + 0.375f  * src(src_y    , x);\r
+                    sum = sum + 0.25f   * src(src_y + 1, x);\r
+                    sum = sum + 0.0625f * src(src_y + 2, x);\r
 \r
-            sum = VecTraits<value_type>::all(0);\r
+                    smem[2 + threadIdx.x] = sum;\r
+                }\r
 \r
-            sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);\r
-            sum = sum + 0.25f   * b.at(src_y - 1, x, src.data, src.step);\r
-            sum = sum + 0.375f  * b.at(src_y    , x, src.data, src.step);\r
-            sum = sum + 0.25f   * b.at(src_y + 1, x, src.data, src.step);\r
-            sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);\r
+                if (threadIdx.x < 2)\r
+                {\r
+                    const int left_x = x - 2;\r
 \r
-            smem[2 + threadIdx.x] = sum;\r
+                    work_t sum;\r
 \r
-            if (threadIdx.x < 2)\r
-            {\r
-                const int left_x = x - 2;\r
+                    sum =       0.0625f * src(src_y - 2, left_x);\r
+                    sum = sum + 0.25f   * src(src_y - 1, left_x);\r
+                    sum = sum + 0.375f  * src(src_y    , left_x);\r
+                    sum = sum + 0.25f   * src(src_y + 1, left_x);\r
+                    sum = sum + 0.0625f * src(src_y + 2, left_x);\r
 \r
-                sum = VecTraits<value_type>::all(0);\r
+                    smem[threadIdx.x] = sum;\r
+                }\r
 \r
-                sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);\r
-                sum = sum + 0.25f   * b.at(src_y - 1, left_x, src.data, src.step);\r
-                sum = sum + 0.375f  * b.at(src_y    , left_x, src.data, src.step);\r
-                sum = sum + 0.25f   * b.at(src_y + 1, left_x, src.data, src.step);\r
-                sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);\r
+                if (threadIdx.x > 253)\r
+                {\r
+                    const int right_x = x + 2;\r
 \r
-                smem[threadIdx.x] = sum;\r
-            }\r
+                    work_t sum;\r
 \r
-            if (threadIdx.x > 253)\r
+                    sum =       0.0625f * src(src_y - 2, right_x);\r
+                    sum = sum + 0.25f   * src(src_y - 1, right_x);\r
+                    sum = sum + 0.375f  * src(src_y    , right_x);\r
+                    sum = sum + 0.25f   * src(src_y + 1, right_x);\r
+                    sum = sum + 0.0625f * src(src_y + 2, right_x);\r
+\r
+                    smem[4 + threadIdx.x] = sum;\r
+                }\r
+            }\r
+            else\r
             {\r
-                const int right_x = x + 2;\r
+                {\r
+                    work_t sum;\r
+\r
+                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(x));\r
+                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col_high(x));\r
+                    sum = sum + 0.375f  * src(src_y                    , b.idx_col_high(x));\r
+                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col_high(x));\r
+                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(x));\r
+\r
+                    smem[2 + threadIdx.x] = sum;\r
+                }\r
+\r
+                if (threadIdx.x < 2)\r
+                {\r
+                    const int left_x = x - 2;\r
+\r
+                    work_t sum;\r
+\r
+                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col(left_x));\r
+                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col(left_x));\r
+                    sum = sum + 0.375f  * src(src_y                    , b.idx_col(left_x));\r
+                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col(left_x));\r
+                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col(left_x));\r
+\r
+                    smem[threadIdx.x] = sum;\r
+                }\r
+\r
+                if (threadIdx.x > 253)\r
+                {\r
+                    const int right_x = x + 2;\r
 \r
-                sum = VecTraits<value_type>::all(0);\r
+                    work_t sum;\r
 \r
-                sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);\r
-                sum = sum + 0.25f   * b.at(src_y - 1, right_x, src.data, src.step);\r
-                sum = sum + 0.375f  * b.at(src_y    , right_x, src.data, src.step);\r
-                sum = sum + 0.25f   * b.at(src_y + 1, right_x, src.data, src.step);\r
-                sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);\r
+                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(right_x));\r
+                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col_high(right_x));\r
+                    sum = sum + 0.375f  * src(src_y                    , b.idx_col_high(right_x));\r
+                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col_high(right_x));\r
+                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(right_x));\r
 \r
-                smem[4 + threadIdx.x] = sum;\r
+                    smem[4 + threadIdx.x] = sum;\r
+                }\r
             }\r
 \r
             __syncthreads();\r
@@ -109,9 +156,9 @@ namespace cv { namespace gpu { namespace device
             {\r
                 const int tid2 = threadIdx.x * 2;\r
 \r
-                sum = VecTraits<value_type>::all(0);\r
+                work_t sum;\r
 \r
-                sum = sum + 0.0625f * smem[2 + tid2 - 2];\r
+                sum =       0.0625f * smem[2 + tid2 - 2];\r
                 sum = sum + 0.25f   * smem[2 + tid2 - 1];\r
                 sum = sum + 0.375f  * smem[2 + tid2    ];\r
                 sum = sum + 0.25f   * smem[2 + tid2 + 1];\r
index 8d96c79..5b3d044 100644 (file)
@@ -89,20 +89,45 @@ namespace cv { namespace gpu { namespace device
 \r
             const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;\r
 \r
-            //Load left halo\r
-            #pragma unroll\r
-            for (int j = 0; j < HALO_SIZE; ++j)\r
-                smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));\r
+            if (blockIdx.x > 0)\r
+            {\r
+                //Load left halo\r
+                #pragma unroll\r
+                for (int j = 0; j < HALO_SIZE; ++j)\r
+                    smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]);\r
+            }\r
+            else\r
+            {\r
+                //Load left halo\r
+                #pragma unroll\r
+                for (int j = 0; j < HALO_SIZE; ++j)\r
+                    smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));\r
+            }\r
 \r
-            //Load main data\r
-            #pragma unroll\r
-            for (int j = 0; j < PATCH_PER_BLOCK; ++j)\r
-                smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));\r
+            if (blockIdx.x + 2 < gridDim.x)\r
+            {\r
+                //Load main data\r
+                #pragma unroll\r
+                for (int j = 0; j < PATCH_PER_BLOCK; ++j)\r
+                    smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]);\r
 \r
-            //Load right halo\r
-            #pragma unroll\r
-            for (int j = 0; j < HALO_SIZE; ++j)\r
-                smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));\r
+                //Load right halo\r
+                #pragma unroll\r
+                for (int j = 0; j < HALO_SIZE; ++j)\r
+                    smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]);\r
+            }\r
+            else\r
+            {\r
+                //Load main data\r
+                #pragma unroll\r
+                for (int j = 0; j < PATCH_PER_BLOCK; ++j)\r
+                    smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));\r
+\r
+                //Load right halo\r
+                #pragma unroll\r
+                for (int j = 0; j < HALO_SIZE; ++j)\r
+                    smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));\r
+            }\r
 \r
             __syncthreads();\r
 \r