\r
const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;\r
\r
- //Upper halo\r
- #pragma unroll\r
- for (int j = 0; j < HALO_SIZE; ++j)\r
- smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));\r
+ if (blockIdx.y > 0)\r
+ {\r
+ //Upper halo\r
+ #pragma unroll\r
+ for (int j = 0; j < HALO_SIZE; ++j)\r
+ smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x));\r
+ }\r
+ else\r
+ {\r
+ //Upper halo\r
+ #pragma unroll\r
+ for (int j = 0; j < HALO_SIZE; ++j)\r
+ smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));\r
+ }\r
\r
- //Main data\r
- #pragma unroll\r
- for (int j = 0; j < PATCH_PER_BLOCK; ++j)\r
- smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));\r
+ if (blockIdx.y + 2 < gridDim.y)\r
+ {\r
+ //Main data\r
+ #pragma unroll\r
+ for (int j = 0; j < PATCH_PER_BLOCK; ++j)\r
+ smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x));\r
\r
- //Lower halo\r
- #pragma unroll\r
- for (int j = 0; j < HALO_SIZE; ++j)\r
- smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));\r
+ //Lower halo\r
+ #pragma unroll\r
+ for (int j = 0; j < HALO_SIZE; ++j)\r
+ smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x));\r
+ }\r
+ else\r
+ {\r
+ //Main data\r
+ #pragma unroll\r
+ for (int j = 0; j < PATCH_PER_BLOCK; ++j)\r
+ smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));\r
+\r
+ //Lower halo\r
+ #pragma unroll\r
+ for (int j = 0; j < HALO_SIZE; ++j)\r
+ smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));\r
+ }\r
\r
__syncthreads();\r
\r
//\r
//M*/\r
\r
-#include "internal_shared.hpp"\r
+#include "opencv2/gpu/device/common.hpp"\r
#include "opencv2/gpu/device/border_interpolate.hpp"\r
#include "opencv2/gpu/device/vec_traits.hpp"\r
#include "opencv2/gpu/device/vec_math.hpp"\r
{\r
namespace imgproc\r
{\r
- template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)\r
+ template <typename T, typename B> __global__ void pyrDown(const PtrStepSz<T> src, PtrStep<T> dst, const B b, int dst_cols)\r
{\r
- typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;\r
+ typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_t;\r
+\r
+ __shared__ work_t smem[256 + 4];\r
\r
const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
const int y = blockIdx.y;\r
\r
- __shared__ value_type smem[256 + 4];\r
+ const int src_y = 2 * y;\r
\r
- value_type sum;\r
+ if (src_y >= 2 && src_y < src.rows - 2 && x >= 2 && x < src.cols - 2)\r
+ {\r
+ {\r
+ work_t sum;\r
\r
- const int src_y = 2*y;\r
+ sum = 0.0625f * src(src_y - 2, x);\r
+ sum = sum + 0.25f * src(src_y - 1, x);\r
+ sum = sum + 0.375f * src(src_y , x);\r
+ sum = sum + 0.25f * src(src_y + 1, x);\r
+ sum = sum + 0.0625f * src(src_y + 2, x);\r
\r
- sum = VecTraits<value_type>::all(0);\r
+ smem[2 + threadIdx.x] = sum;\r
+ }\r
\r
- sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);\r
- sum = sum + 0.25f * b.at(src_y - 1, x, src.data, src.step);\r
- sum = sum + 0.375f * b.at(src_y , x, src.data, src.step);\r
- sum = sum + 0.25f * b.at(src_y + 1, x, src.data, src.step);\r
- sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);\r
+ if (threadIdx.x < 2)\r
+ {\r
+ const int left_x = x - 2;\r
\r
- smem[2 + threadIdx.x] = sum;\r
+ work_t sum;\r
\r
- if (threadIdx.x < 2)\r
- {\r
- const int left_x = x - 2;\r
+ sum = 0.0625f * src(src_y - 2, left_x);\r
+ sum = sum + 0.25f * src(src_y - 1, left_x);\r
+ sum = sum + 0.375f * src(src_y , left_x);\r
+ sum = sum + 0.25f * src(src_y + 1, left_x);\r
+ sum = sum + 0.0625f * src(src_y + 2, left_x);\r
\r
- sum = VecTraits<value_type>::all(0);\r
+ smem[threadIdx.x] = sum;\r
+ }\r
\r
- sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);\r
- sum = sum + 0.25f * b.at(src_y - 1, left_x, src.data, src.step);\r
- sum = sum + 0.375f * b.at(src_y , left_x, src.data, src.step);\r
- sum = sum + 0.25f * b.at(src_y + 1, left_x, src.data, src.step);\r
- sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);\r
+ if (threadIdx.x > 253)\r
+ {\r
+ const int right_x = x + 2;\r
\r
- smem[threadIdx.x] = sum;\r
- }\r
+ work_t sum;\r
\r
- if (threadIdx.x > 253)\r
+ sum = 0.0625f * src(src_y - 2, right_x);\r
+ sum = sum + 0.25f * src(src_y - 1, right_x);\r
+ sum = sum + 0.375f * src(src_y , right_x);\r
+ sum = sum + 0.25f * src(src_y + 1, right_x);\r
+ sum = sum + 0.0625f * src(src_y + 2, right_x);\r
+\r
+ smem[4 + threadIdx.x] = sum;\r
+ }\r
+ }\r
+ else\r
{\r
- const int right_x = x + 2;\r
+ {\r
+ work_t sum;\r
+\r
+ sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(x));\r
+ sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(x));\r
+ sum = sum + 0.375f * src(src_y , b.idx_col_high(x));\r
+ sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(x));\r
+ sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(x));\r
+\r
+ smem[2 + threadIdx.x] = sum;\r
+ }\r
+\r
+ if (threadIdx.x < 2)\r
+ {\r
+ const int left_x = x - 2;\r
+\r
+ work_t sum;\r
+\r
+ sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col(left_x));\r
+ sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col(left_x));\r
+ sum = sum + 0.375f * src(src_y , b.idx_col(left_x));\r
+ sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col(left_x));\r
+ sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col(left_x));\r
+\r
+ smem[threadIdx.x] = sum;\r
+ }\r
+\r
+ if (threadIdx.x > 253)\r
+ {\r
+ const int right_x = x + 2;\r
\r
- sum = VecTraits<value_type>::all(0);\r
+ work_t sum;\r
\r
- sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);\r
- sum = sum + 0.25f * b.at(src_y - 1, right_x, src.data, src.step);\r
- sum = sum + 0.375f * b.at(src_y , right_x, src.data, src.step);\r
- sum = sum + 0.25f * b.at(src_y + 1, right_x, src.data, src.step);\r
- sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);\r
+ sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(right_x));\r
+ sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(right_x));\r
+ sum = sum + 0.375f * src(src_y , b.idx_col_high(right_x));\r
+ sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(right_x));\r
+ sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(right_x));\r
\r
- smem[4 + threadIdx.x] = sum;\r
+ smem[4 + threadIdx.x] = sum;\r
+ }\r
}\r
\r
__syncthreads();\r
{\r
const int tid2 = threadIdx.x * 2;\r
\r
- sum = VecTraits<value_type>::all(0);\r
+ work_t sum;\r
\r
- sum = sum + 0.0625f * smem[2 + tid2 - 2];\r
+ sum = 0.0625f * smem[2 + tid2 - 2];\r
sum = sum + 0.25f * smem[2 + tid2 - 1];\r
sum = sum + 0.375f * smem[2 + tid2 ];\r
sum = sum + 0.25f * smem[2 + tid2 + 1];\r
\r
const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;\r
\r
- //Load left halo\r
- #pragma unroll\r
- for (int j = 0; j < HALO_SIZE; ++j)\r
- smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));\r
+ if (blockIdx.x > 0)\r
+ {\r
+ //Load left halo\r
+ #pragma unroll\r
+ for (int j = 0; j < HALO_SIZE; ++j)\r
+ smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]);\r
+ }\r
+ else\r
+ {\r
+ //Load left halo\r
+ #pragma unroll\r
+ for (int j = 0; j < HALO_SIZE; ++j)\r
+ smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));\r
+ }\r
\r
- //Load main data\r
- #pragma unroll\r
- for (int j = 0; j < PATCH_PER_BLOCK; ++j)\r
- smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));\r
+ if (blockIdx.x + 2 < gridDim.x)\r
+ {\r
+ //Load main data\r
+ #pragma unroll\r
+ for (int j = 0; j < PATCH_PER_BLOCK; ++j)\r
+ smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]);\r
\r
- //Load right halo\r
- #pragma unroll\r
- for (int j = 0; j < HALO_SIZE; ++j)\r
- smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));\r
+ //Load right halo\r
+ #pragma unroll\r
+ for (int j = 0; j < HALO_SIZE; ++j)\r
+ smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]);\r
+ }\r
+ else\r
+ {\r
+ //Load main data\r
+ #pragma unroll\r
+ for (int j = 0; j < PATCH_PER_BLOCK; ++j)\r
+ smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));\r
+\r
+ //Load right halo\r
+ #pragma unroll\r
+ for (int j = 0; j < HALO_SIZE; ++j)\r
+ smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));\r
+ }\r
\r
__syncthreads();\r
\r