// modules/gpu/src/cuda/pyr_down.cu
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace imgproc
    {
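        // pyrDown: each block handles 256 consecutive source columns of one destination row.
        // Every thread applies the vertical 5-tap Gaussian (1, 4, 6, 4, 1) / 16 to its source
        // column and stores the result in shared memory (with a 2-column halo on each side);
        // threads 0..127 then apply the same filter horizontally at every second position
        // and each write one downsampled pixel of the destination row.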
        template <typename T, typename B> __global__ void pyrDown(const PtrStepSz<T> src, PtrStep<T> dst, const B b, int dst_cols)
        {
            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_t;

            // vertically filtered values for the block's 256 columns plus a 2-column halo on each side
            __shared__ work_t smem[256 + 4];

            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y;

            const int src_y = 2 * y;

            if (src_y >= 2 && src_y < src.rows - 2 && x >= 2 && x < src.cols - 2)
            {
                // fast path: the 5-tap support stays inside the image, so no border handling is needed
                {
                    work_t sum;

                    sum =       0.0625f * src(src_y - 2, x);
                    sum = sum + 0.25f   * src(src_y - 1, x);
                    sum = sum + 0.375f  * src(src_y    , x);
                    sum = sum + 0.25f   * src(src_y + 1, x);
                    sum = sum + 0.0625f * src(src_y + 2, x);

                    smem[2 + threadIdx.x] = sum;
                }

                // the first two threads also fill the left halo columns
                if (threadIdx.x < 2)
                {
                    const int left_x = x - 2;

                    work_t sum;

                    sum =       0.0625f * src(src_y - 2, left_x);
                    sum = sum + 0.25f   * src(src_y - 1, left_x);
                    sum = sum + 0.375f  * src(src_y    , left_x);
                    sum = sum + 0.25f   * src(src_y + 1, left_x);
                    sum = sum + 0.0625f * src(src_y + 2, left_x);

                    smem[threadIdx.x] = sum;
                }

                // the last two threads fill the right halo columns
                if (threadIdx.x > 253)
                {
                    const int right_x = x + 2;

                    work_t sum;

                    sum =       0.0625f * src(src_y - 2, right_x);
                    sum = sum + 0.25f   * src(src_y - 1, right_x);
                    sum = sum + 0.375f  * src(src_y    , right_x);
                    sum = sum + 0.25f   * src(src_y + 1, right_x);
                    sum = sum + 0.0625f * src(src_y + 2, right_x);

                    smem[4 + threadIdx.x] = sum;
                }
            }
            else
            {
                // border path: row and column indices may fall outside the image
                // and are remapped by the border interpolator b
                {
                    work_t sum;

                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(x));
                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col_high(x));
                    sum = sum + 0.375f  * src(src_y                    , b.idx_col_high(x));
                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col_high(x));
                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(x));

                    smem[2 + threadIdx.x] = sum;
                }

                if (threadIdx.x < 2)
                {
                    const int left_x = x - 2;

                    work_t sum;

                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col(left_x));
                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col(left_x));
                    sum = sum + 0.375f  * src(src_y                    , b.idx_col(left_x));
                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col(left_x));
                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col(left_x));

                    smem[threadIdx.x] = sum;
                }

                if (threadIdx.x > 253)
                {
                    const int right_x = x + 2;

                    work_t sum;

                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(right_x));
                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col_high(right_x));
                    sum = sum + 0.375f  * src(src_y                    , b.idx_col_high(right_x));
                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col_high(right_x));
                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(right_x));

                    smem[4 + threadIdx.x] = sum;
                }
            }

            __syncthreads();

            if (threadIdx.x < 128)
            {
                // horizontal 5-tap pass over the shared row: thread i filters around
                // shared column 2*i and writes destination column blockIdx.x * 128 + i
                const int tid2 = threadIdx.x * 2;

                work_t sum;

                sum =       0.0625f * smem[2 + tid2 - 2];
                sum = sum + 0.25f   * smem[2 + tid2 - 1];
                sum = sum + 0.375f  * smem[2 + tid2    ];
                sum = sum + 0.25f   * smem[2 + tid2 + 1];
                sum = sum + 0.0625f * smem[2 + tid2 + 2];

                const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;

                if (dst_x < dst_cols)
                    dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
            }
        }

        template <typename T, template <typename> class B> void pyrDown_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
        {
            // one 256-thread block per 256 source columns, one grid row per destination row
            const dim3 block(256);
            const dim3 grid(divUp(src.cols, block.x), dst.rows);

            B<T> b(src.rows, src.cols);

            pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        // host-callable entry point; always extrapolates with reflect-101 borders
        template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
        {
            pyrDown_caller<T, BrdReflect101>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
        }

        // explicit instantiations; type/channel combinations that are not used are left commented out
        template void pyrDown_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrDown_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

        //template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrDown_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

        template void pyrDown_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrDown_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        template void pyrDown_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        template void pyrDown_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

        template void pyrDown_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrDown_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

        //template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

        template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        template void pyrDown_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        template void pyrDown_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    } // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device


#endif /* CUDA_DISABLER */
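
// A minimal host-side usage sketch, assuming the OpenCV 2.4 gpu module API:
// cv::gpu::pyrDown() dispatches to the pyrDown_gpu<T> instantiation that matches the
// input type. The file names below are illustrative only.
//
//   #include <opencv2/core/core.hpp>
//   #include <opencv2/highgui/highgui.hpp>
//   #include <opencv2/gpu/gpu.hpp>
//
//   int main()
//   {
//       cv::Mat h_src = cv::imread("input.png");   // e.g. an 8UC3 image
//       cv::gpu::GpuMat d_src(h_src), d_dst;       // upload to the GPU
//
//       cv::gpu::pyrDown(d_src, d_dst);            // d_dst is about half the size,
//                                                  // Gaussian-filtered as above
//
//       cv::Mat h_dst(d_dst);                      // download the result
//       cv::imwrite("output.png", h_dst);
//       return 0;
//   }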