modules/gpu/src/cuda/integral_image.cu

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or bpied warranties, including, but not limited to, the bpied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #if !defined CUDA_DISABLER
  44
  45 #include "opencv2/gpu/device/common.hpp"
  46
  47 namespace cv { namespace gpu { namespace device
  48 {
  49     namespace imgproc
  50     {
  51         // Utility function to extract unsigned chars from an unsigned integer
  52         __device__ uchar4 int_to_uchar4(unsigned int in)
  53         {
  54             uchar4 bytes;
  55             bytes.x = (in & 0x000000ff) >>  0;
  56             bytes.y = (in & 0x0000ff00) >>  8;
  57             bytes.z = (in & 0x00ff0000) >> 16;
  58             bytes.w = (in & 0xff000000) >> 24;
  59             return bytes;
  60         }
  61
  62         __global__ void shfl_integral_horizontal(const PtrStep<uint4> img, PtrStep<uint4> integral)
  63         {
  64         #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
  65             __shared__ int sums[128];
  66
  67             const int id = threadIdx.x;
  68             const int lane_id = id % warpSize;
  69             const int warp_id = id / warpSize;
  70
  71             const uint4 data = img(blockIdx.x, id);
  72
  73             const uchar4 a = int_to_uchar4(data.x);
  74             const uchar4 b = int_to_uchar4(data.y);
  75             const uchar4 c = int_to_uchar4(data.z);
  76             const uchar4 d = int_to_uchar4(data.w);
  77
  78             int result[16];
  79
  80             result[0]  =              a.x;
  81             result[1]  = result[0]  + a.y;
  82             result[2]  = result[1]  + a.z;
  83             result[3]  = result[2]  + a.w;
  84
  85             result[4]  = result[3]  + b.x;
  86             result[5]  = result[4]  + b.y;
  87             result[6]  = result[5]  + b.z;
  88             result[7]  = result[6]  + b.w;
  89
  90             result[8]  = result[7]  + c.x;
  91             result[9]  = result[8]  + c.y;
  92             result[10] = result[9]  + c.z;
  93             result[11] = result[10] + c.w;
  94
  95             result[12] = result[11] + d.x;
  96             result[13] = result[12] + d.y;
  97             result[14] = result[13] + d.z;
  98             result[15] = result[14] + d.w;
  99
 100             int sum = result[15];
 101
 102             // the prefix sum for each thread's 16 value is computed,
 103             // now the final sums (result[15]) need to be shared
 104             // with the other threads and add.  To do this,
 105             // the __shfl_up() instruction is used and a shuffle scan
 106             // operation is performed to distribute the sums to the correct
 107             // threads
 108             #pragma unroll
 109             for (int i = 1; i < 32; i *= 2)
 110             {
 111                 const int n = __shfl_up(sum, i, 32);
 112
 113                 if (lane_id >= i)
 114                 {
 115                     #pragma unroll
 116                     for (int i = 0; i < 16; ++i)
 117                         result[i] += n;
 118
 119                     sum += n;
 120                 }
 121             }
 122
 123             // Now the final sum for the warp must be shared
 124             // between warps.  This is done by each warp
 125             // having a thread store to shared memory, then
 126             // having some other warp load the values and
 127             // compute a prefix sum, again by using __shfl_up.
 128             // The results are uniformly added back to the warps.
 129             // last thread in the warp holding sum of the warp
 130             // places that in shared
 131             if (threadIdx.x % warpSize == warpSize - 1)
 132                 sums[warp_id] = result[15];
 133
 134             __syncthreads();
 135
 136             if (warp_id == 0)
 137             {
 138                 int warp_sum = sums[lane_id];
 139
 140                 #pragma unroll
 141                 for (int i = 1; i <= 32; i *= 2)
 142                 {
 143                     const int n = __shfl_up(warp_sum, i, 32);
 144
 145                     if (lane_id >= i)
 146                         warp_sum += n;
 147                 }
 148
 149                 sums[lane_id] = warp_sum;
 150             }
 151
 152             __syncthreads();
 153
 154             int blockSum = 0;
 155
 156             // fold in unused warp
 157             if (warp_id > 0)
 158             {
 159                 blockSum = sums[warp_id - 1];
 160
 161                 #pragma unroll
 162                 for (int i = 0; i < 16; ++i)
 163                     result[i] += blockSum;
 164             }
 165
 166             // assemble result
 167             // Each thread has 16 values to write, which are
 168             // now integer data (to avoid overflow).  Instead of
 169             // each thread writing consecutive uint4s, the
 170             // approach shown here experiments using
 171             // the shuffle command to reformat the data
 172             // inside the registers so that each thread holds
 173             // consecutive data to be written so larger contiguous
 174             // segments can be assembled for writing.
 175
 176             /*
 177                 For example data that needs to be written as
 178
 179                 GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
 180                 but is stored in registers (r0..r3), in four threads (0..3) as:
 181
 182                 threadId   0  1  2  3
 183                   r0      x0 y0 z0 w0
 184                   r1      x1 y1 z1 w1
 185                   r2      x2 y2 z2 w2
 186                   r3      x3 y3 z3 w3
 187
 188                   after apply __shfl_xor operations to move data between registers r1..r3:
 189
 190                 threadId  00 01 10 11
 191                           x0 y0 z0 w0
 192                  xor(01)->y1 x1 w1 z1
 193                  xor(10)->z2 w2 x2 y2
 194                  xor(11)->w3 z3 y3 x3
 195
 196                  and now x0..x3, and z0..z3 can be written out in order by all threads.
 197
 198                  In the current code, each register above is actually representing
 199                  four integers to be written as uint4's to GMEM.
 200             */
 201
 202             result[4]  = __shfl_xor(result[4] , 1, 32);
 203             result[5]  = __shfl_xor(result[5] , 1, 32);
 204             result[6]  = __shfl_xor(result[6] , 1, 32);
 205             result[7]  = __shfl_xor(result[7] , 1, 32);
 206
 207             result[8]  = __shfl_xor(result[8] , 2, 32);
 208             result[9]  = __shfl_xor(result[9] , 2, 32);
 209             result[10] = __shfl_xor(result[10], 2, 32);
 210             result[11] = __shfl_xor(result[11], 2, 32);
 211
 212             result[12] = __shfl_xor(result[12], 3, 32);
 213             result[13] = __shfl_xor(result[13], 3, 32);
 214             result[14] = __shfl_xor(result[14], 3, 32);
 215             result[15] = __shfl_xor(result[15], 3, 32);
 216
 217             uint4* integral_row = integral.ptr(blockIdx.x);
 218             uint4 output;
 219
 220             ///////
 221
 222             if (threadIdx.x % 4 == 0)
 223                 output = make_uint4(result[0], result[1], result[2], result[3]);
 224
 225             if (threadIdx.x % 4 == 1)
 226                 output = make_uint4(result[4], result[5], result[6], result[7]);
 227
 228             if (threadIdx.x % 4 == 2)
 229                 output = make_uint4(result[8], result[9], result[10], result[11]);
 230
 231             if (threadIdx.x % 4 == 3)
 232                 output = make_uint4(result[12], result[13], result[14], result[15]);
 233
 234             integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
 235
 236             ///////
 237
 238             if (threadIdx.x % 4 == 2)
 239                 output = make_uint4(result[0], result[1], result[2], result[3]);
 240
 241             if (threadIdx.x % 4 == 3)
 242                 output = make_uint4(result[4], result[5], result[6], result[7]);
 243
 244             if (threadIdx.x % 4 == 0)
 245                 output = make_uint4(result[8], result[9], result[10], result[11]);
 246
 247             if (threadIdx.x % 4 == 1)
 248                 output = make_uint4(result[12], result[13], result[14], result[15]);
 249
 250             integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
 251
 252             // continuning from the above example,
 253             // this use of __shfl_xor() places the y0..y3 and w0..w3 data
 254             // in order.
 255
 256             #pragma unroll
 257             for (int i = 0; i < 16; ++i)
 258                 result[i] = __shfl_xor(result[i], 1, 32);
 259
 260             if (threadIdx.x % 4 == 0)
 261                 output = make_uint4(result[0], result[1], result[2], result[3]);
 262
 263             if (threadIdx.x % 4 == 1)
 264                 output = make_uint4(result[4], result[5], result[6], result[7]);
 265
 266             if (threadIdx.x % 4 == 2)
 267                 output = make_uint4(result[8], result[9], result[10], result[11]);
 268
 269             if (threadIdx.x % 4 == 3)
 270                 output = make_uint4(result[12], result[13], result[14], result[15]);
 271
 272             integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
 273
 274             ///////
 275
 276             if (threadIdx.x % 4 == 2)
 277                 output = make_uint4(result[0], result[1], result[2], result[3]);
 278
 279             if (threadIdx.x % 4 == 3)
 280                 output = make_uint4(result[4], result[5], result[6], result[7]);
 281
 282             if (threadIdx.x % 4 == 0)
 283                 output = make_uint4(result[8], result[9], result[10], result[11]);
 284
 285             if (threadIdx.x % 4 == 1)
 286                 output = make_uint4(result[12], result[13], result[14], result[15]);
 287
 288             integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
 289         #endif
 290         }
 291
 292         // This kernel computes columnwise prefix sums.  When the data input is
 293         // the row sums from above, this completes the integral image.
 294         // The approach here is to have each block compute a local set of sums.
 295         // First , the data covered by the block is loaded into shared memory,
 296         // then instead of performing a sum in shared memory using __syncthreads
 297         // between stages, the data is reformatted so that the necessary sums
 298         // occur inside warps and the shuffle scan operation is used.
 299         // The final set of sums from the block is then propgated, with the block
 300         // computing "down" the image and adding the running sum to the local
 301         // block sums.
 302         __global__ void shfl_integral_vertical(PtrStepSz<unsigned int> integral)
 303         {
 304         #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
 305             __shared__ unsigned int sums[32][9];
 306
 307             const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
 308             const int lane_id = tidx % 8;
 309
 310             if (tidx >= integral.cols)
 311                 return;
 312
 313             sums[threadIdx.x][threadIdx.y] = 0;
 314             __syncthreads();
 315
 316             unsigned int stepSum = 0;
 317
 318             for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
 319             {
 320                 unsigned int* p = integral.ptr(y) + tidx;
 321
 322                 unsigned int sum = *p;
 323
 324                 sums[threadIdx.x][threadIdx.y] = sum;
 325                 __syncthreads();
 326
 327                 // place into SMEM
 328                 // shfl scan reduce the SMEM, reformating so the column
 329                 // sums are computed in a warp
 330                 // then read out properly
 331                 const int j = threadIdx.x % 8;
 332                 const int k = threadIdx.x / 8 + threadIdx.y * 4;
 333
 334                 int partial_sum = sums[k][j];
 335
 336                 for (int i = 1; i <= 8; i *= 2)
 337                 {
 338                     int n = __shfl_up(partial_sum, i, 32);
 339
 340                     if (lane_id >= i)
 341                         partial_sum += n;
 342                 }
 343
 344                 sums[k][j] = partial_sum;
 345                 __syncthreads();
 346
 347                 if (threadIdx.y > 0)
 348                     sum += sums[threadIdx.x][threadIdx.y - 1];
 349
 350                 sum += stepSum;
 351                 stepSum += sums[threadIdx.x][blockDim.y - 1];
 352
 353                 __syncthreads();
 354
 355                 *p = sum;
 356             }
 357         #endif
 358         }
 359
 360         void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
 361         {
 362             {
 363                 // each thread handles 16 values, use 1 block/row
 364                 // save, becouse step is actually can't be less 512 bytes
 365                 int block = integral.cols / 16;
 366
 367                 // launch 1 block / row
 368                 const int grid = img.rows;
 369
 370                 cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
 371
 372                 shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
 373                 cudaSafeCall( cudaGetLastError() );
 374             }
 375
 376             {
 377                 const dim3 block(32, 8);
 378                 const dim3 grid(divUp(integral.cols, block.x), 1);
 379
 380                 shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
 381                 cudaSafeCall( cudaGetLastError() );
 382             }
 383
 384             if (stream == 0)
 385                 cudaSafeCall( cudaDeviceSynchronize() );
 386         }
 387     }
 388 }}}
 389
 390 #endif /* CUDA_DISABLER */