/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/core/cuda_types.hpp"
#include "opencv2/core/cuda/common.hpp"

namespace cv { namespace softcascade { namespace cudev
{
    typedef unsigned int uint;
    typedef unsigned short ushort;
    // Utility function to extract unsigned chars from an unsigned integer
    __device__ uchar4 int_to_uchar4(unsigned int in)
    {
        uchar4 bytes;
        bytes.x = (in & 0x000000ff) >> 0;
        bytes.y = (in & 0x0000ff00) >> 8;
        bytes.z = (in & 0x00ff0000) >> 16;
        bytes.w = (in & 0xff000000) >> 24;
        return bytes;
    }
    __global__ void shfl_integral_horizontal(const cv::cuda::PtrStep<uint4> img, cv::cuda::PtrStep<uint4> integral)
    {
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
        __shared__ int sums[128];

        const int id = threadIdx.x;
        const int lane_id = id % warpSize;
        const int warp_id = id / warpSize;

        const uint4 data = img(blockIdx.x, id);

        const uchar4 a = int_to_uchar4(data.x);
        const uchar4 b = int_to_uchar4(data.y);
        const uchar4 c = int_to_uchar4(data.z);
        const uchar4 d = int_to_uchar4(data.w);
        int result[16];

        result[0] = a.x;
        result[1] = result[0] + a.y;
        result[2] = result[1] + a.z;
        result[3] = result[2] + a.w;

        result[4] = result[3] + b.x;
        result[5] = result[4] + b.y;
        result[6] = result[5] + b.z;
        result[7] = result[6] + b.w;

        result[8] = result[7] + c.x;
        result[9] = result[8] + c.y;
        result[10] = result[9] + c.z;
        result[11] = result[10] + c.w;

        result[12] = result[11] + d.x;
        result[13] = result[12] + d.y;
        result[14] = result[13] + d.z;
        result[15] = result[14] + d.w;

        int sum = result[15];
        // The prefix sum over each thread's 16 values is now computed; the final
        // per-thread totals (result[15]) must be shared with the other threads of
        // the warp and added in. To do this, __shfl_up() is used and a shuffle
        // scan distributes the running sums to the correct threads.
        for (int i = 1; i < 32; i *= 2)
        {
            const int n = __shfl_up(sum, i, 32);

            if (lane_id >= i)
                sum += n;
        }

        // sum is the inclusive scan of the per-thread totals; subtracting this
        // thread's own total gives the offset contributed by the lower lanes
        for (int i = 0; i < 16; ++i)
            result[i] += sum - result[15];
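        // Worked example of the warp scan above (illustrative values only): if the
        // first four lanes start with sum = 3, 1, 4, 2, the i = 1 step yields
        // 3, 4, 5, 6 and the i = 2 step yields 3, 4, 8, 10, i.e. each lane ends up
        // with the inclusive prefix sum of the per-thread totals.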
        // Now the final sum for the warp must be shared between warps. This is done
        // by having one thread per warp store its warp's total to shared memory;
        // another warp then loads those values and computes a prefix sum over them,
        // again using __shfl_up, and the results are uniformly added back to the
        // warps. The last thread in each warp holds that warp's total, so it is the
        // one that writes it to shared memory.
        if (threadIdx.x % warpSize == warpSize - 1)
            sums[warp_id] = result[15];

        __syncthreads();

        if (warp_id == 0)
        {
            int warp_sum = sums[lane_id];

            for (int i = 1; i <= 32; i *= 2)
            {
                const int n = __shfl_up(warp_sum, i, 32);

                if (lane_id >= i)
                    warp_sum += n;
            }

            sums[lane_id] = warp_sum;
        }

        __syncthreads();

        int blockSum = 0;

        // fold in the scanned totals of the preceding warps
        if (warp_id > 0)
        {
            blockSum = sums[warp_id - 1];

            for (int i = 0; i < 16; ++i)
                result[i] += blockSum;
        }
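        // Illustrative values for the inter-warp step above: if a block of four
        // warps produced row-chunk totals 10, 20, 30 and 40, warp 0 scans them to
        // 10, 30, 60, 100 in sums[], and warps 1..3 then add 10, 30 and 60
        // (the totals of all preceding warps) to each of their 16 results.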
        // assemble result
        // Each thread now has 16 values to write, held as integers (to avoid
        // overflow). Instead of each thread writing consecutive uint4s, the
        // approach here uses the shuffle instruction to reformat the data inside
        // the registers so that each thread holds consecutive data to be written,
        // letting larger contiguous segments be assembled for each store.
        /*
            For example, data that needs to be written as

            GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3

            but is stored in registers (r0..r3), in four threads (0..3) as:

            threadId   0  1  2  3
              r0       x0 y0 z0 w0
              r1       x1 y1 z1 w1
              r2       x2 y2 z2 w2
              r3       x3 y3 z3 w3

            after applying __shfl_xor operations to move data between registers r1..r3:

            threadId  00 01 10 11
                      x0 y0 z0 w0
             xor(01)->y1 x1 w1 z1
             xor(10)->z2 w2 x2 y2
             xor(11)->w3 z3 y3 x3

            and now x0..x3 and z0..z3 can be written out in order by all threads.

            In the current code, each register above actually represents
            four integers to be written as uint4's to GMEM.
        */
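        // Note: __shfl_xor(v, m, 32) exchanges v with the lane whose index differs
        // in the bits of m, so m = 1 pairs lanes (0,1), (2,3), ...; m = 2 pairs
        // (0,2), (1,3), ...; and m = 3 pairs (0,3), (1,2), ... These are exactly
        // the permutations applied to r1..r3 in the table above.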
        result[4]  = __shfl_xor(result[4] , 1, 32);
        result[5]  = __shfl_xor(result[5] , 1, 32);
        result[6]  = __shfl_xor(result[6] , 1, 32);
        result[7]  = __shfl_xor(result[7] , 1, 32);

        result[8]  = __shfl_xor(result[8] , 2, 32);
        result[9]  = __shfl_xor(result[9] , 2, 32);
        result[10] = __shfl_xor(result[10], 2, 32);
        result[11] = __shfl_xor(result[11], 2, 32);

        result[12] = __shfl_xor(result[12], 3, 32);
        result[13] = __shfl_xor(result[13], 3, 32);
        result[14] = __shfl_xor(result[14], 3, 32);
        result[15] = __shfl_xor(result[15], 3, 32);
        uint4* integral_row = integral.ptr(blockIdx.x);
        uint4 output;

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
        // continuing from the example above, this use of __shfl_xor() places the
        // y0..y3 and w0..w3 data in order
        for (int i = 0; i < 16; ++i)
            result[i] = __shfl_xor(result[i], 1, 32);
        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
    #endif
    }
    // This kernel computes columnwise prefix sums. When its input is the row sums
    // produced above, it completes the integral image.
    // Each block computes a local set of sums. First, the data covered by the block
    // is loaded into shared memory; then, instead of summing in shared memory with
    // __syncthreads between stages, the data is reformatted so that the necessary
    // sums occur inside warps and the shuffle scan operation can be used.
    // The final set of sums from the block is then propagated, with the block
    // working "down" the image and adding the running sum to the local block sums.
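    // Layout note for the kernel below (a brief reading of the code, kept as a
    // comment): the block is 32x8 threads and sums[32][9] holds a 32-column by
    // 8-row tile of partial column sums; the ninth element per row appears to be
    // padding to avoid shared-memory bank conflicts. Re-indexing with
    // j = threadIdx.x % 8 and k = threadIdx.x / 8 + threadIdx.y * 4 places each
    // column's 8 partials in 8 consecutive lanes of a warp, and the lane_id >= i
    // guard keeps those 8-lane segments independent, so the __shfl_up scan needs
    // no __syncthreads() between its stages.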
    __global__ void shfl_integral_vertical(cv::cuda::PtrStepSz<unsigned int> integral)
    {
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
        __shared__ unsigned int sums[32][9];

        const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
        const int lane_id = tidx % 8;

        if (tidx >= integral.cols)
            return;

        sums[threadIdx.x][threadIdx.y] = 0;
        __syncthreads();

        unsigned int stepSum = 0;
        for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
        {
            unsigned int* p = integral.ptr(y) + tidx;

            unsigned int sum = *p;

            sums[threadIdx.x][threadIdx.y] = sum;
            __syncthreads();
            // Reformat in shared memory so that each column's partial sums land in
            // consecutive lanes of a warp; the shuffle scan can then reduce them,
            // and the results are read back out in the original layout.
            const int j = threadIdx.x % 8;
            const int k = threadIdx.x / 8 + threadIdx.y * 4;

            int partial_sum = sums[k][j];

            for (int i = 1; i <= 8; i *= 2)
            {
                int n = __shfl_up(partial_sum, i, 32);

                if (lane_id >= i)
                    partial_sum += n;
            }

            sums[k][j] = partial_sum;
            __syncthreads();

            if (threadIdx.y > 0)
                sum += sums[threadIdx.x][threadIdx.y - 1];

            sum += stepSum;
            stepSum += sums[threadIdx.x][blockDim.y - 1];

            __syncthreads();

            *p = sum;
        }
    #endif
    }
    void shfl_integral(const cv::cuda::PtrStepSzb& img, cv::cuda::PtrStepSz<unsigned int> integral, cudaStream_t stream)
    {
        {
            // each thread handles 16 values, use 1 block/row;
            // this is safe because the row step cannot be less than 512 bytes
            int block = integral.cols / 16;

            // launch 1 block / row
            const int grid = img.rows;

            cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );

            shfl_integral_horizontal<<<grid, block, 0, stream>>>((const cv::cuda::PtrStepSz<uint4>) img, (cv::cuda::PtrStepSz<uint4>) integral);
            cudaSafeCall( cudaGetLastError() );
        }

        {
            const dim3 block(32, 8);
            const dim3 grid(cv::cuda::device::divUp(integral.cols, block.x), 1);

            shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
            cudaSafeCall( cudaGetLastError() );
        }

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
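    // Hypothetical usage sketch (illustrative only, not part of the original
    // sources): both arguments are device mats of the same size whose width is a
    // multiple of 16, so every thread of the horizontal pass owns one uint4 of its row.
    //
    //     cv::cuda::GpuMat frame;                                // CV_8UC1 input
    //     cv::cuda::GpuMat sum(frame.size(), CV_32SC1);          // integral output
    //     cv::softcascade::cudev::shfl_integral(frame, sum, 0);  // stream 0: synchronous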
    __global__ void shfl_integral_vertical(cv::cuda::PtrStepSz<unsigned int> buffer, cv::cuda::PtrStepSz<unsigned int> integral)
    {
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
        __shared__ unsigned int sums[32][9];

        const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
        const int lane_id = tidx % 8;

        if (tidx >= integral.cols)
            return;

        sums[threadIdx.x][threadIdx.y] = 0;
        __syncthreads();

        unsigned int stepSum = 0;

        for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
        {
            unsigned int* p = buffer.ptr(y) + tidx;
            unsigned int* dst = integral.ptr(y + 1) + tidx + 1;

            unsigned int sum = *p;

            sums[threadIdx.x][threadIdx.y] = sum;
            __syncthreads();

            // Reformat in shared memory so that each column's partial sums land in
            // consecutive lanes of a warp; the shuffle scan can then reduce them,
            // and the results are read back out in the original layout.
            const int j = threadIdx.x % 8;
            const int k = threadIdx.x / 8 + threadIdx.y * 4;

            int partial_sum = sums[k][j];

            for (int i = 1; i <= 8; i *= 2)
            {
                int n = __shfl_up(partial_sum, i, 32);

                if (lane_id >= i)
                    partial_sum += n;
            }

            sums[k][j] = partial_sum;
            __syncthreads();

            if (threadIdx.y > 0)
                sum += sums[threadIdx.x][threadIdx.y - 1];

            sum += stepSum;
            stepSum += sums[threadIdx.x][blockDim.y - 1];

            __syncthreads();

            (*dst) = sum;
        }
    #endif
    }
    // used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
    void shfl_integral_gpu_buffered(cv::cuda::PtrStepSzb img, cv::cuda::PtrStepSz<uint4> buffer, cv::cuda::PtrStepSz<unsigned int> integral,
                                    int blockStep, cudaStream_t stream)
    {
        {
            const int block = blockStep;
            const int grid = img.rows;

            cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );

            shfl_integral_horizontal<<<grid, block, 0, stream>>>((cv::cuda::PtrStepSz<uint4>) img, buffer);
            cudaSafeCall( cudaGetLastError() );
        }

        {
            const dim3 block(32, 8);
            const dim3 grid(cv::cuda::device::divUp(integral.cols, block.x), 1);

            shfl_integral_vertical<<<grid, block, 0, stream>>>((cv::cuda::PtrStepSz<unsigned int>)buffer, integral);
            cudaSafeCall( cudaGetLastError() );
        }
    }
    #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))

    // fixed-point coefficients for the BGR -> gray conversion (OpenCV's standard values)
    enum
    {
        yuv_shift = 14,
        R2Y       = 4899,
        G2Y       = 9617,
        B2Y       = 1868
    };

    template <int bidx> static __device__ __forceinline__ unsigned char RGB2GrayConvert(unsigned char b, unsigned char g, unsigned char r)
    {
        // uint b = 0xffu & (src >> (bidx * 8));
        // uint g = 0xffu & (src >> 8);
        // uint r = 0xffu & (src >> ((bidx ^ 2) * 8));
        return CV_DESCALE((unsigned int)(b * B2Y + g * G2Y + r * R2Y), yuv_shift);
    }
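    // Worked example of the fixed-point conversion above, assuming the usual OpenCV
    // coefficients restored in the enum (B2Y = 1868, G2Y = 9617, R2Y = 4899,
    // yuv_shift = 14): the weights encode 0.114*B + 0.587*G + 0.299*R in Q14, and
    // CV_DESCALE adds 2^13 before the shift so the result is rounded rather than
    // truncated. For b = g = r = 255 the weighted sum is 255 * 16384 and
    // CV_DESCALE(255 * 16384, 14) == 255, as expected for a white pixel.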
    __global__ void device_transform(const cv::cuda::PtrStepSz<uchar3> bgr, cv::cuda::PtrStepSzb gray)
    {
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        const int x = blockIdx.x * blockDim.x + threadIdx.x;

        const uchar3 colored = (uchar3)(bgr.ptr(y))[x];

        gray.ptr(y)[x] = RGB2GrayConvert<0>(colored.x, colored.y, colored.z);
    }
    void transform(const cv::cuda::PtrStepSz<uchar3>& bgr, cv::cuda::PtrStepSzb gray)
    {
        const dim3 block(32, 8);
        const dim3 grid(cv::cuda::device::divUp(bgr.cols, block.x), cv::cuda::device::divUp(bgr.rows, block.y));

        device_transform<<<grid, block>>>(bgr, gray);
        cudaSafeCall(cudaDeviceSynchronize());
    }