/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/core/cuda_types.hpp"
#include "opencv2/core/cuda/common.hpp"

namespace cv { namespace softcascade { namespace cudev
{
    typedef unsigned int uint;
    typedef unsigned short ushort;
    // Utility function to extract unsigned chars from an unsigned integer
    __device__ uchar4 int_to_uchar4(unsigned int in)
    {
        uchar4 bytes;
        bytes.x = (in & 0x000000ff) >> 0;
        bytes.y = (in & 0x0000ff00) >> 8;
        bytes.z = (in & 0x00ff0000) >> 16;
        bytes.w = (in & 0xff000000) >> 24;
        return bytes;
    }
    __global__ void shfl_integral_horizontal(const cv::cuda::PtrStep<uint4> img, cv::cuda::PtrStep<uint4> integral)
    {
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
        __shared__ int sums[128];

        const int id = threadIdx.x;
        const int lane_id = id % warpSize;
        const int warp_id = id / warpSize;

        const uint4 data = img(blockIdx.x, id);

        const uchar4 a = int_to_uchar4(data.x);
        const uchar4 b = int_to_uchar4(data.y);
        const uchar4 c = int_to_uchar4(data.z);
        const uchar4 d = int_to_uchar4(data.w);
        int result[16];

        result[0] = a.x;
        result[1] = result[0] + a.y;
        result[2] = result[1] + a.z;
        result[3] = result[2] + a.w;

        result[4] = result[3] + b.x;
        result[5] = result[4] + b.y;
        result[6] = result[5] + b.z;
        result[7] = result[6] + b.w;

        result[8] = result[7] + c.x;
        result[9] = result[8] + c.y;
        result[10] = result[9] + c.z;
        result[11] = result[10] + c.w;

        result[12] = result[11] + d.x;
        result[13] = result[12] + d.y;
        result[14] = result[13] + d.z;
        result[15] = result[14] + d.w;

        int sum = result[15];
        // The prefix sum over each thread's 16 values is now computed; the final
        // per-thread totals (result[15]) must be shared with the other threads of
        // the warp and added in. To do this, __shfl_up() is used and a shuffle
        // scan distributes the running sums to the correct threads.
        for (int i = 1; i < 32; i *= 2)
        {
            const int n = __shfl_up(sum, i, 32);

            if (lane_id >= i)
                sum += n;
        }

        // sum is the inclusive scan of the per-thread totals; subtracting this
        // thread's own total gives the offset contributed by the lower lanes
        for (int i = 0; i < 16; ++i)
            result[i] += sum - result[15];
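        // Worked example of the warp scan above (illustrative values only): if the
        // first four lanes start with sum = 3, 1, 4, 2, the i = 1 step yields
        // 3, 4, 5, 6 and the i = 2 step yields 3, 4, 8, 10, i.e. each lane ends up
        // with the inclusive prefix sum of the per-thread totals.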
        // Now the final sum for the warp must be shared between warps. This is done
        // by having one thread per warp store its warp's total to shared memory;
        // another warp then loads those values and computes a prefix sum over them,
        // again using __shfl_up, and the results are uniformly added back to the
        // warps. The last thread in each warp holds that warp's total, so it is the
        // one that writes it to shared memory.
        if (threadIdx.x % warpSize == warpSize - 1)
            sums[warp_id] = result[15];

        __syncthreads();

        if (warp_id == 0)
        {
            int warp_sum = sums[lane_id];

            for (int i = 1; i <= 32; i *= 2)
            {
                const int n = __shfl_up(warp_sum, i, 32);

                if (lane_id >= i)
                    warp_sum += n;
            }

            sums[lane_id] = warp_sum;
        }

        __syncthreads();

        int blockSum = 0;

        // fold in the scanned totals of the preceding warps
        if (warp_id > 0)
        {
            blockSum = sums[warp_id - 1];

            for (int i = 0; i < 16; ++i)
                result[i] += blockSum;
        }
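        // Illustrative values for the inter-warp step above: if a block of four
        // warps produced row-chunk totals 10, 20, 30 and 40, warp 0 scans them to
        // 10, 30, 60, 100 in sums[], and warps 1..3 then add 10, 30 and 60
        // (the totals of all preceding warps) to each of their 16 results.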
        // assemble result
        // Each thread now has 16 values to write, held as integers (to avoid
        // overflow). Instead of each thread writing consecutive uint4s, the
        // approach here uses the shuffle instruction to reformat the data inside
        // the registers so that each thread holds consecutive data to be written,
        // letting larger contiguous segments be assembled for each store.
        /*
            For example, data that needs to be written as

            GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3

            but is stored in registers (r0..r3), in four threads (0..3) as:

            threadId   0  1  2  3
              r0       x0 y0 z0 w0
              r1       x1 y1 z1 w1
              r2       x2 y2 z2 w2
              r3       x3 y3 z3 w3

            after applying __shfl_xor operations to move data between registers r1..r3:

            threadId  00 01 10 11
                      x0 y0 z0 w0
             xor(01)->y1 x1 w1 z1
             xor(10)->z2 w2 x2 y2
             xor(11)->w3 z3 y3 x3

            and now x0..x3 and z0..z3 can be written out in order by all threads.

            In the current code, each register above actually represents
            four integers to be written as uint4's to GMEM.
        */
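        // Note: __shfl_xor(v, m, 32) exchanges v with the lane whose index differs
        // in the bits of m, so m = 1 pairs lanes (0,1), (2,3), ...; m = 2 pairs
        // (0,2), (1,3), ...; and m = 3 pairs (0,3), (1,2), ... These are exactly
        // the permutations applied to r1..r3 in the table above.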
        result[4]  = __shfl_xor(result[4] , 1, 32);
        result[5]  = __shfl_xor(result[5] , 1, 32);
        result[6]  = __shfl_xor(result[6] , 1, 32);
        result[7]  = __shfl_xor(result[7] , 1, 32);

        result[8]  = __shfl_xor(result[8] , 2, 32);
        result[9]  = __shfl_xor(result[9] , 2, 32);
        result[10] = __shfl_xor(result[10], 2, 32);
        result[11] = __shfl_xor(result[11], 2, 32);

        result[12] = __shfl_xor(result[12], 3, 32);
        result[13] = __shfl_xor(result[13], 3, 32);
        result[14] = __shfl_xor(result[14], 3, 32);
        result[15] = __shfl_xor(result[15], 3, 32);
        uint4* integral_row = integral.ptr(blockIdx.x);
        uint4 output;

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
        // continuing from the example above, this use of __shfl_xor() places the
        // y0..y3 and w0..w3 data in order
        for (int i = 0; i < 16; ++i)
            result[i] = __shfl_xor(result[i], 1, 32);
        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
    #endif
    }
    // This kernel computes columnwise prefix sums. When its input is the row sums
    // produced above, it completes the integral image.
    // Each block computes a local set of sums. First, the data covered by the block
    // is loaded into shared memory; then, instead of summing in shared memory with
    // __syncthreads between stages, the data is reformatted so that the necessary
    // sums occur inside warps and the shuffle scan operation can be used.
    // The final set of sums from the block is then propagated, with the block
    // working "down" the image and adding the running sum to the local block sums.
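    // Layout note for the kernel below (a brief reading of the code, kept as a
    // comment): the block is 32x8 threads and sums[32][9] holds a 32-column by
    // 8-row tile of partial column sums; the ninth element per row appears to be
    // padding to avoid shared-memory bank conflicts. Re-indexing with
    // j = threadIdx.x % 8 and k = threadIdx.x / 8 + threadIdx.y * 4 places each
    // column's 8 partials in 8 consecutive lanes of a warp, and the lane_id >= i
    // guard keeps those 8-lane segments independent, so the __shfl_up scan needs
    // no __syncthreads() between its stages.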
    __global__ void shfl_integral_vertical(cv::cuda::PtrStepSz<unsigned int> integral)
    {
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
        __shared__ unsigned int sums[32][9];

        const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
        const int lane_id = tidx % 8;

        if (tidx >= integral.cols)
            return;

        sums[threadIdx.x][threadIdx.y] = 0;
        __syncthreads();

        unsigned int stepSum = 0;
        for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
        {
            unsigned int* p = integral.ptr(y) + tidx;

            unsigned int sum = *p;

            sums[threadIdx.x][threadIdx.y] = sum;
            __syncthreads();
            // Reformat in shared memory so that each column's partial sums land in
            // consecutive lanes of a warp; the shuffle scan can then reduce them,
            // and the results are read back out in the original layout.
            const int j = threadIdx.x % 8;
            const int k = threadIdx.x / 8 + threadIdx.y * 4;

            int partial_sum = sums[k][j];

            for (int i = 1; i <= 8; i *= 2)
            {
                int n = __shfl_up(partial_sum, i, 32);

                if (lane_id >= i)
                    partial_sum += n;
            }

            sums[k][j] = partial_sum;
            __syncthreads();

            if (threadIdx.y > 0)
                sum += sums[threadIdx.x][threadIdx.y - 1];

            sum += stepSum;
            stepSum += sums[threadIdx.x][blockDim.y - 1];

            __syncthreads();

            *p = sum;
        }
    #endif
    }
    void shfl_integral(const cv::cuda::PtrStepSzb& img, cv::cuda::PtrStepSz<unsigned int> integral, cudaStream_t stream)
    {
        {
            // each thread handles 16 values, use 1 block/row;
            // this is safe because the row step cannot be less than 512 bytes
            int block = integral.cols / 16;

            // launch 1 block / row
            const int grid = img.rows;

            cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );

            shfl_integral_horizontal<<<grid, block, 0, stream>>>((const cv::cuda::PtrStepSz<uint4>) img, (cv::cuda::PtrStepSz<uint4>) integral);
            cudaSafeCall( cudaGetLastError() );
        }

        {
            const dim3 block(32, 8);
            const dim3 grid(cv::cuda::device::divUp(integral.cols, block.x), 1);

            shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
            cudaSafeCall( cudaGetLastError() );
        }

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
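    // Hypothetical usage sketch (illustrative only, not part of the original
    // sources): both arguments are device mats of the same size whose width is a
    // multiple of 16, so every thread of the horizontal pass owns one uint4 of its row.
    //
    //     cv::cuda::GpuMat frame;                                // CV_8UC1 input
    //     cv::cuda::GpuMat sum(frame.size(), CV_32SC1);          // integral output
    //     cv::softcascade::cudev::shfl_integral(frame, sum, 0);  // stream 0: synchronous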
    __global__ void shfl_integral_vertical(cv::cuda::PtrStepSz<unsigned int> buffer, cv::cuda::PtrStepSz<unsigned int> integral)
    {
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
        __shared__ unsigned int sums[32][9];

        const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
        const int lane_id = tidx % 8;

        if (tidx >= integral.cols)
            return;

        sums[threadIdx.x][threadIdx.y] = 0;
        __syncthreads();

        unsigned int stepSum = 0;

        for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
        {
            unsigned int* p = buffer.ptr(y) + tidx;
            unsigned int* dst = integral.ptr(y + 1) + tidx + 1;

            unsigned int sum = *p;

            sums[threadIdx.x][threadIdx.y] = sum;
            __syncthreads();

            // Reformat in shared memory so that each column's partial sums land in
            // consecutive lanes of a warp; the shuffle scan can then reduce them,
            // and the results are read back out in the original layout.
            const int j = threadIdx.x % 8;
            const int k = threadIdx.x / 8 + threadIdx.y * 4;

            int partial_sum = sums[k][j];

            for (int i = 1; i <= 8; i *= 2)
            {
                int n = __shfl_up(partial_sum, i, 32);

                if (lane_id >= i)
                    partial_sum += n;
            }

            sums[k][j] = partial_sum;
            __syncthreads();

            if (threadIdx.y > 0)
                sum += sums[threadIdx.x][threadIdx.y - 1];

            sum += stepSum;
            stepSum += sums[threadIdx.x][blockDim.y - 1];

            __syncthreads();

            (*dst) = sum;
        }
    #endif
    }
    // used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
    void shfl_integral_gpu_buffered(cv::cuda::PtrStepSzb img, cv::cuda::PtrStepSz<uint4> buffer, cv::cuda::PtrStepSz<unsigned int> integral,
                                    int blockStep, cudaStream_t stream)
    {
        {
            const int block = blockStep;
            const int grid = img.rows;

            cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );

            shfl_integral_horizontal<<<grid, block, 0, stream>>>((cv::cuda::PtrStepSz<uint4>) img, buffer);
            cudaSafeCall( cudaGetLastError() );
        }

        {
            const dim3 block(32, 8);
            const dim3 grid(cv::cuda::device::divUp(integral.cols, block.x), 1);

            shfl_integral_vertical<<<grid, block, 0, stream>>>((cv::cuda::PtrStepSz<unsigned int>)buffer, integral);
            cudaSafeCall( cudaGetLastError() );
        }
    }
    #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))

    // fixed-point coefficients for the BGR -> gray conversion (OpenCV's standard values)
    enum
    {
        yuv_shift = 14,
        R2Y       = 4899,
        G2Y       = 9617,
        B2Y       = 1868
    };

    template <int bidx> static __device__ __forceinline__ unsigned char RGB2GrayConvert(unsigned char b, unsigned char g, unsigned char r)
    {
        // uint b = 0xffu & (src >> (bidx * 8));
        // uint g = 0xffu & (src >> 8);
        // uint r = 0xffu & (src >> ((bidx ^ 2) * 8));
        return CV_DESCALE((unsigned int)(b * B2Y + g * G2Y + r * R2Y), yuv_shift);
    }
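    // Worked example of the fixed-point conversion above, assuming the usual OpenCV
    // coefficients restored in the enum (B2Y = 1868, G2Y = 9617, R2Y = 4899,
    // yuv_shift = 14): the weights encode 0.114*B + 0.587*G + 0.299*R in Q14, and
    // CV_DESCALE adds 2^13 before the shift so the result is rounded rather than
    // truncated. For b = g = r = 255 the weighted sum is 255 * 16384 and
    // CV_DESCALE(255 * 16384, 14) == 255, as expected for a white pixel.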
    __global__ void device_transform(const cv::cuda::PtrStepSz<uchar3> bgr, cv::cuda::PtrStepSzb gray)
    {
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        const int x = blockIdx.x * blockDim.x + threadIdx.x;

        const uchar3 colored = (uchar3)(bgr.ptr(y))[x];

        gray.ptr(y)[x] = RGB2GrayConvert<0>(colored.x, colored.y, colored.z);
    }
    void transform(const cv::cuda::PtrStepSz<uchar3>& bgr, cv::cuda::PtrStepSzb gray)
    {
        const dim3 block(32, 8);
        const dim3 grid(cv::cuda::device::divUp(bgr.cols, block.x), cv::cuda::device::divUp(bgr.rows, block.y));

        device_transform<<<grid, block>>>(bgr, gray);
        cudaSafeCall(cudaDeviceSynchronize());
    }