/////////////////////////////////////////////////////////////////////////
+namespace hist
+{
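+    // Adds one pixel value to the shared-memory histogram if it lies inside
+    // [lowerLevel, upperLevel]; each bin covers binSize consecutive levels.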
+    __device__ __forceinline__ void histEvenInc(int* shist, uint data, int binSize, int lowerLevel, int upperLevel)
+    {
+        if (data >= lowerLevel && data <= upperLevel)
+        {
+            const uint ind = (data - lowerLevel) / binSize;
+            Emulation::smem::atomicAdd(shist + ind, 1);
+        }
+    }
+
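+    // Each block handles blockDim.y consecutive image rows (one per threadIdx.y);
+    // the blockDim.x threads of a row scan its pixels cooperatively, accumulate
+    // into a shared-memory histogram, and merge it into the global one at the end.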
+    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
+                               int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
+    {
+        extern __shared__ int shist[];
+
+        const int y = blockIdx.x * blockDim.y + threadIdx.y;
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
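+        // Zero the shared histogram: threads with tid < binCount clear one bin each.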
+        if (tid < binCount)
+            shist[tid] = 0;
+
+        __syncthreads();
+
+        if (y < rows)
+        {
+            const uchar* rowPtr = src + y * step;
+            const uint* rowPtr4 = (const uint*) rowPtr;
+
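+            // Main loop: read the row four pixels at a time as a 32-bit word
+            // and histogram each byte separately.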
+            const int cols_4 = cols / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
+            {
+                const uint data = rowPtr4[x];
+
+                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
+            }
+
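+            // Thread 0 of the row handles the cols % 4 trailing pixels.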
+            if (cols % 4 != 0 && threadIdx.x == 0)
+            {
+                for (int x = cols_4 * 4; x < cols; ++x)
+                {
+                    const uchar data = rowPtr[x];
+                    histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
+                }
+            }
+        }
+
+        __syncthreads();
+
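+        // Flush the block's shared histogram to global memory, skipping empty
+        // bins to avoid unnecessary global atomics.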
+        if (tid < binCount)
+        {
+            const int histVal = shist[tid];
+
+            if (histVal > 0)
+                ::atomicAdd(hist + tid, histVal);
+        }
+    }
+
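+    // Host wrapper: one 32x8 block covers 8 image rows; the dynamic shared
+    // memory holds one int counter per bin.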
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(src.rows, block.y));
+
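+        // Number of levels per bin: the [lowerLevel, upperLevel] range is split
+        // into binCount even bins, rounding the bin width up.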
+        const int binSize = divUp(upperLevel - lowerLevel, binCount);
+
+        const size_t smem_size = binCount * sizeof(int);
+
+        histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+/////////////////////////////////////////////////////////////////////////
+
namespace hist
{
    __constant__ int c_lut[256];

    histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
}
+namespace hist
+{
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream);
+}
+
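+// Adapter with the hist_t signature used by cv::gpu::histEven below: allocates
+// the output histogram, clears it asynchronously, and calls the CUDA wrapper
+// (the buffer argument is unused for the 8U path).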
+namespace
+{
+    void histEven8u(const GpuMat& src, GpuMat& hist, GpuMat&, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
+    {
+        hist.create(1, histSize, CV_32S);
+        cudaSafeCall( cudaMemsetAsync(hist.data, 0, histSize * sizeof(int), stream) );
+        hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, stream);
+    }
+}
+
void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSize, int lowerLevel, int upperLevel, Stream& stream)
{
    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 );
    typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, GpuMat& buf, int levels, int lowerLevel, int upperLevel, cudaStream_t stream);
    static const hist_t hist_callers[] =
    {
-        NppHistogramEvenC1<CV_8U , nppiHistogramEven_8u_C1R , nppiHistogramEvenGetBufferSize_8u_C1R >::hist,
+        histEven8u,
        0,
        NppHistogramEvenC1<CV_16U, nppiHistogramEven_16u_C1R, nppiHistogramEvenGetBufferSize_16u_C1R>::hist,
        NppHistogramEvenC1<CV_16S, nppiHistogramEven_16s_C1R, nppiHistogramEvenGetBufferSize_16s_C1R>::hist