--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
+//\r
+// By downloading, copying, installing or using the software you agree to this license.\r
+// If you do not agree to this license, do not download, install,\r
+// copy or use the software.\r
+//\r
+//\r
+// License Agreement\r
+// For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.\r
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.\r
+// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+// * Redistribution's of source code must retain the above copyright notice,\r
+// this list of conditions and the following disclaimer.\r
+//\r
+// * Redistribution's in binary form must reproduce the above copyright notice,\r
+// this list of conditions and the following disclaimer in the documentation\r
+// and/or other materials provided with the distribution.\r
+//\r
+// * The name of the copyright holders may not be used to endorse or promote products\r
+// derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or bpied warranties, including, but not limited to, the bpied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+#include "internal_shared.hpp"\r
+#include "opencv2/gpu/device/saturate_cast.hpp"\r
+\r
+using namespace cv::gpu;\r
+\r
+using namespace cv::gpu::device;\r
+\r
+#define UINT_BITS 32U\r
+\r
+#define LOG2_WARP_SIZE 5U\r
+#define WARP_SIZE (1U << LOG2_WARP_SIZE)\r
+\r
+//Warps == subhistograms per threadblock\r
+#define WARP_COUNT 6\r
+\r
+//Threadblock size\r
+#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * WARP_SIZE)\r
+#define HISTOGRAM256_BIN_COUNT 256\r
+\r
+//Shared memory per threadblock\r
+#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)\r
+\r
+#define PARTIAL_HISTOGRAM256_COUNT 240\r
+\r
+#define MERGE_THREADBLOCK_SIZE 256\r
+\r
+#define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)\r
+\r
+namespace cv { namespace gpu { namespace histograms\r
+{\r
+ #if (!USE_SMEM_ATOMICS)\r
+\r
+ #define TAG_MASK ( (1U << (UINT_BITS - LOG2_WARP_SIZE)) - 1U )\r
+\r
+ __forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)\r
+ {\r
+ uint count;\r
+ do\r
+ {\r
+ count = s_WarpHist[data] & TAG_MASK;\r
+ count = threadTag | (count + 1);\r
+ s_WarpHist[data] = count;\r
+ } while (s_WarpHist[data] != count);\r
+ }\r
+\r
+ #else\r
+\r
+ #define TAG_MASK 0xFFFFFFFFU\r
+\r
+ __forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)\r
+ {\r
+ atomicAdd(s_WarpHist + data, 1);\r
+ }\r
+\r
+ #endif\r
+\r
+ __forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)\r
+ {\r
+ uint x = pos_x << 2;\r
+\r
+ if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);\r
+ if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);\r
+ if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);\r
+ if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);\r
+ }\r
+\r
+ __global__ void histogram256(PtrStep_<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)\r
+ {\r
+ //Per-warp subhistogram storage\r
+ __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];\r
+ uint* s_WarpHist= s_Hist + (threadIdx.x >> LOG2_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;\r
+\r
+ //Clear shared memory storage for current threadblock before processing\r
+ #pragma unroll\r
+ for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)\r
+ s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;\r
+\r
+ //Cycle through the entire data set, update subhistograms for each warp\r
+ const uint tag = threadIdx.x << (UINT_BITS - LOG2_WARP_SIZE);\r
+\r
+ __syncthreads();\r
+ const uint colsui = d_Data.step / sizeof(uint);\r
+ for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)\r
+ {\r
+ uint pos_y = pos / colsui;\r
+ uint pos_x = pos % colsui;\r
+ uint data = d_Data.ptr(pos_y)[pos_x];\r
+ addWord(s_WarpHist, data, tag, pos_x, cols);\r
+ }\r
+\r
+ //Merge per-warp histograms into per-block and write to global memory\r
+ __syncthreads();\r
+ for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)\r
+ {\r
+ uint sum = 0;\r
+\r
+ for (uint i = 0; i < WARP_COUNT; i++)\r
+ sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;\r
+\r
+ d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;\r
+ }\r
+ }\r
+\r
+ ////////////////////////////////////////////////////////////////////////////////\r
+ // Merge histogram256() output\r
+ // Run one threadblock per bin; each threadblock adds up the same bin counter\r
+ // from every partial histogram. Reads are uncoalesced, but mergeHistogram256\r
+ // takes only a fraction of total processing time\r
+ ////////////////////////////////////////////////////////////////////////////////\r
+\r
+ __global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)\r
+ {\r
+ uint sum = 0;\r
+\r
+ #pragma unroll\r
+ for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)\r
+ sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];\r
+\r
+ __shared__ uint data[MERGE_THREADBLOCK_SIZE];\r
+ data[threadIdx.x] = sum;\r
+\r
+ for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)\r
+ {\r
+ __syncthreads();\r
+ if(threadIdx.x < stride)\r
+ data[threadIdx.x] += data[threadIdx.x + stride];\r
+ }\r
+\r
+ if(threadIdx.x == 0)\r
+ d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);\r
+ }\r
+\r
+ void histogram256_gpu(DevMem2D src, int* hist, uint* buf, cudaStream_t stream)\r
+ {\r
+ histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(\r
+ DevMem2D_<uint>(src),\r
+ buf, \r
+ src.rows * src.step / sizeof(uint),\r
+ src.cols);\r
+\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);\r
+\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+}}}\r
void cv::gpu::histEven(const GpuMat&, GpuMat*, int*, int*, int*, Stream&) { throw_nogpu(); }\r
void cv::gpu::histRange(const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }\r
void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*, Stream&) { throw_nogpu(); }\r
+void cv::gpu::calcHist(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }\r
+void cv::gpu::calcHist(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }\r
void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); }\r
void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); }\r
void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool) { throw_nogpu(); }\r
hist_callers[src.depth()](src, hist, levels, StreamAccessor::getStream(stream));\r
}\r
\r
+namespace cv { namespace gpu { namespace histograms\r
+{\r
+ void histogram256_gpu(DevMem2D src, int* hist, unsigned int* buf, cudaStream_t stream);\r
+\r
+ const int PARTIAL_HISTOGRAM256_COUNT = 240;\r
+ const int HISTOGRAM256_BIN_COUNT = 256;\r
+}}}\r
+\r
+void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)\r
+{\r
+ GpuMat buf;\r
+ calcHist(src, hist, buf, stream);\r
+}\r
+\r
+void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream)\r
+{\r
+ using namespace cv::gpu::histograms;\r
+\r
+ CV_Assert(src.type() == CV_8UC1);\r
+\r
+ hist.create(1, 256, CV_32SC1);\r
+\r
+ ensureSizeIsEnough(1, PARTIAL_HISTOGRAM256_COUNT * HISTOGRAM256_BIN_COUNT, CV_32SC1, buf);\r
+\r
+ histogram256_gpu(src, hist.ptr<int>(), buf.ptr<unsigned int>(), StreamAccessor::getStream(stream));\r
+}\r
+\r
////////////////////////////////////////////////////////////////////////\r
// cornerHarris & minEgenVal\r
\r
///////////////////////////////////////////////////////////////////////////////////////////////////////\r
// histograms\r
\r
-struct Histograms : testing::TestWithParam<cv::gpu::DeviceInfo>\r
+struct HistEven : testing::TestWithParam<cv::gpu::DeviceInfo>\r
{\r
static cv::Mat hsv;\r
\r
}\r
};\r
\r
-cv::Mat Histograms::hsv;\r
+cv::Mat HistEven::hsv;\r
\r
-TEST_P(Histograms, Accuracy)\r
+TEST_P(HistEven, Accuracy)\r
{\r
ASSERT_TRUE(!hsv.empty());\r
\r
EXPECT_MAT_NEAR(hist_gold, hist, 0.0);\r
}\r
\r
-INSTANTIATE_TEST_CASE_P(ImgProc, Histograms, testing::ValuesIn(devices()));\r
+INSTANTIATE_TEST_CASE_P(ImgProc, HistEven, testing::ValuesIn(devices()));\r
+\r
+struct CalcHist : testing::TestWithParam<cv::gpu::DeviceInfo>\r
+{\r
+ cv::gpu::DeviceInfo devInfo;\r
+\r
+ cv::Size size;\r
+ cv::Mat src;\r
+ cv::Mat hist_gold;\r
+ \r
+ virtual void SetUp()\r
+ {\r
+ devInfo = GetParam();\r
+\r
+ cv::gpu::setDevice(devInfo.deviceID());\r
+\r
+ cv::RNG& rng = cvtest::TS::ptr()->get_rng();\r
+\r
+ size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200));\r
+ \r
+ src = cvtest::randomMat(rng, size, CV_8UC1, 0, 255, false);\r
+\r
+ hist_gold.create(1, 256, CV_32SC1);\r
+ hist_gold.setTo(cv::Scalar::all(0));\r
+\r
+ int* hist = hist_gold.ptr<int>();\r
+ for (int y = 0; y < src.rows; ++y)\r
+ {\r
+ const uchar* src_row = src.ptr(y);\r
+\r
+ for (int x = 0; x < src.cols; ++x)\r
+ ++hist[src_row[x]];\r
+ }\r
+ }\r
+};\r
+\r
+TEST_P(CalcHist, Accuracy)\r
+{\r
+ PRINT_PARAM(devInfo);\r
+ PRINT_PARAM(size);\r
+\r
+ cv::Mat hist;\r
+ \r
+ ASSERT_NO_THROW(\r
+ cv::gpu::GpuMat gpuHist;\r
+\r
+ cv::gpu::calcHist(cv::gpu::GpuMat(src), gpuHist);\r
+\r
+ gpuHist.download(hist);\r
+ );\r
+\r
+ EXPECT_MAT_NEAR(hist_gold, hist, 0.0);\r
+}\r
+\r
+INSTANTIATE_TEST_CASE_P(ImgProc, CalcHist, testing::ValuesIn(devices()));\r
\r
///////////////////////////////////////////////////////////////////////////////////////////////////////\r
// cornerHarris\r