From f617f18e46fa556daea060d3c69307567bbc65f7 Mon Sep 17 00:00:00 2001
From: Namgoo Lee <namgoo.lee@cognex.com>
Date: Wed, 19 Aug 2020 17:39:24 +0900
Subject: [PATCH] bit-exact cuda::equalizeHist

---
 modules/cudaimgproc/src/cuda/hist.cu        | 144 +++++++++++++++++++++++++---
 modules/cudaimgproc/src/histogram.cpp       |  16 ++--
 modules/cudaimgproc/test/test_histogram.cpp |  82 +++++++++++++++-
 3 files changed, 217 insertions(+), 25 deletions(-)

diff --git a/modules/cudaimgproc/src/cuda/hist.cu b/modules/cudaimgproc/src/cuda/hist.cu
index be13091..6bc5f15 100644
--- a/modules/cudaimgproc/src/cuda/hist.cu
+++ b/modules/cudaimgproc/src/cuda/hist.cu
@@ -257,18 +257,15 @@ namespace hist
 
 namespace hist
 {
-    __constant__ int c_lut[256];
-
     struct EqualizeHist : unary_function<uchar, uchar>
     {
-        float scale;
+        const uchar* lut;
 
-        __host__ EqualizeHist(float _scale) : scale(_scale) {}
+        __host__ EqualizeHist(const uchar* _lut) : lut(_lut) {}
 
         __device__ __forceinline__ uchar operator ()(uchar val) const
         {
-            const int lut = c_lut[val];
-            return __float2int_rn(scale * lut);
+            return lut[val];
         }
     };
 }
@@ -283,16 +280,137 @@ namespace cv { namespace cuda { namespace device
 
 namespace hist
 {
-    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
+    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const uchar* lut, cudaStream_t stream)
     {
-        if (stream == 0)
-            cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
-        else
-            cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );
+        device::transform(src, dst, EqualizeHist(lut), WithOutMask(), stream);
+    }
+
+    __global__ void buildLutKernel(int* hist, unsigned char* lut, int size)
+    {
+        __shared__ int warp_smem[8];
+        __shared__ int hist_smem[8][33];
+
+#define HIST_SMEM_NO_BANK_CONFLICT(idx) hist_smem[(idx) >> 5][(idx) & 31]
+
+        const int tId = threadIdx.x;
+        const int warpId = threadIdx.x / 32;
+        const int laneId = threadIdx.x % 32;
+
+        // Step1 - Find minimum non-zero value in hist and make it zero
+        HIST_SMEM_NO_BANK_CONFLICT(tId) = hist[tId];
+        int nonZeroIdx = HIST_SMEM_NO_BANK_CONFLICT(tId) > 0 ? tId : 256;
+
+        __syncthreads();
+
+        for (int delta = 16; delta > 0; delta /= 2)
+        {
+#if __CUDACC_VER_MAJOR__ >= 9
+            int shflVal = __shfl_down_sync(0xFFFFFFFF, nonZeroIdx, delta);
+#else
+            int shflVal = __shfl_down(nonZeroIdx, delta);
+#endif
+            if (laneId < delta)
+                nonZeroIdx = min(nonZeroIdx, shflVal);
+        }
+
+        if (laneId == 0)
+            warp_smem[warpId] = nonZeroIdx;
 
-        const float scale = 255.0f / (src.cols * src.rows);
+        __syncthreads();
+
+        if (tId < 8)
+        {
+            int warpVal = warp_smem[tId];
+            for (int delta = 4; delta > 0; delta /= 2)
+            {
+#if __CUDACC_VER_MAJOR__ >= 9
+                int shflVal = __shfl_down_sync(0x000000FF, warpVal, delta);
+#else
+                int shflVal = __shfl_down(warpVal, delta);
+#endif
+                if (tId < delta)
+                    warpVal = min(warpVal, shflVal);
+            }
+            if (tId == 0)
+            {
+                warp_smem[0] = warpVal; // warpVal - minimum index
+            }
+        }
+
+        __syncthreads();
+
+        const int minNonZeroIdx = warp_smem[0];
+        const int minNonZeroVal = HIST_SMEM_NO_BANK_CONFLICT(minNonZeroIdx);
+        if (minNonZeroVal == size)
+        {
+            // This is a special case: the whole image has the same color
+
+            lut[tId] = 0;
+            if (tId == minNonZeroIdx)
+                lut[tId] = minNonZeroIdx;
+            return;
+        }
 
-        device::transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
+        if (tId == 0)
+            HIST_SMEM_NO_BANK_CONFLICT(minNonZeroIdx) = 0;
+
+        __syncthreads();
+
+        // Step2 - Inclusive sum
+        // Algorithm from GPU Gems 3 (A Work-Efficient Parallel Scan)
+        // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
+
+        // Step2 Phase1 - The Up-Sweep Phase
+        for (int delta = 1; delta < 256; delta *= 2)
+        {
+            if (tId < 128 / delta)
+            {
+                int idx = 255 - 2 * tId * delta;
+                HIST_SMEM_NO_BANK_CONFLICT(idx) += HIST_SMEM_NO_BANK_CONFLICT(idx - delta);
+            }
+            __syncthreads();
+        }
+
+        // Step2 Phase2 - The Down-Sweep Phase
+        if (tId == 0)
+            HIST_SMEM_NO_BANK_CONFLICT(255) = 0;
+
+        for (int delta = 128; delta >= 1; delta /= 2)
+        {
+            if (tId < 128 / delta)
+            {
+                int rootIdx = 255 - tId * delta * 2;
+                int leftIdx = rootIdx - delta;
+                int tmp = HIST_SMEM_NO_BANK_CONFLICT(leftIdx);
+                HIST_SMEM_NO_BANK_CONFLICT(leftIdx) = HIST_SMEM_NO_BANK_CONFLICT(rootIdx);
+                HIST_SMEM_NO_BANK_CONFLICT(rootIdx) += tmp;
+            }
+            __syncthreads();
+        }
+
+        // Step2 Phase3 - Convert exclusive sum to inclusive sum
+        int tmp = HIST_SMEM_NO_BANK_CONFLICT(tId);
+        __syncthreads();
+        if (tId >= 1)
+            HIST_SMEM_NO_BANK_CONFLICT(tId - 1) = tmp;
+        if (tId == 255)
+            HIST_SMEM_NO_BANK_CONFLICT(tId) = tmp + hist[tId];
+        __syncthreads();
+
+        // Step3 - Scale values to build lut
+
+        lut[tId] = saturate_cast<unsigned char>(HIST_SMEM_NO_BANK_CONFLICT(tId) * (255.0f / (size - minNonZeroVal)));
+
+#undef HIST_SMEM_NO_BANK_CONFLICT
+    }
+
+    void buildLut(PtrStepSzi hist, PtrStepSzb lut, int size, cudaStream_t stream)
+    {
+        buildLutKernel<<<1, 256, 0, stream>>>(hist.data, lut.data, size);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
 
diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp
index e616c5a..c252abc 100644
--- a/modules/cudaimgproc/src/histogram.cpp
+++ b/modules/cudaimgproc/src/histogram.cpp
@@ -102,7 +102,8 @@ void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, St
 
 namespace hist
 {
-    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream);
+    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const uchar* lut, cudaStream_t stream);
+    void buildLut(PtrStepSzi hist, PtrStepSzb lut, int size, cudaStream_t stream);
 }
 
 void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, Stream& _stream)
@@ -114,26 +115,21 @@ void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, Stream& _stream)
     _dst.create(src.size(), src.type());
     GpuMat dst = _dst.getGpuMat();
 
-    int intBufSize;
-    nppSafeCall( nppsIntegralGetBufferSize_32s(256, &intBufSize) );
-
-    size_t bufSize = intBufSize + 2 * 256 * sizeof(int);
+    size_t bufSize = 256 * sizeof(int) + 256 * sizeof(uchar);
 
     BufferPool pool(_stream);
     GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), CV_8UC1);
 
     GpuMat hist(1, 256, CV_32SC1, buf.data);
-    GpuMat lut(1, 256, CV_32SC1, buf.data + 256 * sizeof(int));
-    GpuMat intBuf(1, intBufSize, CV_8UC1, buf.data + 2 * 256 * sizeof(int));
+    GpuMat lut(1, 256, CV_8UC1, buf.data + 256 * sizeof(int));
 
     cuda::calcHist(src, hist, _stream);
 
     cudaStream_t stream = StreamAccessor::getStream(_stream);
-    NppStreamHandler h(stream);
 
-    nppSafeCall( nppsIntegral_32s(hist.ptr<Npp32s>(), lut.ptr<Npp32s>(), 256, intBuf.ptr<Npp8u>()) );
+    hist::buildLut(hist, lut, src.rows * src.cols, stream);
 
-    hist::equalizeHist(src, dst, lut.ptr<int>(), stream);
+    hist::equalizeHist(src, dst, lut.data, stream);
 }
 
 ////////////////////////////////////////////////////////////////////////
diff --git a/modules/cudaimgproc/test/test_histogram.cpp b/modules/cudaimgproc/test/test_histogram.cpp
index 6af8eb2..ac7c5e2 100644
--- a/modules/cudaimgproc/test/test_histogram.cpp
+++ b/modules/cudaimgproc/test/test_histogram.cpp
@@ -208,7 +208,7 @@ CUDA_TEST_P(EqualizeHist, Async)
     cv::Mat dst_gold;
     cv::equalizeHist(src, dst_gold);
 
-    EXPECT_MAT_NEAR(dst_gold, dst, 3.0);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
 }
 
 CUDA_TEST_P(EqualizeHist, Accuracy)
@@ -221,13 +221,91 @@ CUDA_TEST_P(EqualizeHist, Accuracy)
     cv::Mat dst_gold;
     cv::equalizeHist(src, dst_gold);
 
-    EXPECT_MAT_NEAR(dst_gold, dst, 3.0);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
 }
 
 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, EqualizeHist, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES));
 
+TEST(EqualizeHistIssue, Issue18035)
+{
+    std::vector<std::string> imgPaths;
+    imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/3MP.png");
+    imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/5MP.png");
+    imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/airplane.png");
+    imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/baboon.png");
+    imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/box.png");
+    imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/box_in_scene.png");
+    imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/fruits.png");
+    imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/fruits_ecc.png");
+    imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/graffiti.png");
+    imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/lena.png");
+
+    for (int i = 0; i < imgPaths.size(); ++i)
+    {
+        std::string imgPath = imgPaths[i];
+        cv::Mat src = cv::imread(imgPath, cv::IMREAD_GRAYSCALE);
+        src = src / 30;
+
+        cv::cuda::GpuMat d_src, dst;
+        d_src.upload(src);
+        cv::cuda::equalizeHist(d_src, dst);
+
+        cv::Mat dst_gold;
+        cv::equalizeHist(src, dst_gold);
+
+        EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+    }
+}
+
+PARAM_TEST_CASE(EqualizeHistExtreme, cv::cuda::DeviceInfo, cv::Size, int)
+{
+    cv::cuda::DeviceInfo devInfo;
+    cv::Size size;
+    int val;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        val = GET_PARAM(2);
+
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
+};
+
+CUDA_TEST_P(EqualizeHistExtreme, Case1)
+{
+    cv::Mat src(size, CV_8UC1, val);
+
+    cv::cuda::GpuMat dst;
+    cv::cuda::equalizeHist(loadMat(src), dst);
+
+    cv::Mat dst_gold;
+    cv::equalizeHist(src, dst_gold);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+CUDA_TEST_P(EqualizeHistExtreme, Case2)
+{
+    cv::Mat src = randomMat(size, CV_8UC1, val);
+
+    cv::cuda::GpuMat dst;
+    cv::cuda::equalizeHist(loadMat(src), dst);
+
+    cv::Mat dst_gold;
+    cv::equalizeHist(src, dst_gold);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, EqualizeHistExtreme, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Range(0, 256)));
+
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // CLAHE
 
-- 
2.7.4