From f38596b783911dd207c1b121f2e32f32ad88d48d Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 10 Oct 2011 11:58:47 +0000 Subject: [PATCH] implemented brute force convolve for small kernel sizes --- modules/gpu/perf/perf_imgproc.cpp | 11 +++--- modules/gpu/src/cuda/imgproc.cu | 78 +++++++++++++++++++++++++++++++++++++++ modules/gpu/src/imgproc.cpp | 22 +++++++++++ modules/gpu/test/test_imgproc.cpp | 55 +++++++++++++++++++++++++++ 4 files changed, 160 insertions(+), 6 deletions(-) diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index 81dd559..3f72641 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -737,7 +737,7 @@ PERF_TEST_P(DevInfo_Size, dft, testing::Combine(testing::ValuesIn(devices()), PERF_TEST_P(DevInfo_Int_Int, convolve, testing::Combine(testing::ValuesIn(devices()), testing::Values(512, 1024, 1536, 2048, 2560, 3072, 3584), - testing::Values(27, 32, 64))) + testing::Values(3, 9, 27, 32, 64))) { DeviceInfo devInfo = std::tr1::get<0>(GetParam()); int image_size = std::tr1::get<1>(GetParam()); @@ -745,13 +745,12 @@ PERF_TEST_P(DevInfo_Int_Int, convolve, testing::Combine(testing::ValuesIn(device setDevice(devInfo.deviceID()); - Mat image_host(image_size, image_size, CV_32FC1); - Mat templ_host(templ_size, templ_size, CV_32FC1); + GpuMat image = createContinuous(image_size, image_size, CV_32FC1); + GpuMat templ = createContinuous(templ_size, templ_size, CV_32FC1); - declare.in(image_host, templ_host, WARMUP_RNG); + image.setTo(Scalar(1.0)); + templ.setTo(Scalar(1.0)); - GpuMat image(image_host); - GpuMat templ(templ_host); GpuMat dst; ConvolveBuf buf; diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu index 79b5e10..d2ce30e 100644 --- a/modules/gpu/src/cuda/imgproc.cu +++ b/modules/gpu/src/cuda/imgproc.cu @@ -951,6 +951,84 @@ namespace cv { namespace gpu { namespace imgproc } + ////////////////////////////////////////////////////////////////////////// + // convolve + + #define CONVOLVE_MAX_KERNEL_SIZE 17 + + __constant__ float c_convolveKernel[CONVOLVE_MAX_KERNEL_SIZE * CONVOLVE_MAX_KERNEL_SIZE]; + + __global__ void convolve(const DevMem2Df src, PtrStepf dst, int kWidth, int kHeight) + { + __shared__ float smem[16 + 2 * 8][16 + 2 * 8]; + + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + // x | x 0 | 0 + // ----------- + // x | x 0 | 0 + // 0 | 0 0 | 0 + // ----------- + // 0 | 0 0 | 0 + smem[threadIdx.y][threadIdx.x] = src.ptr(min(max(y - 8, 0), src.rows - 1))[min(max(x - 8, 0), src.cols - 1)]; + + // 0 | 0 x | x + // ----------- + // 0 | 0 x | x + // 0 | 0 0 | 0 + // ----------- + // 0 | 0 0 | 0 + smem[threadIdx.y][threadIdx.x + 16] = src.ptr(min(max(y - 8, 0), src.rows - 1))[min(x + 8, src.cols - 1)]; + + // 0 | 0 0 | 0 + // ----------- + // 0 | 0 0 | 0 + // x | x 0 | 0 + // ----------- + // x | x 0 | 0 + smem[threadIdx.y + 16][threadIdx.x] = src.ptr(min(y + 8, src.rows - 1))[min(max(x - 8, 0), src.cols - 1)]; + + // 0 | 0 0 | 0 + // ----------- + // 0 | 0 0 | 0 + // 0 | 0 x | x + // ----------- + // 0 | 0 x | x + smem[threadIdx.y + 16][threadIdx.x + 16] = src.ptr(min(y + 8, src.rows - 1))[min(x + 8, src.cols - 1)]; + + __syncthreads(); + + if (x < src.cols && y < src.rows) + { + float res = 0; + + for (int i = 0; i < kHeight; ++i) + { + for (int j = 0; j < kWidth; ++j) + { + res += smem[threadIdx.y + 8 - kHeight / 2 + i][threadIdx.x + 8 - kWidth / 2 + j] * c_convolveKernel[i * kWidth + j]; + } + } + + dst.ptr(y)[x] = res; + } + } + + void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel) + { + cudaSafeCall(cudaMemcpyToSymbol(c_convolveKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); + + const dim3 block(16, 16); + const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); + + convolve<<>>(src, dst, kWidth, kHeight); + cudaSafeCall(cudaGetLastError()); + + cudaSafeCall(cudaDeviceSynchronize()); + } + + }}} diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index 47b0998..35878c4 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -1576,6 +1576,10 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, convolve(image, templ, result, ccorr, buf); } +namespace cv { namespace gpu { namespace imgproc +{ + void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel); +}}} void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf) @@ -1586,6 +1590,24 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, CV_Assert(image.type() == CV_32F); CV_Assert(templ.type() == CV_32F); + if (templ.cols < 13 && templ.rows < 13) + { + result.create(image.size(), CV_32F); + GpuMat contKernel; + + if (templ.isContinuous()) + contKernel = templ; + else + { + contKernel = createContinuous(templ.size(), templ.type()); + templ.copyTo(contKernel); + } + + imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr()); + + return; + } + buf.create(image.size(), templ.size()); result.create(buf.result_size, CV_32F); diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp index 30dbb9b..1ff6cf2 100644 --- a/modules/gpu/test/test_imgproc.cpp +++ b/modules/gpu/test/test_imgproc.cpp @@ -4221,4 +4221,59 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Canny, testing::Combine( testing::Values(3, 5), testing::Values(false, true))); +//////////////////////////////////////////////////////// +// convolve + +struct Convolve: testing::TestWithParam< std::tr1::tuple > +{ + cv::gpu::DeviceInfo devInfo; + int ksize; + + cv::Size size; + cv::Mat src; + cv::Mat kernel; + + cv::Mat dst_gold; + + virtual void SetUp() + { + devInfo = std::tr1::get<0>(GetParam()); + ksize = std::tr1::get<1>(GetParam()); + + cv::gpu::setDevice(devInfo.deviceID()); + + cv::RNG& rng = cvtest::TS::ptr()->get_rng(); + + size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); + + src = cvtest::randomMat(rng, size, CV_32FC1, 0.0, 255.0, false); + kernel = cvtest::randomMat(rng, cv::Size(ksize, ksize), CV_32FC1, 0.0, 1.0, false); + + cv::filter2D(src, dst_gold, CV_32F, kernel, cv::Point(-1, -1), 0, cv::BORDER_REPLICATE); + } +}; + +TEST_P(Convolve, Accuracy) +{ + PRINT_PARAM(devInfo); + PRINT_PARAM(ksize); + + cv::Mat dst; + + ASSERT_NO_THROW( + cv::gpu::GpuMat d_dst; + + cv::gpu::convolve(cv::gpu::GpuMat(src), cv::gpu::GpuMat(kernel), d_dst); + + d_dst.download(dst); + ); + + EXPECT_MAT_NEAR(dst, dst_gold, 1e-2); +} + + +INSTANTIATE_TEST_CASE_P(ImgProc, Convolve, testing::Combine( + testing::ValuesIn(devices()), + testing::Values(3, 5, 7, 9, 11))); + #endif // HAVE_CUDA -- 2.7.4