src.locateROI(whole, offset);
- if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048)
+ if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048 && offset.x % 16 == 0 && (src.cols + 63) / 64 <= (src.step - offset.x))
{
- GpuMat srcAlligned;
+ ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
- if (src.cols % 16 == 0 && src.rows % 8 == 0 && offset.x % 16 == 0 && offset.y % 8 == 0)
- srcAlligned = src;
- else
- {
- ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 15) / 16) * 16, src.type(), buffer);
-
- GpuMat inner = buffer(Rect(0, 0, src.cols, src.rows));
-
- if (s)
- {
- s.enqueueMemSet(buffer, Scalar::all(0));
- s.enqueueCopy(src, inner);
- }
- else
- {
- buffer.setTo(Scalar::all(0));
- src.copyTo(inner);
- }
-
- srcAlligned = buffer;
- }
-
- sum.create(srcAlligned.rows + 1, srcAlligned.cols + 4, CV_32SC1);
+ cv::gpu::device::imgproc::shfl_integral_gpu(src, buffer, stream);
+ sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
if (s)
s.enqueueMemSet(sum, Scalar::all(0));
else
sum.setTo(Scalar::all(0));
- GpuMat inner = sum(Rect(4, 1, srcAlligned.cols, srcAlligned.rows));
-
- cv::gpu::device::imgproc::shfl_integral_gpu(srcAlligned, inner, stream);
+ GpuMat inner = sum(Rect(1, 1, src.cols, src.rows));
+ GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
- sum = sum(Rect(3, 0, src.cols + 1, src.rows + 1));
+ if (s)
+ s.enqueueCopy(res, inner);
+ else
+ res.copyTo(inner);
}
else
{