cudaStream_t stream = StreamAccessor::getStream(s);\r
\r
DeviceInfo info;\r
+ cv::Size whole;\r
+ cv::Point offset;\r
\r
- if (info.supports(WARP_SHUFFLE_FUNCTIONS))\r
+ src.locateROI(whole, offset);\r
+\r
+ if (info.supports(WARP_SHUFFLE_FUNCTIONS) )\r
{\r
GpuMat srcAlligned;\r
\r
- if (src.cols % 16 == 0 && src.rows % 8 == 0)\r
+ if (src.cols % 16 == 0 && src.rows % 8 == 0 && offset.x % 16 == 0 && offset.y % 8 == 0)\r
srcAlligned = src;\r
else\r
{\r
srcAlligned = buffer;\r
}\r
\r
- sum.create(srcAlligned.rows + 1, srcAlligned.cols + 1, CV_32SC1);\r
+ sum.create(srcAlligned.rows + 1, srcAlligned.cols + 4, CV_32SC1);\r
\r
if (s)\r
s.enqueueMemSet(sum, Scalar::all(0));\r
else\r
sum.setTo(Scalar::all(0));\r
\r
- GpuMat inner = sum(Rect(1, 1, srcAlligned.cols, srcAlligned.rows));\r
+ GpuMat inner = sum(Rect(4, 1, srcAlligned.cols, srcAlligned.rows));\r
\r
cv::gpu::device::imgproc::shfl_integral_gpu(srcAlligned, inner, stream);\r
\r
- if (srcAlligned.data != src.data)\r
- sum = sum(Rect(0, 0, src.cols + 1, src.rows + 1));\r
+ sum = sum(Rect(3, 0, src.cols + 1, src.rows + 1));\r
}\r
else\r
{\r