1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Copyright (C) 2017, Intel Corporation, all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
43 #include "../precomp.hpp"
44 #include "layers_common.hpp"
45 #include "opencv2/core/hal/intrin.hpp"
46 #include "op_halide.hpp"
47 #include "opencl_kernels_dnn.hpp"
53 using namespace cv::dnn::ocl4dnn;
// Scales an ROI coordinate and rounds the result to the nearest integer,
// with halves rounded away from zero.
//
// f     - coordinate to scale
// scale - multiplier applied to f before rounding
//
// The rounding direction is chosen from the sign of the scaled value rather
// than the sign of f, so the result is also correct when scale is negative;
// for non-negative scale the result is the same as before. The final
// conversion truncates, so the scaled value is expected to fit in an int.
static inline int scaleAndRoundRoi(float f, float scale)
{
    const float scaled = f * scale;
    return static_cast<int>(scaled + (scaled >= 0.f ? 0.5f : -0.5f));
}
65 class PoolingLayerImpl : public PoolingLayer
// Builds the layer from its LayerParams: reads the pooling type name, the
// kernel / padding / stride settings and the ROI-pooling options
// (pooled size and spatial scale).
68 PoolingLayerImpl(const LayerParams& params)
72 globalPooling = false;
// A "pool" entry names the pooling type. The name is lower-cased first, so
// the comparisons below are case-insensitive.
74 if (params.has("pool"))
76 String pool = params.get<String>("pool").toLowerCase();
79 else if (pool == "ave")
81 else if (pool == "stochastic")
// Any name not matched by the comparisons above is rejected.
84 CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
// Kernel size, padding, stride, the global-pooling flag and the padding mode
// are all filled in by getPoolingKernelParams.
85 getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
86 pad.height, pad.width, stride.height, stride.width, padMode);
// Without a "pool" entry, the presence of any ROI-pooling key selects this
// alternative branch.
88 else if (params.has("pooled_w") || params.has("pooled_h") || params.has("spatial_scale"))
92 setParamsFrom(params);
// Defaults used when a key is absent: ceil_mode = true,
// pooled_w = pooled_h = 1, spatial_scale = 1.
93 ceilMode = params.get<bool>("ceil_mode", true);
94 pooledSize.width = params.get<uint32_t>("pooled_w", 1);
95 pooledSize.height = params.get<uint32_t>("pooled_h", 1);
96 spatialScale = params.get<float>("spatial_scale", 1);
// OpenCL pooling primitive (float); assigned in forward_ocl from an
// OCL4DNNPoolConfig describing this layer.
100 Ptr<OCL4DNNPool<float> > poolOp;
// Called with the concrete input and output blobs; recomputes the padding
// from their actual spatial sizes.
103 void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
105 CV_Assert(!inputs.empty());
// Both sizes are built as (width, height) from dims 3 and 2 of the first
// input and the first output.
107 cv::Size inp(inputs[0]->size[3], inputs[0]->size[2]),
108 out(outputs[0].size[3], outputs[0].size[2]);
// NOTE(review): Size(1, 1) is presumably the dilation and `pad` the value
// being updated - confirm against getConvPoolPaddings.
115 getConvPoolPaddings(inp, out, kernel, stride, padMode, Size(1, 1), pad);
// Reports whether this layer can run on the given backend.
118 virtual bool supportBackend(int backendId)
// The default backend is always accepted. Halide is accepted only when it is
// available and the type is MAX, or AVE with zero padding. Since && binds
// tighter than ||, the expression groups as
//   DEFAULT || (HALIDE && haveHalide() && (MAX || (AVE && no padding))).
120 return backendId == DNN_BACKEND_DEFAULT ||
121 backendId == DNN_BACKEND_HALIDE && haveHalide() &&
122 (type == MAX || type == AVE && !pad.width && !pad.height);
// OpenCL forward pass: configures an OCL4DNNPool primitive from the layer
// settings and runs it on every input UMat.
126 bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
128 std::vector<UMat> inputs;
129 std::vector<UMat> outputs;
131 inps.getUMatVector(inputs);
132 outs.getUMatVector(outputs);
// The configuration is taken from the first input and output only.
136 OCL4DNNPoolConfig config;
138 config.in_shape = shape(inputs[0]);
139 config.out_shape = shape(outputs[0]);
140 config.kernel = kernel;
142 config.stride = stride;
143 config.channels = inputs[0].size[1];
// Any type other than MAX or AVE is mapped to the stochastic method.
144 config.pool_method = type == MAX ? LIBDNN_POOLING_METHOD_MAX :
145 (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
146 LIBDNN_POOLING_METHOD_STO);
147 poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
150 for (size_t ii = 0; ii < inputs.size(); ii++)
152 UMat& inpMat = inputs[ii];
// MAX pooling has two outputs per input: the pooled values at index 2*ii and
// the mask at 2*ii + 1. Other types have one output per input and get an
// empty mask.
153 int out_index = (type == MAX) ? 2 : 1;
154 UMat& outMat = outputs[out_index * ii];
155 UMat maskMat = (type == MAX) ? outputs[2 * ii + 1] : UMat();
// Both buffers are required to start at offset 0.
157 CV_Assert(inpMat.offset == 0 && outMat.offset == 0);
// A false result from Forward is treated as a failure of the OpenCL path.
159 if (!poolOp->Forward(inpMat, outMat, maskMat))
// Array-based entry point: offers the work to the OpenCL implementation and
// otherwise hands it to Layer::forward_fallback.
167 void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
170 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
// The OpenCL path is considered only when the preferable target is OpenCL and
// the default device passes the Intel check wrapped in OCL_PERFORMANCE_CHECK.
172 CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
173 OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
174 forward_ocl(inputs_arr, outputs_arr, internals_arr))
176 Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
// CPU entry point: checks the blob counts expected by each pooling type and
// dispatches to the matching helper.
179 void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
182 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
// Max pooling: one input, two outputs (outputs[1] is passed as the mask).
187 CV_Assert(inputs.size() == 1, outputs.size() == 2);
188 maxPooling(*inputs[0], outputs[0], outputs[1]);
// Average pooling: one input, one output.
191 CV_Assert(inputs.size() == 1, outputs.size() == 1);
192 avePooling(*inputs[0], outputs[0]);
// ROI pooling: inputs[0] is the source blob and inputs[1] is passed as rois.
195 CV_Assert(inputs.size() == 2, outputs.size() == 1);
196 roiPooling(*inputs[0], *inputs[1], outputs[0]);
// Anything else is reported as not implemented.
199 CV_Error(Error::StsNotImplemented, "Not implemented");
// Creates the Halide backend node for this layer by delegating to the max or
// average pooling builder; falls through to an empty node otherwise.
204 virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
207 return initMaxPoolingHalide(inputs);
208 else if (type == AVE)
209 return initAvePoolingHalide(inputs);
// Empty node: no Halide implementation is returned on this path.
211 return Ptr<BackendNode>();
// Parallel loop body that computes MAX, AVE or ROI pooling over stripes of
// the output blob. Instances are set up and launched by the static run().
214 class PoolingInvoker : public ParallelLoopBody
// Source blob and ROI blob; both are only read.
217 const Mat* src, *rois;
219 Size kernel, stride, pad;
// One source offset per kernel element, filled in by run().
222 std::vector<int> ofsbuf;
// Default state: null blob pointers, zero stripes, MAX pooling, index
// tracking off and a zero spatial scale.
226 PoolingInvoker() : src(0), rois(0), dst(0), mask(0), nstripes(0),
227 computeMaxIdx(0), poolingType(MAX), spatialScale(0) {}
// Validates the blobs, fills in an invoker and executes it over `nstripes`
// stripes with parallel_for_.
//
// src, dst      - continuous 4-D CV_32F blobs with equal channel counts
// rois          - ROI blob; for ROI pooling its first dimension must equal
//                 dst's first dimension
// mask          - empty, or same type and size as dst
// computeMaxIdx - whether max pooling also records the index of the maximum
229 static void run(const Mat& src, const Mat& rois, Mat& dst, Mat& mask, Size kernel,
230 Size stride, Size pad, int poolingType, float spatialScale,
231 bool computeMaxIdx, int nstripes)
// Batch sizes must match, except for ROI pooling where the output batch is
// the number of ROIs instead.
233 CV_Assert(src.isContinuous() && dst.isContinuous() &&
234 src.type() == CV_32F && src.type() == dst.type() &&
235 src.dims == 4 && dst.dims == 4 &&
236 (poolingType == ROI && dst.size[0] == rois.size[0] ||
237 src.size[0] == dst.size[0]) && src.size[1] == dst.size[1] &&
238 (mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));
249 p.nstripes = nstripes;
250 p.computeMaxIdx = computeMaxIdx;
251 p.poolingType = poolingType;
252 p.spatialScale = spatialScale;
// Precompute, for kernel element (i, j), its offset from the top-left corner
// of a window inside a source plane whose row length is src.size[3].
256 p.ofsbuf.resize(kernel.width*kernel.height);
257 for( int i = 0; i < kernel.height; i++ )
258 for( int j = 0; j < kernel.width; j++ )
259 p.ofsbuf[i*kernel.width + j] = src.size[3]*i + j;
262 parallel_for_(Range(0, nstripes), p, nstripes);
// Computes the output elements that belong to stripes [r.start, r.end).
// MAX and ROI pooling share the maximum search; AVE pooling sums the window
// and scales the sum. Each branch has a path that produces 8 outputs at once
// with two 4-lane vectors, and a scalar path.
265 void operator()(const Range& r) const
// dst dims 1, 2, 3 are used as channels, height and width.
267 int channels = dst->size[1], width = dst->size[3], height = dst->size[2];
268 int inp_width = src->size[3], inp_height = src->size[2];
// The flattened output is cut into nstripes chunks of equal size (rounded
// up); the end of the last chunk is clipped to `total`.
269 size_t total = dst->total();
270 size_t stripeSize = (total + nstripes - 1)/nstripes;
271 size_t stripeStart = r.start*stripeSize;
272 size_t stripeEnd = std::min(r.end*stripeSize, total);
273 int kernel_w = kernel.width, kernel_h = kernel.height;
274 int pad_w = pad.width, pad_h = pad.height;
275 int stride_w = stride.width, stride_h = stride.height;
276 bool compMaxIdx = computeMaxIdx;
279 const int* ofsptr = &ofsbuf[0];
// idx00 holds the source-index offsets of four neighbouring outputs, which
// lie stride_w apart. Indices are tracked as floats.
280 v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
281 v_float32x4 ones = v_setall_f32(1.f);
// NOTE(review): presumably the index jump from the end of one kernel row to
// the start of the next - confirm where idx_delta is consumed.
282 v_float32x4 idx_delta = v_setall_f32((float)(inp_width - kernel_w));
285 for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
// The flat output offset is peeled apart into x0, y0, c and n, innermost
// coordinate first.
288 int x0 = (int)(ofs % width);
290 int y0 = (int)(ofs % height);
292 int c = (int)(ofs % channels);
293 int n = (int)(ofs / channels);
296 const float *srcData;
299 if (poolingType == ROI)
// In ROI mode n selects a row of the rois blob. Within that row, element 0
// is used as the source batch index, elements 1 and 3 as the x start / end
// and elements 2 and 4 as the y start / end, each scaled by spatialScale.
301 const float *roisData = rois->ptr<float>(n);
302 int ystartROI = scaleAndRoundRoi(roisData[2], spatialScale);
303 int yendROI = scaleAndRoundRoi(roisData[4], spatialScale);
// The ROI is treated as at least one row tall.
304 int roiHeight = std::max(yendROI - ystartROI + 1, 1);
305 roiRatio = (float)roiHeight / height;
// Output row y0 covers ROI rows [ystart, yend); the end is rounded up.
307 ystart = ystartROI + y0 * roiRatio;
308 yend = ystartROI + std::ceil((y0 + 1) * roiRatio);
310 xstartROI = scaleAndRoundRoi(roisData[1], spatialScale);
311 int xendROI = scaleAndRoundRoi(roisData[3], spatialScale);
// The ROI is treated as at least one column wide.
312 int roiWidth = std::max(xendROI - xstartROI + 1, 1);
// roiRatio is overwritten here with the horizontal ratio, which is what the
// per-column computation further down reads.
313 roiRatio = (float)roiWidth / width;
// NOTE(review): only the upper bound of the batch index is checked; a
// negative roisData[0] is not rejected before it is used as an index.
315 CV_Assert(roisData[0] < src->size[0]);
316 srcData = src->ptr<float>(roisData[0], c);
// Regular pooling: the window starts at y0 * stride_h - pad_h and its end is
// capped at the padded image height.
320 ystart = y0 * stride_h - pad_h;
321 yend = min(ystart + kernel_h, inp_height + pad_h);
322 srcData = src->ptr<float>(n, c);
// ydelta is the window height before clipping to the image, so it still
// counts padding rows; it is used in the average-pooling divisor below.
324 int ydelta = yend - ystart;
325 ystart = max(ystart, 0);
326 yend = min(yend, inp_height);
// Pointers to output row y0 of plane (n, c). The mask pointer is null when
// the mask has no data.
327 float *dstData = dst->ptr<float>(n, c, y0);
328 float *dstMaskData = mask->data ? mask->ptr<float>(n, c, y0) : 0;
// Number of outputs handled in this pass: up to the end of the current row
// or the end of the stripe, whichever comes first.
330 int delta = std::min((int)(stripeEnd - ofs0), width - x0);
334 if( poolingType == MAX || poolingType == ROI)
335 for( ; x0 < x1; x0++ )
338 if (poolingType == ROI)
340 xstart = xstartROI + x0 * roiRatio;
341 xend = xstartROI + std::ceil((x0 + 1) * roiRatio);
345 xstart = x0 * stride_w - pad_w;
346 xend = xstart + kernel_w;
348 xstart = max(xstart, 0);
349 xend = min(xend, inp_width);
// Empty window after clipping: when indices are tracked, the mask entry is
// set to -1.
350 if (xstart >= xend || ystart >= yend)
353 if (compMaxIdx && dstMaskData)
354 dstMaskData[x0] = -1;
// Vector path condition: the window does not start at the left border, at
// least 8 outputs remain in this pass, and the window of the 8th output ends
// before the right border of the image.
358 if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
// Variant that also tracks the index of the maximum. Two 4-lane accumulators
// cover 8 outputs; indices start at -1 and values at -FLT_MAX.
362 v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
363 v_float32x4 max_val1 = max_val0;
364 v_float32x4 max_idx0 = v_setall_f32(-1.f);
365 v_float32x4 max_idx1 = max_idx0;
366 int index0 = ystart * inp_width + xstart;
367 v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
368 v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));
370 for (int y = ystart; y < yend; ++y)
372 for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
374 const int index = y * inp_width + x;
// Gather the same kernel tap for 8 outputs spaced stride_w apart.
375 v_float32x4 v0(srcData[index], srcData[index + stride_w],
376 srcData[index + stride_w*2], srcData[index + stride_w*3]);
377 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
378 srcData[index + stride_w*6], srcData[index + stride_w*7]);
// The comparison is strict, so on ties the earlier index is kept.
379 max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
380 max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
381 max_val0 = v_max(max_val0, v0);
382 max_val1 = v_max(max_val1, v1);
387 v_store(dstData + x0, max_val0);
388 v_store(dstData + x0 + 4, max_val1);
391 v_store(dstMaskData + x0, max_idx0);
392 v_store(dstMaskData + x0 + 4, max_idx1);
// Variant that computes values only.
398 v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
399 v_float32x4 max_val1 = max_val0;
// When the window was not clipped vertically, the precomputed kernel offsets
// in ofsptr can be used directly.
401 if( yend - ystart == kernel_h )
403 const float* srcData1 = srcData + ystart*inp_width + xstart;
// Contiguous loads of 8 floats per tap.
// NOTE(review): presumably the stride_w == 1 case - the guarding condition
// is not shown here.
405 for (int k = 0; k < kernel_w*kernel_h; k++)
407 int index = ofsptr[k];
408 v_float32x4 v0 = v_load(srcData1 + index);
409 v_float32x4 v1 = v_load(srcData1 + index + 4);
410 max_val0 = v_max(max_val0, v0);
411 max_val1 = v_max(max_val1, v1);
// Stride 2: load 16 consecutive floats and keep every second one.
// _MM_SHUFFLE(2, 0, 2, 0) selects elements 0 and 2 of each operand.
414 else if( stride_w == 2 )
415 for (int k = 0; k < kernel_w*kernel_h; k++)
417 int index = ofsptr[k];
418 v_float32x4 v00 = v_load(srcData1 + index), v01 = v_load(srcData1 + index + 4);
419 v_float32x4 v0(_mm_shuffle_ps(v00.val, v01.val, _MM_SHUFFLE(2, 0, 2, 0)));
420 v_float32x4 v10 = v_load(srcData1 + index + 8), v11 = v_load(srcData1 + index + 12);
421 v_float32x4 v1(_mm_shuffle_ps(v10.val, v11.val, _MM_SHUFFLE(2, 0, 2, 0)));
422 max_val0 = v_max(max_val0, v0);
423 max_val1 = v_max(max_val1, v1);
// Any other stride: gather the 8 inputs element by element.
427 for (int k = 0; k < kernel_w*kernel_h; k++)
429 int index = ofsptr[k];
430 v_float32x4 v0(srcData1[index], srcData1[index + stride_w],
431 srcData1[index + stride_w*2], srcData1[index + stride_w*3]);
432 v_float32x4 v1(srcData1[index + stride_w*4], srcData1[index + stride_w*5],
433 srcData1[index + stride_w*6], srcData1[index + stride_w*7]);
434 max_val0 = v_max(max_val0, v0);
435 max_val1 = v_max(max_val1, v1);
// Vertically clipped window: walk the clipped range explicitly.
440 for (int y = ystart; y < yend; ++y)
442 for (int x = xstart; x < xend; ++x)
444 const int index = y * inp_width + x;
445 v_float32x4 v0(srcData[index], srcData[index + stride_w],
446 srcData[index + stride_w*2], srcData[index + stride_w*3]);
447 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
448 srcData[index + stride_w*6], srcData[index + stride_w*7]);
449 max_val0 = v_max(max_val0, v0);
450 max_val1 = v_max(max_val1, v1);
454 v_store(dstData + x0, max_val0);
455 v_store(dstData + x0 + 4, max_val1);
// Scalar path: one output at a time.
462 float max_val = -FLT_MAX;
// Scan that also records the index of the maximum for the mask.
466 for (int y = ystart; y < yend; ++y)
467 for (int x = xstart; x < xend; ++x)
469 const int index = y * inp_width + x;
470 float val = srcData[index];
478 dstData[x0] = max_val;
480 dstMaskData[x0] = max_index;
// Scan that computes the value only.
484 for (int y = ystart; y < yend; ++y)
485 for (int x = xstart; x < xend; ++x)
487 const int index = y * inp_width + x;
488 float val = srcData[index];
489 max_val = std::max(max_val, val);
492 dstData[x0] = max_val;
// Average pooling.
498 for( ; x0 < x1; x0++ )
500 int xstart = x0 * stride_w - pad_w;
501 int xend = min(xstart + kernel_w, inp_width + pad_w);
// xdelta is measured before clipping to the image, so the divisor below
// counts padding columns (and rows, through ydelta) inside the padded area.
502 int xdelta = xend - xstart;
503 xstart = max(xstart, 0);
504 xend = min(xend, inp_width);
505 float inv_kernel_area = 1.f/(ydelta*xdelta);
// Same vector path condition as in the max branch. The one inv_kernel_area
// computed for x0 is applied to all 8 outputs.
508 if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
510 v_float32x4 sum_val0 = v_setzero_f32(), sum_val1 = v_setzero_f32();
511 v_float32x4 ikarea = v_setall_f32(inv_kernel_area);
513 for (int y = ystart; y < yend; ++y)
515 for (int x = xstart; x < xend; ++x)
517 const int index = y * inp_width + x;
518 v_float32x4 v0(srcData[index], srcData[index + stride_w],
519 srcData[index + stride_w*2], srcData[index + stride_w*3]);
520 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
521 srcData[index + stride_w*6], srcData[index + stride_w*7]);
526 v_store(dstData + x0, sum_val0*ikarea);
527 v_store(dstData + x0 + 4, sum_val1*ikarea);
// Scalar path: sum the clipped window, then scale.
534 for (int y = ystart; y < yend; ++y)
535 for (int x = xstart; x < xend; ++x)
537 const int index = y * inp_width + x;
538 float val = srcData[index];
542 dstData[x0] = sum_val*inv_kernel_area;
// Max pooling on the CPU: runs PoolingInvoker with the layer settings and
// one stripe per thread reported by getNumThreads().
550 void maxPooling(Mat &src, Mat &dst, Mat &mask)
552 const int nstripes = getNumThreads();
// NOTE(review): `rois` is not a parameter of this method; presumably a local
// Mat left empty - confirm.
554 PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
// Average pooling on the CPU: runs PoolingInvoker with the layer settings
// and one stripe per thread reported by getNumThreads().
557 void avePooling(Mat &src, Mat &dst)
559 const int nstripes = getNumThreads();
// NOTE(review): `rois` and `mask` are not parameters of this method;
// presumably locals left empty - confirm.
561 PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
// ROI pooling on the CPU: runs PoolingInvoker on the source blob and the
// ROI blob with one stripe per thread reported by getNumThreads().
564 void roiPooling(const Mat &src, const Mat &rois, Mat &dst)
566 const int nstripes = getNumThreads();
// NOTE(review): `mask` is not a parameter of this method; presumably a local
// Mat left empty - confirm.
568 PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
// Builds the Halide function for max pooling. The function yields two values
// per output element: the maximum and the flattened source index where it
// was found, cast to float.
571 virtual Ptr<BackendNode> initMaxPoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
574 Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
575 const int inWidth = inputBuffer.width();
576 const int inHeight = inputBuffer.height();
578 Halide::Var x("x"), y("y"), c("c"), n("n");
// The function is named after the layer unless the layer name is empty.
579 Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
// Reduction domain over the kernel window: r.x in [0, kernel.width),
// r.y in [0, kernel.height).
580 Halide::RDom r(0, kernel.width, 0, kernel.height);
// kx, ky are the source coordinates of kernel tap (r.x, r.y). With padding
// they are clamped on both sides to the image; without padding only the
// upper bound is applied.
582 if (pad.width || pad.height)
584 kx = clamp(x * stride.width + r.x - pad.width, 0, inWidth - 1);
585 ky = clamp(y * stride.height + r.y - pad.height, 0, inHeight - 1);
589 kx = min(x * stride.width + r.x, inWidth - 1);
590 ky = min(y * stride.height + r.y, inHeight - 1);
593 // Halide::argmax returns tuple (r.x, r.y, max).
594 Halide::Tuple res = argmax(inputBuffer(kx, ky, c, n));
596 // Compute offset from argmax in range [0, kernel_size).
// The index is row * inWidth + column, using the same clamping as kx / ky.
597 Halide::Expr max_index;
598 if (pad.width || pad.height)
600 max_index = clamp(y * stride.height + res[1] - pad.height,
601 0, inHeight - 1) * inWidth +
602 clamp(x * stride.width + res[0] - pad.width,
607 max_index = min(y * stride.height + res[1], inHeight - 1) * inWidth +
608 min(x * stride.width + res[0], inWidth - 1);
610 top(x, y, c, n) = { res[2], Halide::cast<float>(max_index) };
611 return Ptr<BackendNode>(new HalideBackendNode(top));
612 #endif // HAVE_HALIDE
// Reached when the Halide section above is compiled out: empty node.
613 return Ptr<BackendNode>();
// Builds the Halide function for average pooling. Only configurations in
// which the kernel steps over the input without a remainder are supported.
616 virtual Ptr<BackendNode> initAvePoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
619 Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
621 const int inW = inputBuffer.width(), inH = inputBuffer.height();
// A non-zero remainder means the last window would be partial; that case is
// rejected with StsNotImplemented.
622 if ((inW - kernel.width) % stride.width || (inH - kernel.height) % stride.height)
624 CV_Error(cv::Error::StsNotImplemented,
625 "Halide backend for average pooling with partial "
626 "kernels is not implemented");
// Every window is full, so one constant 1 / kernel-area factor is used.
629 const float norm = 1.0f / (kernel.width * kernel.height);
631 Halide::Var x("x"), y("y"), c("c"), n("n");
// The function is named after the layer unless the layer name is empty.
632 Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
633 Halide::RDom r(0, kernel.width, 0, kernel.height);
// Sum over the kernel window, then scale by norm. No padding offset is
// applied to the source coordinates here.
634 top(x, y, c, n) = sum(
635 inputBuffer(x * stride.width + r.x,
636 y * stride.height + r.y, c, n)) * norm;
637 return Ptr<BackendNode>(new HalideBackendNode(top));
638 #endif // HAVE_HALIDE
// Reached when the Halide section above is compiled out: empty node.
639 return Ptr<BackendNode>();
// Applies a Halide schedule to the node's last function. Non-CPU targets are
// passed to the base-class scheduler; for the CPU the split / fuse pattern
// depends on the output size.
642 virtual void applyHalideScheduler(Ptr<BackendNode>& node,
643 const std::vector<Mat*> &inputs,
644 const std::vector<Mat> &outputs,
648 if (targetId != DNN_TARGET_CPU)
650 Layer::applyHalideScheduler(node, inputs, outputs, targetId);
653 Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
654 xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
655 Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
657 int outW, outH, outC, outN;
658 getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
// Small outputs (fewer than 8 columns or rows): x and y are not split.
// Either the channels are split by 8 and everything else is fused into one
// `tile` variable, or y, c and n are fused directly.
660 if (outW < 8 || outH < 8)
663 top.split(c, co, ci, 8)
664 .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
669 top.fuse(y, c, tile).fuse(n, tile, tile)
// Larger outputs: x and y are split into blocks of 8 and the outer block
// variables are fused into `tile`. The first form also splits channels by 8.
678 top.split(x, xo, xi, 8).split(y, yo, yi, 8).split(c, co, ci, 8)
679 .fuse(xo, yo, tile).fuse(co, tile, tile).fuse(n, tile, tile)
683 top.split(x, xo, xi, 8).split(y, yo, yi, 8)
684 .fuse(xo, yo, tile).fuse(c, tile, tile).fuse(n, tile, tile)
688 #endif // HAVE_HALIDE
// Computes the output shapes from the first input shape. MAX pooling yields
// two outputs of the same shape, every other type yields one.
691 bool getMemoryShapes(const std::vector<MatShape> &inputs,
692 const int requiredOutputs,
693 std::vector<MatShape> &outputs,
694 std::vector<MatShape> &internals) const
696 CV_Assert(inputs.size() != 0);
// `in` is (width, height), taken from dims 3 and 2 of the first input.
697 Size in(inputs[0][3], inputs[0][2]), out;
// ROI pooling: the output size is the configured pooled size.
704 else if (type == ROI)
706 out.height = pooledSize.height;
707 out.width = pooledSize.width;
// No padding mode given: out = 1 + (in + 2 * pad - kernel) / stride, rounded
// up or down depending on ceilMode.
709 else if (padMode.empty())
711 float height = (float)(in.height + 2 * pad.height - kernel.height) / stride.height;
712 float width = (float)(in.width + 2 * pad.width - kernel.width) / stride.width;
713 out.height = 1 + (ceilMode ? ceil(height) : floor(height));
714 out.width = 1 + (ceilMode ? ceil(width) : floor(width));
716 if (pad.height || pad.width)
718 // If we have padding, ensure that the last pooling starts strictly
719 // inside the image (instead of at the padding); otherwise clip the last.
720 if ((out.height - 1) * stride.height >= in.height + pad.height)
722 if ((out.width - 1) * stride.width >= in.width + pad.width)
// After the adjustment above, the last window must start before the padded
// far edge of the image.
724 CV_Assert((out.height - 1) * stride.height < in.height + pad.height);
725 CV_Assert((out.width - 1) * stride.width < in.width + pad.width);
// A padding mode is set: the output size comes from getConvPoolOutParams.
730 getConvPoolOutParams(in, kernel, stride, padMode, Size(1, 1), out);
// Batch and channel counts are copied from the first input.
733 int dims[] = {inputs[0][0], inputs[0][1], out.height, out.width};
// With a second input present, its first dimension replaces the batch size.
736 CV_Assert(inputs.size() == 2);
737 dims[0] = inputs[1][0]; // Number of proposals;
739 outputs.assign(type == MAX ? 2 : 1, shape(dims));
// Estimates the operation count of the layer from the output shapes and the
// kernel area.
743 virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
744 const std::vector<MatShape> &outputs) const
746 (void)inputs; // suppress unused variable warning
// NOTE(review): `i` is a signed int compared against the unsigned
// outputs.size().
749 for(int i = 0; i < outputs.size(); i++)
// Two cost models: kernel.area() operations per output element, or
// kernel.area() + 1.
// NOTE(review): the extra operation is presumably the scaling step of
// average pooling - confirm which pooling type selects which model.
754 flops += total(outputs[i])*kernel.area();
758 flops += total(outputs[i])*(kernel.area() + 1);
// Factory: wraps a new PoolingLayerImpl built from `params` in a
// Ptr<PoolingLayer>.
773 Ptr<PoolingLayer> PoolingLayer::create(const LayerParams& params)
775 return Ptr<PoolingLayer>(new PoolingLayerImpl(params));