1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Copyright (C) 2017, Intel Corporation, all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
43 #include "../precomp.hpp"
44 #include "layers_common.hpp"
45 #include "opencv2/core/hal/intrin.hpp"
46 #include "op_halide.hpp"
47 #include "opencl_kernels_dnn.hpp"
53 using namespace cv::dnn::ocl4dnn;
61 class PoolingLayerImpl : public PoolingLayer
// Constructor: configure the pooling layer from Caffe-style LayerParams.
// Two mutually exclusive parameter styles are recognized:
//   - "pool" (+ kernel/stride/pad params)  -> MAX / AVE / STOCHASTIC pooling
//   - "pooled_w"/"pooled_h"/"spatial_scale" -> ROI pooling
// NOTE(review): this listing has gaps (intermediate lines/braces are missing);
// e.g. the `if (pool == "max")` condition that should precede line 74 is not
// visible here — verify against the full file.
64 PoolingLayerImpl(const LayerParams& params)
// Defaults: MAX pooling over explicit (non-global) windows.
66 type = PoolingLayer::MAX;
68 globalPooling = false;
70 if (params.has("pool"))
// Pooling method name is matched case-insensitively.
72 String pool = params.get<String>("pool").toLowerCase();
74 type = PoolingLayer::MAX;
75 else if (pool == "ave")
76 type = PoolingLayer::AVE;
77 else if (pool == "stochastic")
78 type = PoolingLayer::STOCHASTIC;
// Any other value is a hard error — bad model definition.
80 CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
// Fills kernel/pad/stride/padMode (and globalPooling) from the params.
81 getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
82 pad.height, pad.width, stride.height, stride.width, padMode);
// ROI pooling branch: output size is fixed by pooled_w/pooled_h.
84 else if (params.has("pooled_w") || params.has("pooled_h") || params.has("spatial_scale"))
86 type = PoolingLayer::ROI;
88 setParamsFrom(params);
// ceil_mode selects ceil vs floor rounding of the output size (see getMemoryShapes).
89 ceilMode = params.get<bool>("ceil_mode", true);
90 pooledSize.width = params.get<uint32_t>("pooled_w", 1);
91 pooledSize.height = params.get<uint32_t>("pooled_h", 1);
// Scale factor mapping ROI coordinates from input-image space to feature-map space.
92 spatialScale = params.get<float>("spatial_scale", 1);
96 Ptr<OCL4DNNPool<float> > poolOp;
// Called once shapes are known: derive the effective paddings from the
// actual input/output spatial sizes (NCHW: size[2]=H, size[3]=W).
99 void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
101 CV_Assert(!inputs.empty());
103 cv::Size inp(inputs[0]->size[3], inputs[0]->size[2]),
104 out(outputs[0].size[3], outputs[0].size[2]);
// Recomputes `pad` for SAME/VALID-style padMode; dilation is fixed at 1x1 for pooling.
111 getConvPoolPaddings(inp, out, kernel, stride, padMode, Size(1, 1), pad);
// Reports which computation backends can run this layer.
// Halide is only supported for MAX pooling, or AVE pooling without padding
// (&& binds tighter than ||, so DNN_BACKEND_DEFAULT is always accepted).
114 virtual bool supportBackend(int backendId)
116 return backendId == DNN_BACKEND_DEFAULT ||
117 backendId == DNN_BACKEND_HALIDE && haveHalide() &&
118 (type == PoolingLayer::MAX ||
119 type == PoolingLayer::AVE && !pad.width && !pad.height);
// OpenCL forward path via OCL4DNN. Lazily builds the pool op from the current
// shapes/kernel/stride, then runs it per input. For MAX pooling each input
// produces an output pair (values at 2*ii, argmax mask at 2*ii+1).
// NOTE(review): listing has gaps — the `if (poolOp.empty())` guard and the
// failure-return around line 156 are not visible here; verify in the full file.
123 bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
125 std::vector<UMat> inputs;
126 std::vector<UMat> outputs;
128 inps.getUMatVector(inputs);
129 outs.getUMatVector(outputs);
133 OCL4DNNPoolConfig config;
135 config.in_shape = shape(inputs[0]);
136 config.out_shape = shape(outputs[0]);
137 config.kernel = kernel;
139 config.stride = stride;
140 config.channels = inputs[0].size[1];
// Map this layer's pooling type onto the LIBDNN method enum.
141 config.pool_method = type == MAX ? LIBDNN_POOLING_METHOD_MAX :
142 (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
143 LIBDNN_POOLING_METHOD_STO);
144 poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
147 for (size_t ii = 0; ii < inputs.size(); ii++)
149 UMat& inpMat = inputs[ii];
// MAX pooling emits two outputs per input (value + mask), others one.
150 int out_index = (type == MAX) ? 2 : 1;
151 UMat& outMat = outputs[out_index * ii];
152 UMat maskMat = (type == MAX) ? outputs[2 * ii + 1] : UMat();
// OCL4DNN kernels assume zero-offset (non-ROI) UMats.
154 CV_Assert(inpMat.offset == 0 && outMat.offset == 0);
156 if (!poolOp->Forward(inpMat, outMat, maskMat))
// Generic forward entry point: try the OpenCL path on Intel OpenCL targets
// (CV_OCL_RUN returns on success), otherwise fall back to the Mat-based
// forward() via Layer::forward_fallback.
164 void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
167 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
169 CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
170 OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
171 forward_ocl(inputs_arr, outputs_arr, internals_arr))
173 Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
// CPU forward: dispatch on pooling type with per-type input/output arity checks.
// NOTE(review): listing has gaps — the switch/if structure selecting between
// the branches (around lines 180-195) is not visible here.
176 void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
179 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
// MAX: one input, two outputs (pooled values + argmax mask).
184 CV_Assert(inputs.size() == 1, outputs.size() == 2);
185 maxPooling(*inputs[0], outputs[0], outputs[1]);
// AVE: one input, one output.
188 CV_Assert(inputs.size() == 1, outputs.size() == 1);
189 avePooling(*inputs[0], outputs[0]);
// ROI: feature map + ROI proposals in, one output.
192 CV_Assert(inputs.size() == 2, outputs.size() == 1);
193 roiPooling(*inputs[0], *inputs[1], outputs[0]);
// STOCHASTIC (and anything else) is unimplemented on CPU.
196 CV_Error(Error::StsNotImplemented, "Not implemented");
// Build the Halide graph for this layer; only MAX and AVE pooling are
// supported (consistent with supportBackend). Returns an empty node otherwise.
201 virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
203 if (type == PoolingLayer::MAX)
204 return initMaxPoolingHalide(inputs);
205 else if (type == PoolingLayer::AVE)
206 return initAvePoolingHalide(inputs);
208 return Ptr<BackendNode>();
// Parallel body that performs MAX / AVE / ROI pooling over stripes of the
// output tensor (see operator() below). Non-owning pointers: src/rois/dst/mask
// are borrowed from the caller for the duration of parallel_for_.
// NOTE(review): listing has gaps — additional members (dst, mask, nstripes,
// computeMaxIdx, poolingType, spatialScale) referenced by the ctor are not
// visible in this excerpt.
211 class PoolingInvoker : public ParallelLoopBody
214 const Mat* src, *rois;
216 Size kernel, stride, pad;
// Precomputed flat offsets of each kernel element (filled in run()).
219 std::vector<int> ofsbuf;
223 PoolingInvoker() : src(0), rois(0), dst(0), mask(0), nstripes(0),
224 computeMaxIdx(0), poolingType(PoolingLayer::MAX), spatialScale(0) {}
// Validate inputs, populate an invoker instance, precompute kernel offsets,
// and launch the parallel pooling loop over `nstripes` stripes.
226 static void run(const Mat& src, const Mat& rois, Mat& dst, Mat& mask, Size kernel,
227 Size stride, Size pad, int poolingType, float spatialScale,
228 bool computeMaxIdx, int nstripes)
// Both tensors must be continuous CV_32F NCHW; for ROI pooling the batch
// dimension of dst equals the number of ROIs, otherwise it matches src.
// The optional mask (MAX pooling argmax) must match dst in type and size.
230 CV_Assert(src.isContinuous() && dst.isContinuous() &&
231 src.type() == CV_32F && src.type() == dst.type() &&
232 src.dims == 4 && dst.dims == 4 &&
233 (poolingType == ROI && dst.size[0] == rois.size[0] ||
234 src.size[0] == dst.size[0]) && src.size[1] == dst.size[1] &&
235 (mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));
// NOTE(review): listing has gaps — assignments of p.src/p.rois/p.dst/p.mask/
// p.kernel/p.stride/p.pad (lines ~237-245) are not visible here.
246 p.nstripes = nstripes;
247 p.computeMaxIdx = computeMaxIdx;
248 p.poolingType = poolingType;
249 p.spatialScale = spatialScale;
// Precompute flat source offsets for each (i,j) kernel position so the SIMD
// MAX-pooling fast path can index the window without recomputing y*width+x.
253 p.ofsbuf.resize(kernel.width*kernel.height);
254 for( int i = 0; i < kernel.height; i++ )
255 for( int j = 0; j < kernel.width; j++ )
256 p.ofsbuf[i*kernel.width + j] = src.size[3]*i + j;
259 parallel_for_(Range(0, nstripes), p, nstripes);
// Process one stripe of output elements [stripeStart, stripeEnd). The flat
// output offset is decoded into (n, c, y0, x0); for each output row segment
// the pooling window is computed and either a SIMD (v_float32x4, 8 outputs
// at a time) or a scalar fallback path is taken.
// NOTE(review): this listing has many gaps (missing declarations such as
// `ofs`, `x1`, `xstart`, `ystart`, `max_index`, `sum_val`, and several
// braces/else lines). Comments below describe only what the visible lines
// establish; verify details against the full file before editing.
262 void operator()(const Range& r) const
264 int channels = dst->size[1], width = dst->size[3], height = dst->size[2];
265 int inp_width = src->size[3], inp_height = src->size[2];
// Split the flat output range evenly into stripes; this stripe covers
// [stripeStart, stripeEnd).
266 size_t total = dst->total();
267 size_t stripeSize = (total + nstripes - 1)/nstripes;
268 size_t stripeStart = r.start*stripeSize;
269 size_t stripeEnd = std::min(r.end*stripeSize, total);
270 int kernel_w = kernel.width, kernel_h = kernel.height;
271 int pad_w = pad.width, pad_h = pad.height;
272 int stride_w = stride.width, stride_h = stride.height;
273 bool compMaxIdx = computeMaxIdx;
// SIMD setup: lane indices for 4 consecutive output columns (stride apart).
276 const int* ofsptr = &ofsbuf[0];
277 v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
278 v_float32x4 ones = v_setall_f32(1.f);
279 v_float32x4 idx_delta = v_setall_f32((float)(inp_width - kernel_w));
282 for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
// Decode flat offset into (n, c, y0, x0) — each `%` is followed by an
// unseen `/` line in the full file.
285 int x0 = (int)(ofs % width);
287 int y0 = (int)(ofs % height);
289 int c = (int)(ofs % channels);
290 int n = (int)(ofs / channels);
293 const float *srcData;
296 if (poolingType == ROI)
// ROI pooling: roisData = [batch_id, x1, y1, x2, y2] in image coords,
// scaled into feature-map coords by spatialScale.
298 const float *roisData = rois->ptr<float>(n);
299 int ystartROI = round(roisData[2] * spatialScale);
300 int yendROI = round(roisData[4] * spatialScale);
301 int roiHeight = std::max(yendROI - ystartROI + 1, 1);
302 roiRatio = (float)roiHeight / height;
// Vertical window for this output row, proportional to the ROI height.
304 ystart = ystartROI + y0 * roiRatio;
305 yend = ystartROI + std::ceil((y0 + 1) * roiRatio);
307 xstartROI = round(roisData[1] * spatialScale);
308 int xendROI = round(roisData[3] * spatialScale);
309 int roiWidth = std::max(xendROI - xstartROI + 1, 1);
310 roiRatio = (float)roiWidth / width;
// ROI batch index must reference an existing image in src.
312 CV_Assert(roisData[0] < src->size[0]);
313 srcData = src->ptr<float>(roisData[0], c);
// Non-ROI path: standard strided window with padding.
317 ystart = y0 * stride_h - pad_h;
318 yend = min(ystart + kernel_h, inp_height + pad_h);
319 srcData = src->ptr<float>(n, c);
// ydelta keeps the (possibly padded) window height for AVE normalization
// before the window is clipped to the image.
321 int ydelta = yend - ystart;
322 ystart = max(ystart, 0);
323 yend = min(yend, inp_height);
324 float *dstData = dst->ptr<float>(n, c, y0);
325 float *dstMaskData = mask->data ? mask->ptr<float>(n, c, y0) : 0;
// Process at most up to the end of this output row or the stripe.
327 int delta = std::min((int)(stripeEnd - ofs0), width - x0);
331 if( poolingType == MAX || poolingType == ROI)
332 for( ; x0 < x1; x0++ )
335 if (poolingType == ROI)
337 xstart = xstartROI + x0 * roiRatio;
338 xend = xstartROI + std::ceil((x0 + 1) * roiRatio);
342 xstart = x0 * stride_w - pad_w;
343 xend = xstart + kernel_w;
345 xstart = max(xstart, 0);
346 xend = min(xend, inp_width);
// Empty window (fully in padding): emit sentinel mask value.
347 if (xstart >= xend || ystart >= yend)
350 if (compMaxIdx && dstMaskData)
351 dstMaskData[x0] = -1;
// SIMD fast path: 8 consecutive outputs whose windows all lie strictly
// inside the image (no clipping needed).
355 if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
// Variant with argmax tracking: idx0/idx1 carry the running flat index
// of each lane's current element; v_select keeps the index of the max.
359 v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
360 v_float32x4 max_val1 = max_val0;
361 v_float32x4 max_idx0 = v_setall_f32(-1.f);
362 v_float32x4 max_idx1 = max_idx0;
363 int index0 = ystart * inp_width + xstart;
364 v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
365 v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));
367 for (int y = ystart; y < yend; ++y)
369 for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
371 const int index = y * inp_width + x;
// v0/v1: same window element for outputs x0..x0+3 and x0+4..x0+7.
372 v_float32x4 v0(srcData[index], srcData[index + stride_w],
373 srcData[index + stride_w*2], srcData[index + stride_w*3]);
374 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
375 srcData[index + stride_w*6], srcData[index + stride_w*7]);
376 max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
377 max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
378 max_val0 = v_max(max_val0, v0);
379 max_val1 = v_max(max_val1, v1);
384 v_store(dstData + x0, max_val0);
385 v_store(dstData + x0 + 4, max_val1);
388 v_store(dstMaskData + x0, max_idx0);
389 v_store(dstMaskData + x0 + 4, max_idx1);
// Variant without argmax: three sub-cases by window/stride shape.
395 v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
396 v_float32x4 max_val1 = max_val0;
// Full-height window + (presumably) stride_w == 1: use precomputed
// ofsbuf offsets and unaligned 4-wide loads.
398 if( yend - ystart == kernel_h )
400 const float* srcData1 = srcData + ystart*inp_width + xstart;
402 for (int k = 0; k < kernel_w*kernel_h; k++)
404 int index = ofsptr[k];
405 v_float32x4 v0 = v_load(srcData1 + index);
406 v_float32x4 v1 = v_load(srcData1 + index + 4);
407 max_val0 = v_max(max_val0, v0);
408 max_val1 = v_max(max_val1, v1);
// stride_w == 2: load 4-wide pairs and take even lanes via SSE shuffle.
411 else if( stride_w == 2 )
412 for (int k = 0; k < kernel_w*kernel_h; k++)
414 int index = ofsptr[k];
415 v_float32x4 v00 = v_load(srcData1 + index), v01 = v_load(srcData1 + index + 4);
416 v_float32x4 v0(_mm_shuffle_ps(v00.val, v01.val, _MM_SHUFFLE(2, 0, 2, 0)));
417 v_float32x4 v10 = v_load(srcData1 + index + 8), v11 = v_load(srcData1 + index + 12);
418 v_float32x4 v1(_mm_shuffle_ps(v10.val, v11.val, _MM_SHUFFLE(2, 0, 2, 0)));
419 max_val0 = v_max(max_val0, v0);
420 max_val1 = v_max(max_val1, v1);
// General stride: gather lanes element-by-element.
424 for (int k = 0; k < kernel_w*kernel_h; k++)
426 int index = ofsptr[k];
427 v_float32x4 v0(srcData1[index], srcData1[index + stride_w],
428 srcData1[index + stride_w*2], srcData1[index + stride_w*3]);
429 v_float32x4 v1(srcData1[index + stride_w*4], srcData1[index + stride_w*5],
430 srcData1[index + stride_w*6], srcData1[index + stride_w*7]);
431 max_val0 = v_max(max_val0, v0);
432 max_val1 = v_max(max_val1, v1);
// Clipped-height window: iterate the valid rows explicitly.
437 for (int y = ystart; y < yend; ++y)
439 for (int x = xstart; x < xend; ++x)
441 const int index = y * inp_width + x;
442 v_float32x4 v0(srcData[index], srcData[index + stride_w],
443 srcData[index + stride_w*2], srcData[index + stride_w*3]);
444 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
445 srcData[index + stride_w*6], srcData[index + stride_w*7]);
446 max_val0 = v_max(max_val0, v0);
447 max_val1 = v_max(max_val1, v1);
451 v_store(dstData + x0, max_val0);
452 v_store(dstData + x0 + 4, max_val1);
// Scalar fallback for MAX/ROI: plain double loop over the window,
// optionally tracking the flat index of the maximum.
459 float max_val = -FLT_MAX;
463 for (int y = ystart; y < yend; ++y)
464 for (int x = xstart; x < xend; ++x)
466 const int index = y * inp_width + x;
467 float val = srcData[index];
475 dstData[x0] = max_val;
477 dstMaskData[x0] = max_index;
481 for (int y = ystart; y < yend; ++y)
482 for (int x = xstart; x < xend; ++x)
484 const int index = y * inp_width + x;
485 float val = srcData[index];
486 max_val = std::max(max_val, val);
489 dstData[x0] = max_val;
// AVE pooling branch: normalization uses the *padded* window area
// (xdelta*ydelta), computed before clipping to the image.
495 for( ; x0 < x1; x0++ )
497 int xstart = x0 * stride_w - pad_w;
498 int xend = min(xstart + kernel_w, inp_width + pad_w);
499 int xdelta = xend - xstart;
500 xstart = max(xstart, 0);
501 xend = min(xend, inp_width);
502 float inv_kernel_area = 1.f/(ydelta*xdelta);
// SIMD fast path: 8 outputs whose windows are fully inside the image.
505 if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
507 v_float32x4 sum_val0 = v_setzero_f32(), sum_val1 = v_setzero_f32();
508 v_float32x4 ikarea = v_setall_f32(inv_kernel_area);
510 for (int y = ystart; y < yend; ++y)
512 for (int x = xstart; x < xend; ++x)
514 const int index = y * inp_width + x;
515 v_float32x4 v0(srcData[index], srcData[index + stride_w],
516 srcData[index + stride_w*2], srcData[index + stride_w*3]);
517 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
518 srcData[index + stride_w*6], srcData[index + stride_w*7]);
523 v_store(dstData + x0, sum_val0*ikarea);
524 v_store(dstData + x0 + 4, sum_val1*ikarea);
// Scalar fallback: accumulate and scale by the inverse window area.
531 for (int y = ystart; y < yend; ++y)
532 for (int x = xstart; x < xend; ++x)
534 const int index = y * inp_width + x;
535 float val = srcData[index];
539 dstData[x0] = sum_val*inv_kernel_area;
// MAX pooling over `src` into `dst`, with argmax indices written to `mask`.
// NOTE(review): the `rois` Mat passed here is not declared in this excerpt —
// presumably an empty placeholder (ROI is unused for MAX); verify.
547 void maxPooling(Mat &src, Mat &dst, Mat &mask)
549 const int nstripes = getNumThreads();
551 PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
// Average pooling over `src` into `dst`.
// NOTE(review): `rois` and `mask` are not declared in this excerpt —
// presumably empty placeholder Mats; verify against the full file.
554 void avePooling(Mat &src, Mat &dst)
556 const int nstripes = getNumThreads();
558 PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
// ROI max pooling: pool each region proposal in `rois` from `src` into `dst`.
// NOTE(review): `mask` is not declared in this excerpt — presumably an empty
// placeholder Mat; verify against the full file.
561 void roiPooling(const Mat &src, const Mat &rois, Mat &dst)
563 const int nstripes = getNumThreads();
565 PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
// Build a Halide pipeline for MAX pooling that also produces the argmax mask.
// Output is a Tuple: (max value, flat index of the max within the input plane).
// Compiled only under HAVE_HALIDE; otherwise returns an empty node.
568 virtual Ptr<BackendNode> initMaxPoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
571 Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
572 const int inWidth = inputBuffer.width();
573 const int inHeight = inputBuffer.height();
575 Halide::Var x("x"), y("y"), c("c"), n("n");
576 Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
// Reduction domain spans one kernel window.
577 Halide::RDom r(0, kernel.width, 0, kernel.height);
// With padding, clamp window coordinates into the image (border replicate);
// without padding, only the upper bound needs clamping.
579 if (pad.width || pad.height)
581 kx = clamp(x * stride.width + r.x - pad.width, 0, inWidth - 1);
582 ky = clamp(y * stride.height + r.y - pad.height, 0, inHeight - 1);
586 kx = min(x * stride.width + r.x, inWidth - 1);
587 ky = min(y * stride.height + r.y, inHeight - 1);
590 // Halide::argmax returns tuple (r.x, r.y, max).
591 Halide::Tuple res = argmax(inputBuffer(kx, ky, c, n));
593 // Compute offset from argmax in range [0, kernel_size).
594 Halide::Expr max_index;
// Convert the in-kernel argmax (res[0], res[1]) back to a flat input index,
// applying the same clamping as the forward lookup.
595 if (pad.width || pad.height)
597 max_index = clamp(y * stride.height + res[1] - pad.height,
598 0, inHeight - 1) * inWidth +
599 clamp(x * stride.width + res[0] - pad.width,
604 max_index = min(y * stride.height + res[1], inHeight - 1) * inWidth +
605 min(x * stride.width + res[0], inWidth - 1);
607 top(x, y, c, n) = { res[2], Halide::cast<float>(max_index) };
608 return Ptr<BackendNode>(new HalideBackendNode(top));
609 #endif // HAVE_HALIDE
610 return Ptr<BackendNode>();
// Build a Halide pipeline for average pooling. Only exact tilings are
// supported: the kernel must step evenly across the input (no partial
// windows), otherwise StsNotImplemented is raised.
613 virtual Ptr<BackendNode> initAvePoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
616 Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
618 const int inW = inputBuffer.width(), inH = inputBuffer.height();
619 if ((inW - kernel.width) % stride.width || (inH - kernel.height) % stride.height)
621 CV_Error(cv::Error::StsNotImplemented,
622 "Halide backend for average pooling with partial "
623 "kernels is not implemented");
// Fixed normalization by the full kernel area (valid because no partial windows).
626 const float norm = 1.0f / (kernel.width * kernel.height);
628 Halide::Var x("x"), y("y"), c("c"), n("n");
629 Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
630 Halide::RDom r(0, kernel.width, 0, kernel.height);
631 top(x, y, c, n) = sum(
632 inputBuffer(x * stride.width + r.x,
633 y * stride.height + r.y, c, n)) * norm;
634 return Ptr<BackendNode>(new HalideBackendNode(top));
635 #endif // HAVE_HALIDE
636 return Ptr<BackendNode>();
// CPU-specific Halide schedule: tile/split/fuse the output loops depending on
// the output spatial size (small outputs get channel-split fusing, larger
// ones get 8x8 spatial tiles). Non-CPU targets defer to the base scheduler.
// NOTE(review): listing has gaps — the branching between the schedule
// variants (lines ~656-684) and trailing .parallel/.vectorize calls are not
// fully visible here.
639 virtual void applyHalideScheduler(Ptr<BackendNode>& node,
640 const std::vector<Mat*> &inputs,
641 const std::vector<Mat> &outputs,
645 if (targetId != DNN_TARGET_CPU)
647 Layer::applyHalideScheduler(node, inputs, outputs, targetId);
650 Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
651 xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
652 Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
654 int outW, outH, outC, outN;
655 getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
// Small spatial output: fuse everything into one parallel loop over channels.
657 if (outW < 8 || outH < 8)
660 top.split(c, co, ci, 8)
661 .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
666 top.fuse(y, c, tile).fuse(n, tile, tile)
// Larger outputs: 8x8 spatial tiles, optionally with an 8-way channel split.
675 top.split(x, xo, xi, 8).split(y, yo, yi, 8).split(c, co, ci, 8)
676 .fuse(xo, yo, tile).fuse(co, tile, tile).fuse(n, tile, tile)
680 top.split(x, xo, xi, 8).split(y, yo, yi, 8)
681 .fuse(xo, yo, tile).fuse(c, tile, tile).fuse(n, tile, tile)
685 #endif // HAVE_HALIDE
// Compute output shapes from input shapes:
//  - global pooling (branch not visible here) / ROI: fixed output size
//  - explicit padding: Caffe-style formula with ceil/floor rounding and a
//    clip so the last window starts inside the (padded) image
//  - padMode set: delegate to getConvPoolOutParams.
// MAX pooling declares two outputs (values + mask); others one.
688 bool getMemoryShapes(const std::vector<MatShape> &inputs,
689 const int requiredOutputs,
690 std::vector<MatShape> &outputs,
691 std::vector<MatShape> &internals) const
693 CV_Assert(inputs.size() != 0);
694 Size in(inputs[0][3], inputs[0][2]), out;
// ROI pooling: output spatial size is the configured pooledSize.
701 else if (type == PoolingLayer::ROI)
703 out.height = pooledSize.height;
704 out.width = pooledSize.width;
706 else if (padMode.empty())
708 float height = (float)(in.height + 2 * pad.height - kernel.height) / stride.height;
709 float width = (float)(in.width + 2 * pad.width - kernel.width) / stride.width;
710 out.height = 1 + (ceilMode ? ceil(height) : floor(height));
711 out.width = 1 + (ceilMode ? ceil(width) : floor(width));
713 if (pad.height || pad.width)
715 // If we have padding, ensure that the last pooling starts strictly
716 // inside the image (instead of at the padding); otherwise clip the last.
717 if ((out.height - 1) * stride.height >= in.height + pad.height)
719 if ((out.width - 1) * stride.width >= in.width + pad.width)
721 CV_Assert((out.height - 1) * stride.height < in.height + pad.height);
722 CV_Assert((out.width - 1) * stride.width < in.width + pad.width);
727 getConvPoolOutParams(in, kernel, stride, padMode, Size(1, 1), out);
// Output is NCHW; for ROI pooling the batch dim is the number of proposals.
730 int dims[] = {inputs[0][0], inputs[0][1], out.height, out.width};
733 CV_Assert(inputs.size() == 2);
734 dims[0] = inputs[1][0]; // Number of proposals;
736 outputs.assign(type == MAX ? 2 : 1, shape(dims));
// Estimate FLOPs: one comparison/add per kernel element per output value;
// the second branch (presumably AVE — selector lines are missing from this
// listing) adds one extra op per output for the final division.
740 virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
741 const std::vector<MatShape> &outputs) const
743 (void)inputs; // suppress unused variable warning
746 for(int i = 0; i < outputs.size(); i++)
751 flops += total(outputs[i])*kernel.area();
755 flops += total(outputs[i])*(kernel.area() + 1);
// Public factory: construct a PoolingLayerImpl from the given parameters.
762 Ptr<PoolingLayer> PoolingLayer::create(const LayerParams& params)
764 return Ptr<PoolingLayer>(new PoolingLayerImpl(params));