1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Copyright (C) 2017, Intel Corporation, all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
43 #include "../precomp.hpp"
44 #include "layers_common.hpp"
45 #include "opencv2/core/hal/intrin.hpp"
46 #include "op_halide.hpp"
57 //TODO: add ceil_mode param
// Implementation of the DNN pooling layer (max / average / stochastic).
// NOTE(review): this excerpt is missing intermediate lines (braces and the
// initial `pool == "max"` comparison around original line 69); comments below
// describe only what is visible.
58 class PoolingLayerImpl : public PoolingLayer
// Constructor: parses the pooling type, kernel/stride/pad geometry and the
// padding mode from the layer parameters.
61 PoolingLayerImpl(const LayerParams& params)
// Default pooling type when no "pool" parameter is given.
63 type = PoolingLayer::MAX;
66 if (params.has("pool"))
// The "pool" string is case-insensitive: "max" / "ave" / "stochastic".
68 String pool = params.get<String>("pool").toLowerCase();
70 type = PoolingLayer::MAX;
71 else if (pool == "ave")
72 type = PoolingLayer::AVE;
73 else if (pool == "stochastic")
74 type = PoolingLayer::STOCHASTIC;
// Any other string is rejected outright.
76 CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
// Fills kernel, pad, stride, globalPooling and padMode members from params.
79 getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
80 pad.height, pad.width, stride.height, stride.width, padMode);
81 setParamsFrom(params);
// Called once shapes are known: recomputes the actual paddings from the
// input/output spatial sizes (needed for "SAME"-style pad modes).
// Input blobs are NCHW, hence size[2] = height, size[3] = width.
84 void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
// Pooling expects exactly one input blob.
86 CV_Assert(inputs.size() == 1);
88 cv::Size inp(inputs[0]->size[3], inputs[0]->size[2]),
89 out(outputs[0].size[3], outputs[0].size[2]);
// Adjusts `pad` in place according to padMode and the derived output size.
96 getConvPoolPaddings(inp, out, kernel, stride, padMode, pad);
// Reports which computation backends can run this layer.
// Halide is supported only for MAX pooling, or for AVE pooling without
// padding. Note: the mixed &&/|| below relies on && binding tighter than ||,
// i.e. it parses as DEFAULT || (HALIDE && haveHalide() && (MAX || (AVE && no pad))).
99 virtual bool supportBackend(int backendId)
101 return backendId == DNN_BACKEND_DEFAULT ||
102 backendId == DNN_BACKEND_HALIDE && haveHalide() &&
103 (type == PoolingLayer::MAX ||
104 type == PoolingLayer::AVE && !pad.width && !pad.height);
// Default-backend forward pass: dispatches each input blob to the
// type-specific pooling routine.
// For MAX pooling each input produces TWO outputs (values at 2*ii and the
// max-element index map at 2*ii+1); AVE pooling produces one output per input.
// NOTE(review): the switch/case framing around these calls is not visible in
// this excerpt.
107 void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
110 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
112 for (size_t ii = 0; ii < inputs.size(); ii++)
117 maxPooling(*inputs[ii], outputs[2 * ii], outputs[2 * ii + 1]);
120 avePooling(*inputs[ii], outputs[ii]);
// Remaining pooling types (e.g. STOCHASTIC) are not implemented.
123 CV_Error(Error::StsNotImplemented, "Not implemented");
// Builds the Halide pipeline for this layer, delegating to the
// type-specific builder. Returns an empty node for unsupported types.
129 virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
131 if (type == PoolingLayer::MAX)
132 return initMaxPoolingHalide(inputs);
133 else if (type == PoolingLayer::AVE)
134 return initAvePoolingHalide(inputs);
136 return Ptr<BackendNode>();
// Parallel worker that performs MAX / AVE pooling over stripes of the output
// blob. The SIMD paths below use OpenCV universal intrinsics (v_float32x4)
// to compute 8 output columns at a time.
// NOTE(review): many lines (member declarations, braces, scalar accumulation
// statements) are missing from this excerpt; comments describe visible code only.
139 class PoolingInvoker : public ParallelLoopBody
// Pooling geometry copied from the layer.
144 Size kernel, stride, pad;
// Precomputed source offsets of every kernel element (row-major), valid only
// when the kernel fits fully inside the image (no border clipping).
147 std::vector<int> ofsbuf;
// Validates blob layout, fills an invoker instance and runs it in parallel
// over `nstripes` stripes of the flattened output.
152 static void run(const Mat& src, Mat& dst, Mat& mask, Size kernel,
153 Size stride, Size pad, int poolingType,
154 bool computeMaxIdx, int nstripes)
// Blobs must be continuous 4-D CV_32F (NCHW) with matching N and C;
// the optional mask (max-index map) must match dst in type and shape.
156 CV_Assert(src.isContinuous() && dst.isContinuous() &&
157 src.type() == CV_32F && src.type() == dst.type() &&
158 src.dims == 4 && dst.dims == 4 &&
159 src.size[0] == dst.size[0] && src.size[1] == dst.size[1] &&
160 (mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));
170 p.nstripes = nstripes;
171 p.computeMaxIdx = computeMaxIdx;
172 p.poolingType = poolingType;
// Precompute per-kernel-element offsets: element (i,j) lies at
// i * input_row_stride + j from the kernel's top-left source pixel.
176 p.ofsbuf.resize(kernel.width*kernel.height);
177 for( int i = 0; i < kernel.height; i++ )
178 for( int j = 0; j < kernel.width; j++ )
179 p.ofsbuf[i*kernel.width + j] = src.size[3]*i + j;
182 parallel_for_(Range(0, nstripes), p, nstripes);
// Processes output elements [r.start, r.end) stripes of the flattened
// NCHW output tensor.
185 void operator()(const Range& r) const
187 int channels = dst->size[1], width = dst->size[3], height = dst->size[2];
188 int inp_width = src->size[3], inp_height = src->size[2];
// Split the total output element count evenly into stripes.
189 size_t total = dst->total();
190 size_t stripeSize = (total + nstripes - 1)/nstripes;
191 size_t stripeStart = r.start*stripeSize;
192 size_t stripeEnd = std::min(r.end*stripeSize, total);
// Cache geometry in locals for the hot loops below.
193 int kernel_w = kernel.width, kernel_h = kernel.height;
194 int pad_w = pad.width, pad_h = pad.height;
195 int stride_w = stride.width, stride_h = stride.height;
196 bool compMaxIdx = computeMaxIdx;
// SIMD setup: idx00 holds the source x-offsets of 4 consecutive output
// columns; idx_delta advances indices from one kernel row to the next.
199 const int* ofsptr = &ofsbuf[0];
200 v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
201 v_float32x4 ones = v_setall_f32(1.f);
202 v_float32x4 idx_delta = v_setall_f32((float)(inp_width - kernel_w));
// Walk the stripe; decode the flat offset into (n, c, y0, x0).
// NOTE(review): the intermediate divisions of `ofs` are not visible here.
205 for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
208 int x0 = (int)(ofs % width);
210 int y0 = (int)(ofs % height);
212 int c = (int)(ofs % channels);
213 int n = (int)(ofs / channels);
// Vertical kernel window, clipped to the image; ydelta keeps the
// padded height for the AVE normalization below.
214 int ystart = y0 * stride_h - pad_h;
215 int yend = min(ystart + kernel_h, inp_height + pad_h);
216 int ydelta = yend - ystart;
217 ystart = max(ystart, 0);
218 yend = min(yend, inp_height);
219 const float *srcData = src->ptr<float>(n, c);
220 float *dstData = dst->ptr<float>(n, c, y0);
// Mask pointer is null when no max-index output is requested.
221 float *dstMaskData = mask->data ? mask->ptr<float>(n, c, y0) : 0;
// Number of consecutive output columns of this row inside the stripe.
223 int delta = std::min((int)(stripeEnd - ofs0), width - x0);
227 if( poolingType == PoolingLayer::MAX )
228 for( ; x0 < x1; x0++ )
// Horizontal kernel window clipped to the image.
230 int xstart = x0 * stride_w - pad_w;
231 int xend = min(xstart + kernel_w, inp_width);
232 xstart = max(xstart, 0);
// Fast SIMD path: 8 output columns at once, only when all 8
// windows lie fully inside the image (no border clipping).
235 if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
// Branch computing max value AND argmax index (mask requested).
// max_idx tracks the flat source index of the running maximum.
239 v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
240 v_float32x4 max_val1 = max_val0;
241 v_float32x4 max_idx0 = v_setall_f32(-1.f);
242 v_float32x4 max_idx1 = max_idx0;
243 int index0 = ystart * inp_width + xstart;
244 v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
245 v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));
247 for (int y = ystart; y < yend; ++y)
// idx vectors advance by one per source column scanned.
249 for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
251 const int index = y * inp_width + x;
// Gather the same source pixel for 8 adjacent output columns
// (stride_w apart in the source row).
252 v_float32x4 v0(srcData[index], srcData[index + stride_w],
253 srcData[index + stride_w*2], srcData[index + stride_w*3]);
254 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
255 srcData[index + stride_w*6], srcData[index + stride_w*7]);
// Keep index where a new maximum appears, then update the max.
256 max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
257 max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
258 max_val0 = v_max(max_val0, v0);
259 max_val1 = v_max(max_val1, v1);
264 v_store(dstData + x0, max_val0);
265 v_store(dstData + x0 + 4, max_val1);
266 v_store(dstMaskData + x0, max_idx0);
267 v_store(dstMaskData + x0 + 4, max_idx1);
// Branch computing only max values (no argmax/mask).
272 v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
273 v_float32x4 max_val1 = max_val0;
// Full (unclipped) kernel: use precomputed ofsbuf offsets.
275 if( yend - ystart == kernel_h )
277 const float* srcData1 = srcData + ystart*inp_width + xstart;
// stride 1: 8 outputs are 8 contiguous source pixels per offset.
279 for (int k = 0; k < kernel_w*kernel_h; k++)
281 int index = ofsptr[k];
282 v_float32x4 v0 = v_load(srcData1 + index);
283 v_float32x4 v1 = v_load(srcData1 + index + 4);
284 max_val0 = v_max(max_val0, v0);
285 max_val1 = v_max(max_val1, v1);
// stride 2: load 16 contiguous floats and de-interleave the even
// lanes with an SSE shuffle (SSE-specific path).
288 else if( stride_w == 2 )
289 for (int k = 0; k < kernel_w*kernel_h; k++)
291 int index = ofsptr[k];
292 v_float32x4 v00 = v_load(srcData1 + index), v01 = v_load(srcData1 + index + 4);
293 v_float32x4 v0(_mm_shuffle_ps(v00.val, v01.val, _MM_SHUFFLE(2, 0, 2, 0)));
294 v_float32x4 v10 = v_load(srcData1 + index + 8), v11 = v_load(srcData1 + index + 12);
295 v_float32x4 v1(_mm_shuffle_ps(v10.val, v11.val, _MM_SHUFFLE(2, 0, 2, 0)));
296 max_val0 = v_max(max_val0, v0);
297 max_val1 = v_max(max_val1, v1);
// Generic stride: element-wise gather, stride_w apart.
301 for (int k = 0; k < kernel_w*kernel_h; k++)
303 int index = ofsptr[k];
304 v_float32x4 v0(srcData1[index], srcData1[index + stride_w],
305 srcData1[index + stride_w*2], srcData1[index + stride_w*3]);
306 v_float32x4 v1(srcData1[index + stride_w*4], srcData1[index + stride_w*5],
307 srcData1[index + stride_w*6], srcData1[index + stride_w*7]);
308 max_val0 = v_max(max_val0, v0);
309 max_val1 = v_max(max_val1, v1);
// Vertically-clipped kernel: scan the clipped window directly.
314 for (int y = ystart; y < yend; ++y)
316 for (int x = xstart; x < xend; ++x)
318 const int index = y * inp_width + x;
319 v_float32x4 v0(srcData[index], srcData[index + stride_w],
320 srcData[index + stride_w*2], srcData[index + stride_w*3]);
321 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
322 srcData[index + stride_w*6], srcData[index + stride_w*7]);
323 max_val0 = v_max(max_val0, v0);
324 max_val1 = v_max(max_val1, v1);
328 v_store(dstData + x0, max_val0);
329 v_store(dstData + x0 + 4, max_val1);
// Scalar fallback for border columns / short tails.
// NOTE(review): the max_index declaration and comparison body are
// not visible in this excerpt.
336 float max_val = -FLT_MAX;
340 for (int y = ystart; y < yend; ++y)
341 for (int x = xstart; x < xend; ++x)
343 const int index = y * inp_width + x;
344 float val = srcData[index];
352 dstData[x0] = max_val;
353 dstMaskData[x0] = max_index;
// Scalar max-only fallback (no mask requested).
357 for (int y = ystart; y < yend; ++y)
358 for (int x = xstart; x < xend; ++x)
360 const int index = y * inp_width + x;
361 float val = srcData[index];
362 max_val = std::max(max_val, val);
365 dstData[x0] = max_val;
// Average pooling path.
371 for( ; x0 < x1; x0++ )
// xdelta keeps the padded window width: the divisor counts padded
// positions (Caffe-compatible averaging), clipping happens after.
373 int xstart = x0 * stride_w - pad_w;
374 int xend = min(xstart + kernel_w, inp_width + pad_w);
375 int xdelta = xend - xstart;
376 xstart = max(xstart, 0);
377 xend = min(xend, inp_width);
378 float inv_kernel_area = 1.f/(ydelta*xdelta);
// SIMD path: 8 in-bounds output columns at once.
381 if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
383 v_float32x4 sum_val0 = v_setzero_f32(), sum_val1 = v_setzero_f32();
384 v_float32x4 ikarea = v_setall_f32(inv_kernel_area);
386 for (int y = ystart; y < yend; ++y)
388 for (int x = xstart; x < xend; ++x)
390 const int index = y * inp_width + x;
391 v_float32x4 v0(srcData[index], srcData[index + stride_w],
392 srcData[index + stride_w*2], srcData[index + stride_w*3]);
393 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
394 srcData[index + stride_w*6], srcData[index + stride_w*7]);
399 v_store(dstData + x0, sum_val0*ikarea);
400 v_store(dstData + x0 + 4, sum_val1*ikarea);
// Scalar fallback: accumulate then normalize by inv_kernel_area.
// NOTE(review): the sum_val declaration/accumulation lines are not
// visible in this excerpt.
407 for (int y = ystart; y < yend; ++y)
408 for (int x = xstart; x < xend; ++x)
410 const int index = y * inp_width + x;
411 float val = srcData[index];
415 dstData[x0] = sum_val*inv_kernel_area;
// Max pooling entry point: runs the parallel invoker with one stripe per
// available thread; `mask` receives the flat source index of each maximum.
423 void maxPooling(Mat &src, Mat &dst, Mat &mask)
425 const int nstripes = getNumThreads();
426 PoolingInvoker::run(src, dst, mask, kernel, stride, pad, type, computeMaxIdx, nstripes);
// Average pooling entry point: same invoker, no index map needed.
// NOTE(review): the `mask` passed below is presumably an empty local Mat
// declared on a line not visible in this excerpt — confirm against the
// full file.
429 void avePooling(Mat &src, Mat &dst)
431 const int nstripes = getNumThreads();
433 PoolingInvoker::run(src, dst, mask, kernel, stride, pad, type, computeMaxIdx, nstripes);
// Builds the Halide pipeline for MAX pooling. Produces a two-component
// output: the max value and (cast to float) the flat source index of the
// maximum, mirroring the default backend's value+mask outputs.
436 virtual Ptr<BackendNode> initMaxPoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
439 Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
440 const int inWidth = inputBuffer.width();
441 const int inHeight = inputBuffer.height();
443 Halide::Var x("x"), y("y"), c("c"), n("n");
444 Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
// Reduction domain spanning the pooling kernel window.
445 Halide::RDom r(0, kernel.width, 0, kernel.height);
// With padding, clamp reads so out-of-image taps replicate the border;
// without padding, only the right/bottom edges need clamping.
447 if (pad.width || pad.height)
449 kx = clamp(x * stride.width + r.x - pad.width, 0, inWidth - 1);
450 ky = clamp(y * stride.height + r.y - pad.height, 0, inHeight - 1);
454 kx = min(x * stride.width + r.x, inWidth - 1);
455 ky = min(y * stride.height + r.y, inHeight - 1);
458 // Halide::argmax returns tuple (r.x, r.y, max).
459 Halide::Tuple res = argmax(inputBuffer(kx, ky, c, n));
461 // Compute offset from argmax in range [0, kernel_size).
// Convert the in-kernel argmax coordinates back to a flat index in the
// source plane, applying the same clamping as the reads above.
462 Halide::Expr max_index;
463 if (pad.width || pad.height)
465 max_index = clamp(y * stride.height + res[1] - pad.height,
466 0, inHeight - 1) * inWidth +
467 clamp(x * stride.width + res[0] - pad.width,
472 max_index = min(y * stride.height + res[1], inHeight - 1) * inWidth +
473 min(x * stride.width + res[0], inWidth - 1);
// Two-element tuple output: {max value, index-as-float}.
475 top(x, y, c, n) = { res[2], Halide::cast<float>(max_index) };
476 return Ptr<BackendNode>(new HalideBackendNode(top));
477 #endif // HAVE_HALIDE
// Reached only when built without Halide support.
478 return Ptr<BackendNode>();
// Builds the Halide pipeline for AVE pooling: a plain windowed sum scaled
// by 1/kernel_area. Only exact tilings are supported — partial (clipped)
// kernels at the borders are rejected.
481 virtual Ptr<BackendNode> initAvePoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
484 Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
486 const int inW = inputBuffer.width(), inH = inputBuffer.height();
// Require the kernel/stride to tile the input exactly.
487 if ((inW - kernel.width) % stride.width || (inH - kernel.height) % stride.height)
489 CV_Error(cv::Error::StsNotImplemented,
490 "Halide backend for average pooling with partial "
491 "kernels is not implemented");
// Normalization factor: full kernel area (no border clipping here).
494 const float norm = 1.0f / (kernel.width * kernel.height);
496 Halide::Var x("x"), y("y"), c("c"), n("n");
497 Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
498 Halide::RDom r(0, kernel.width, 0, kernel.height);
499 top(x, y, c, n) = sum(
500 inputBuffer(x * stride.width + r.x,
501 y * stride.height + r.y, c, n)) * norm;
502 return Ptr<BackendNode>(new HalideBackendNode(top));
503 #endif // HAVE_HALIDE
// Reached only when built without Halide support.
504 return Ptr<BackendNode>();
// CPU-specific Halide schedule for the pooling Func: tile/fuse/parallelize
// based on the output spatial size. Non-CPU targets fall back to the
// generic Layer scheduler.
// NOTE(review): the vectorize/parallel chain terminators and some branch
// framing are not visible in this excerpt.
507 virtual void applyHalideScheduler(Ptr<BackendNode>& node,
508 const std::vector<Mat*> &inputs,
509 const std::vector<Mat> &outputs,
513 if (targetId != DNN_TARGET_CPU)
// Delegate scheduling of non-CPU targets to the base class.
515 Layer::applyHalideScheduler(node, inputs, outputs, targetId);
518 Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
519 xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
520 Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
522 int outW, outH, outC, outN;
523 getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
// Small spatial outputs: too little x/y work to split — split channels
// (or just fuse everything) instead.
525 if (outW < 8 || outH < 8)
528 top.split(c, co, ci, 8)
529 .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
534 top.fuse(y, c, tile).fuse(n, tile, tile)
// Large outputs: 8x8 spatial tiles, additionally split channels when
// there are enough of them.
543 top.split(x, xo, xi, 8).split(y, yo, yi, 8).split(c, co, ci, 8)
544 .fuse(xo, yo, tile).fuse(co, tile, tile).fuse(n, tile, tile)
548 top.split(x, xo, xi, 8).split(y, yo, yi, 8)
549 .fuse(xo, yo, tile).fuse(c, tile, tile).fuse(n, tile, tile)
553 #endif // HAVE_HALIDE
// Computes output blob shapes from input shapes (NCHW).
// For MAX pooling, two outputs per input are allocated: values and the
// max-index map, both with the same shape.
// NOTE(review): the globalPooling branch (around original lines 563-568) and
// the decrement statements inside the clipping checks are not visible in
// this excerpt.
556 bool getMemoryShapes(const std::vector<MatShape> &inputs,
557 const int requiredOutputs,
558 std::vector<MatShape> &outputs,
559 std::vector<MatShape> &internals) const
561 CV_Assert(inputs.size() != 0);
562 Size in(inputs[0][3], inputs[0][2]), out;
// Empty padMode => Caffe-style output size (ceil division).
569 else if (padMode.empty())
571 //Yeah, something strange Caffe scheme-)
572 out.height = static_cast<int>(ceil(static_cast<float>(in.height + 2 * pad.height -
573 kernel.height) / stride.height)) + 1;
574 out.width = static_cast<int>(ceil(static_cast<float>(in.width + 2 * pad.width -
575 kernel.width) / stride.width)) + 1;
577 if (pad.height || pad.width)
579 // If we have padding, ensure that the last pooling starts strictly
580 // inside the image (instead of at the padding); otherwise clip the last.
581 if ((out.height - 1) * stride.height >= in.height + pad.height)
583 if ((out.width - 1) * stride.width >= in.width + pad.width)
// Post-condition: the clipped sizes must now satisfy the constraint.
585 CV_Assert((out.height - 1) * stride.height < in.height + pad.height);
586 CV_Assert((out.width - 1) * stride.width < in.width + pad.width);
// Non-empty padMode (e.g. SAME/VALID): use the generic helper.
591 getConvPoolOutParams(in, kernel, stride,
// MAX pooling emits a value blob and an index blob per input.
595 outputs.resize(type == MAX ? 2 * inputs.size() : inputs.size());
596 for (size_t i = 0; i < inputs.size(); i++)
598 size_t index = type == MAX ? 2*i : i;
// Batch and channel counts are inherited from the input.
599 int dims[] = {inputs[i][0], inputs[i][1], out.height, out.width};
600 outputs[index] = shape(dims);
// Index map for MAX pooling shares the value blob's shape.
603 outputs[index + 1] = shape(dims);
// Estimates the layer's FLOP count: one comparison per kernel element for
// MAX pooling; one add per kernel element plus one multiply (the
// normalization) for AVE pooling.
// NOTE(review): the type-dispatch framing and the `flops` declaration are
// not visible in this excerpt.
609 virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
610 const std::vector<MatShape> &outputs) const
612 (void)inputs; // suppress unused variable warning
615 for(int i = 0; i < outputs.size(); i++)
620 flops += total(outputs[i])*kernel.area();
624 flops += total(outputs[i])*(kernel.area() + 1);
631 Ptr<PoolingLayer> PoolingLayer::create(const LayerParams& params)
633 return Ptr<PoolingLayer>(new PoolingLayerImpl(params));