1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Copyright (C) 2017, Intel Corporation, all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
43 #include "../precomp.hpp"
44 #include "layers_common.hpp"
45 #include "opencv2/core/hal/intrin.hpp"
46 #include "op_halide.hpp"
47 #include "opencl_kernels_dnn.hpp"
53 using namespace cv::dnn::ocl4dnn;
// Pooling layer implementation, derived from PoolingLayer.
61 class PoolingLayerImpl : public PoolingLayer
// Constructor: configures the layer from the given LayerParams.
//   params - layer parameters; the keys read here are pool and ceil_mode,
//            the rest is handed to getPoolingKernelParams and setParamsFrom.
// Raises Error::StsBadArg (through CV_Error) for an unrecognised pooling name.
64 PoolingLayerImpl(const LayerParams& params)
// MAX is the pooling type used when no pool parameter is supplied.
66 type = PoolingLayer::MAX;
69 if (params.has("pool"))
// The name is lower-cased first, so the comparison below is case-insensitive.
71 String pool = params.get<String>("pool").toLowerCase();
// NOTE(review): the guard that selects this first assignment is not visible
// in this listing - presumably a comparison of pool against the max name.
73 type = PoolingLayer::MAX;
74 else if (pool == "ave")
75 type = PoolingLayer::AVE;
76 else if (pool == "stochastic")
77 type = PoolingLayer::STOCHASTIC;
// Presumably the final fall-through branch: any other name is rejected.
79 CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
// Kernel, padding, stride, global-pooling flag and padding mode are passed to
// the shared helper, which presumably fills them in from params.
82 getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
83 pad.height, pad.width, stride.height, stride.width, padMode);
84 setParamsFrom(params);
// ceil_mode is read with true as the second (fallback) argument; it selects
// ceil versus floor rounding of the output size in getMemoryShapes.
85 ceilMode = params.get<bool>("ceil_mode", true);
// OpenCL pooling primitive; it is (re)created inside forward_ocl.
89 Ptr<OCL4DNNPool<float> > poolOp;
// Finalizes the layer once the input and output blobs are known.
//   inputs  - input blobs; exactly one is accepted (asserted below).
//   outputs - output blobs; only outputs[0] is inspected here.
92 void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
94 CV_Assert(inputs.size() == 1);
// Spatial sizes are built as Size(size[3], size[2]), i.e. Size(width, height)
// taken from the last two blob dimensions.
96 cv::Size inp(inputs[0]->size[3], inputs[0]->size[2]),
97 out(outputs[0].size[3], outputs[0].size[2]);
// Passes pad to the shared conv/pool helper together with the sizes, kernel,
// stride and padMode. NOTE(review): Size(1, 1) is presumably the dilation
// argument and pad presumably the output - confirm against the helper.
104 getConvPoolPaddings(inp, out, kernel, stride, padMode, Size(1, 1), pad);
// Reports whether this layer can run on the given backend.
// Returns true for the default backend, or for the Halide backend when Halide
// is available and the pooling type is MAX, or AVE with zero padding.
// Note that && binds tighter than ||, so the expression groups as
//   DEFAULT || (HALIDE && haveHalide() && (MAX || (AVE && no padding))).
107 virtual bool supportBackend(int backendId)
109 return backendId == DNN_BACKEND_DEFAULT ||
110 backendId == DNN_BACKEND_HALIDE && haveHalide() &&
111 (type == PoolingLayer::MAX ||
112 type == PoolingLayer::AVE && !pad.width && !pad.height);
// OpenCL forward pass built on the OCL4DNNPool primitive.
//   inputs    - input blobs, one pooling call per blob.
//   outputs   - output blobs, indexed as shown in the loop below.
//   internals - not referenced in the lines visible here.
// Returns a bool; forward passes this call to CV_OCL_RUN.
// NOTE(review): the return statements are not visible in this listing.
116 bool forward_ocl(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
// The primitive is configured from the first input and first output only.
// NOTE(review): the guard around this configuration is not visible here.
120 OCL4DNNPoolConfig config;
122 config.in_shape = shape(*inputs[0]);
123 config.out_shape = shape(outputs[0]);
124 config.kernel = kernel;
126 config.stride = stride;
127 config.channels = inputs[0]->size[1];
// Map the layer pooling type onto the primitive method enum; anything that is
// neither MAX nor AVE maps to the stochastic method.
128 config.pool_method = type == MAX ? LIBDNN_POOLING_METHOD_MAX :
129 (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
130 LIBDNN_POOLING_METHOD_STO);
131 poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
134 for (size_t ii = 0; ii < inputs.size(); ii++)
136 UMat inpMat, outMat, maskMat;
138 inpMat = inputs[ii]->getUMat(ACCESS_READ);
// Two output layouts follow: a value blob at 2*ii plus a mask blob at 2*ii+1,
// or a single blob at ii. NOTE(review): the branch condition is not visible;
// presumably type == MAX, matching the layout built in getMemoryShapes.
142 outMat = outputs[2 * ii].getUMat(ACCESS_WRITE);
143 maskMat = outputs[2 * ii + 1].getUMat(ACCESS_WRITE);
145 outMat = outputs[ii].getUMat(ACCESS_WRITE);
// Both buffers are required to start at offset zero.
149 CV_Assert(inpMat.offset == 0 && outMat.offset == 0);
// A false result from Forward is treated as a failure.
// NOTE(review): the handling statement is not visible in this listing.
151 if (!poolOp->Forward(inpMat, outMat, maskMat))
// Forward pass entry point.
//   inputs    - input blobs.
//   outputs   - output blobs; max pooling uses two per input (2*ii, 2*ii+1),
//               average pooling uses one per input.
//   internals - forwarded to forward_ocl only.
// Raises Error::StsNotImplemented (through CV_Error) for other pooling types.
159 void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
162 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
// The OpenCL path is attempted only for the OpenCL target and only when the
// default device reports itself as Intel.
164 CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
165 OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
166 forward_ocl(inputs, outputs, internals))
// CPU path. NOTE(review): the dispatch on type that chooses between the calls
// below is not visible in this listing.
168 for (size_t ii = 0; ii < inputs.size(); ii++)
173 maxPooling(*inputs[ii], outputs[2 * ii], outputs[2 * ii + 1]);
176 avePooling(*inputs[ii], outputs[ii]);
179 CV_Error(Error::StsNotImplemented, "Not implemented");
// Builds the Halide backend node for this layer.
//   inputs - backend wrappers of the layer inputs.
// Returns the node from the MAX or AVE builder; the last return yields an
// empty pointer (presumably the fall-through for other pooling types).
185 virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
187 if (type == PoolingLayer::MAX)
188 return initMaxPoolingHalide(inputs);
189 else if (type == PoolingLayer::AVE)
190 return initAvePoolingHalide(inputs);
192 return Ptr<BackendNode>();
// Parallel loop body that performs CPU pooling over stripes of the output.
// NOTE(review): the declarations of src, dst, mask, nstripes, computeMaxIdx
// and poolingType are not visible in this listing.
195 class PoolingInvoker : public ParallelLoopBody
// Pooling window size, step and padding, copied from the layer.
200 Size kernel, stride, pad;
// Offsets of every kernel cell relative to the window origin (filled in run).
203 std::vector<int> ofsbuf;
// Default state: null blob pointers, zero stripes, no index output, MAX type.
206 PoolingInvoker() : src(0), dst(0), mask(0), nstripes(0), computeMaxIdx(0), poolingType(PoolingLayer::MAX) {}
// Validates the blobs, fills a PoolingInvoker and runs it in parallel.
//   src           - input blob; must be continuous, CV_32F and 4-dimensional.
//   dst           - output blob; same type, and same size[0] and size[1] as src.
//   mask          - optional blob; if non-empty it must match dst in size and
//                   src in type.
//   kernel, stride, pad - pooling geometry.
//   poolingType   - pooling type value stored into the invoker.
//   computeMaxIdx - stored into the invoker.
//   nstripes      - number of stripes the work is divided into.
208 static void run(const Mat& src, Mat& dst, Mat& mask, Size kernel,
209 Size stride, Size pad, int poolingType,
210 bool computeMaxIdx, int nstripes)
212 CV_Assert(src.isContinuous() && dst.isContinuous() &&
213 src.type() == CV_32F && src.type() == dst.type() &&
214 src.dims == 4 && dst.dims == 4 &&
215 src.size[0] == dst.size[0] && src.size[1] == dst.size[1] &&
216 (mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));
// NOTE(review): the declaration of p and several field assignments are not
// visible in this listing.
226 p.nstripes = nstripes;
227 p.computeMaxIdx = computeMaxIdx;
228 p.poolingType = poolingType;
// Precompute, for kernel cell (i, j), its element offset inside one input
// plane: row i times the input width (src.size[3]) plus column j.
232 p.ofsbuf.resize(kernel.width*kernel.height);
233 for( int i = 0; i < kernel.height; i++ )
234 for( int j = 0; j < kernel.width; j++ )
235 p.ofsbuf[i*kernel.width + j] = src.size[3]*i + j;
// One range unit per stripe; the body converts stripe indices to element ranges.
238 parallel_for_(Range(0, nstripes), p, nstripes);
// Processes the stripes in range r: computes max or average pooling for a
// contiguous run of output elements of dst.
//   r - stripe indices [r.start, r.end) out of nstripes.
// NOTE(review): many guard lines (preprocessor conditions, if/else heads and
// some declarations) are not visible in this listing; comments about which
// path a group of lines belongs to are therefore marked as presumptions.
241 void operator()(const Range& r) const
243 int channels = dst->size[1], width = dst->size[3], height = dst->size[2];
244 int inp_width = src->size[3], inp_height = src->size[2];
// Split all output elements into nstripes chunks (ceiling division) and take
// the element range covered by r, clamped to the total.
245 size_t total = dst->total();
246 size_t stripeSize = (total + nstripes - 1)/nstripes;
247 size_t stripeStart = r.start*stripeSize;
248 size_t stripeEnd = std::min(r.end*stripeSize, total);
249 int kernel_w = kernel.width, kernel_h = kernel.height;
250 int pad_w = pad.width, pad_h = pad.height;
251 int stride_w = stride.width, stride_h = stride.height;
252 bool compMaxIdx = computeMaxIdx;
// SIMD helpers: the precomputed kernel offsets, the per-lane index offsets of
// four consecutive output columns, and a vector of ones used to advance
// indices. NOTE(review): the use of idx_delta is not visible here.
255 const int* ofsptr = &ofsbuf[0];
256 v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
257 v_float32x4 ones = v_setall_f32(1.f);
258 v_float32x4 idx_delta = v_setall_f32((float)(inp_width - kernel_w));
// Walk the stripe one output-row segment at a time.
// NOTE(review): the loop increment and the definition of ofs are not visible.
261 for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
// Decompose the linear offset into x, y, channel and batch coordinates.
// NOTE(review): the divisions of ofs between these lines are not visible.
264 int x0 = (int)(ofs % width);
266 int y0 = (int)(ofs % height);
268 int c = (int)(ofs % channels);
269 int n = (int)(ofs / channels);
// Vertical extent of the window. ydelta is taken before clipping to the
// image (but after limiting to inp_height + pad_h); the averaging branch
// uses it for the window area. ystart and yend are then clipped to the image.
270 int ystart = y0 * stride_h - pad_h;
271 int yend = min(ystart + kernel_h, inp_height + pad_h);
272 int ydelta = yend - ystart;
273 ystart = max(ystart, 0);
274 yend = min(yend, inp_height);
// srcData addresses plane (n, c) of the input; dstData and dstMaskData
// address row y0 of that plane in the output and mask (mask may be absent).
275 const float *srcData = src->ptr<float>(n, c);
276 float *dstData = dst->ptr<float>(n, c, y0);
277 float *dstMaskData = mask->data ? mask->ptr<float>(n, c, y0) : 0;
// Number of outputs handled in this row: limited by the stripe end and by
// the end of the row. NOTE(review): x1 is presumably x0 + delta.
279 int delta = std::min((int)(stripeEnd - ofs0), width - x0);
// ---- Max pooling ----
283 if( poolingType == PoolingLayer::MAX )
284 for( ; x0 < x1; x0++ )
286 int xstart = x0 * stride_w - pad_w;
287 int xend = min(xstart + kernel_w, inp_width);
288 xstart = max(xstart, 0);
// Vector path: requires the window to start strictly inside the row, eight
// outputs left in the segment, and the window of the eighth output to end
// before the right image border.
291 if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
// Presumably the variant that also records argmax indices: eight running
// maxima and eight running indices (kept as floats), starting at -FLT_MAX
// and -1.
295 v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
296 v_float32x4 max_val1 = max_val0;
297 v_float32x4 max_idx0 = v_setall_f32(-1.f);
298 v_float32x4 max_idx1 = max_idx0;
// Linear index of the window origin for lane 0; the other lanes are offset
// by multiples of stride_w.
299 int index0 = ystart * inp_width + xstart;
300 v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
301 v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));
303 for (int y = ystart; y < yend; ++y)
305 for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
// Gather the same window cell for eight neighbouring outputs.
307 const int index = y * inp_width + x;
308 v_float32x4 v0(srcData[index], srcData[index + stride_w],
309 srcData[index + stride_w*2], srcData[index + stride_w*3]);
310 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
311 srcData[index + stride_w*6], srcData[index + stride_w*7]);
// The index is replaced only where the new value is strictly greater, so
// the first occurrence of the maximum is kept; then the maximum is updated.
312 max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
313 max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
314 max_val0 = v_max(max_val0, v0);
315 max_val1 = v_max(max_val1, v1);
// Store eight maxima and, into the mask row, their eight indices.
320 v_store(dstData + x0, max_val0);
321 v_store(dstData + x0 + 4, max_val1);
324 v_store(dstMaskData + x0, max_idx0);
325 v_store(dstMaskData + x0 + 4, max_idx1);
// Presumably the vector variant without index tracking.
331 v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
332 v_float32x4 max_val1 = max_val0;
// When the window is not clipped vertically, the precomputed kernel offsets
// (ofsptr) can be applied directly from the window origin.
334 if( yend - ystart == kernel_h )
336 const float* srcData1 = srcData + ystart*inp_width + xstart;
// Contiguous loads of eight neighbouring inputs.
// NOTE(review): the guard is not visible; presumably stride_w == 1.
338 for (int k = 0; k < kernel_w*kernel_h; k++)
340 int index = ofsptr[k];
341 v_float32x4 v0 = v_load(srcData1 + index);
342 v_float32x4 v1 = v_load(srcData1 + index + 4);
343 max_val0 = v_max(max_val0, v0);
344 max_val1 = v_max(max_val1, v1);
// Stride 2: load sixteen consecutive inputs and shuffle to keep elements
// 0 and 2 of each loaded vector, i.e. every second input.
347 else if( stride_w == 2 )
348 for (int k = 0; k < kernel_w*kernel_h; k++)
350 int index = ofsptr[k];
351 v_float32x4 v00 = v_load(srcData1 + index), v01 = v_load(srcData1 + index + 4);
352 v_float32x4 v0(_mm_shuffle_ps(v00.val, v01.val, _MM_SHUFFLE(2, 0, 2, 0)));
353 v_float32x4 v10 = v_load(srcData1 + index + 8), v11 = v_load(srcData1 + index + 12);
354 v_float32x4 v1(_mm_shuffle_ps(v10.val, v11.val, _MM_SHUFFLE(2, 0, 2, 0)));
355 max_val0 = v_max(max_val0, v0);
356 max_val1 = v_max(max_val1, v1);
// Presumably the generic-stride case: gather element by element.
360 for (int k = 0; k < kernel_w*kernel_h; k++)
362 int index = ofsptr[k];
363 v_float32x4 v0(srcData1[index], srcData1[index + stride_w],
364 srcData1[index + stride_w*2], srcData1[index + stride_w*3]);
365 v_float32x4 v1(srcData1[index + stride_w*4], srcData1[index + stride_w*5],
366 srcData1[index + stride_w*6], srcData1[index + stride_w*7]);
367 max_val0 = v_max(max_val0, v0);
368 max_val1 = v_max(max_val1, v1);
// Presumably the vertically clipped case: iterate the clipped window
// explicitly instead of using the precomputed offsets.
373 for (int y = ystart; y < yend; ++y)
375 for (int x = xstart; x < xend; ++x)
377 const int index = y * inp_width + x;
378 v_float32x4 v0(srcData[index], srcData[index + stride_w],
379 srcData[index + stride_w*2], srcData[index + stride_w*3]);
380 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
381 srcData[index + stride_w*6], srcData[index + stride_w*7]);
382 max_val0 = v_max(max_val0, v0);
383 max_val1 = v_max(max_val1, v1);
387 v_store(dstData + x0, max_val0);
388 v_store(dstData + x0 + 4, max_val1);
// Scalar path: one output at a time.
395 float max_val = -FLT_MAX;
// Presumably the variant that records the index of the maximum.
// NOTE(review): the comparison and the max_index update are not visible.
399 for (int y = ystart; y < yend; ++y)
400 for (int x = xstart; x < xend; ++x)
402 const int index = y * inp_width + x;
403 float val = srcData[index];
411 dstData[x0] = max_val;
413 dstMaskData[x0] = max_index;
// Presumably the variant without index output: plain running maximum.
417 for (int y = ystart; y < yend; ++y)
418 for (int x = xstart; x < xend; ++x)
420 const int index = y * inp_width + x;
421 float val = srcData[index];
422 max_val = std::max(max_val, val);
425 dstData[x0] = max_val;
// ---- Presumably the average pooling branch ----
431 for( ; x0 < x1; x0++ )
// Horizontal extent, handled like the vertical one: xdelta is measured before
// clipping to the image (limited to inp_width + pad_w), so the divisor
// ydelta*xdelta counts cells that fall into the padding.
433 int xstart = x0 * stride_w - pad_w;
434 int xend = min(xstart + kernel_w, inp_width + pad_w);
435 int xdelta = xend - xstart;
436 xstart = max(xstart, 0);
437 xend = min(xend, inp_width);
438 float inv_kernel_area = 1.f/(ydelta*xdelta);
// Vector path under the same conditions as in the max branch.
441 if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
443 v_float32x4 sum_val0 = v_setzero_f32(), sum_val1 = v_setzero_f32();
444 v_float32x4 ikarea = v_setall_f32(inv_kernel_area);
446 for (int y = ystart; y < yend; ++y)
448 for (int x = xstart; x < xend; ++x)
// NOTE(review): the accumulation into sum_val0 and sum_val1 is not visible.
450 const int index = y * inp_width + x;
451 v_float32x4 v0(srcData[index], srcData[index + stride_w],
452 srcData[index + stride_w*2], srcData[index + stride_w*3]);
453 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
454 srcData[index + stride_w*6], srcData[index + stride_w*7]);
// Scale the eight sums by the inverse window area and store them.
459 v_store(dstData + x0, sum_val0*ikarea);
460 v_store(dstData + x0 + 4, sum_val1*ikarea);
// Scalar path. NOTE(review): the declaration of sum_val and its accumulation
// are not visible in this listing.
467 for (int y = ystart; y < yend; ++y)
468 for (int x = xstart; x < xend; ++x)
470 const int index = y * inp_width + x;
471 float val = srcData[index];
475 dstData[x0] = sum_val*inv_kernel_area;
// CPU max pooling of src into dst, with mask passed along for the indices.
// Uses one stripe per thread reported by getNumThreads and forwards the layer
// geometry, type and computeMaxIdx to PoolingInvoker::run.
483 void maxPooling(Mat &src, Mat &dst, Mat &mask)
485 const int nstripes = getNumThreads();
486 PoolingInvoker::run(src, dst, mask, kernel, stride, pad, type, computeMaxIdx, nstripes);
// CPU average pooling of src into dst through PoolingInvoker::run, using one
// stripe per thread reported by getNumThreads.
// NOTE(review): the declaration of mask is not visible in this listing;
// presumably a local empty Mat, since this function takes no mask argument.
489 void avePooling(Mat &src, Mat &dst)
491 const int nstripes = getNumThreads();
493 PoolingInvoker::run(src, dst, mask, kernel, stride, pad, type, computeMaxIdx, nstripes);
// Builds the Halide function for max pooling.
//   inputs - backend wrappers; only inputs[0] is used.
// Returns a HalideBackendNode wrapping a function that yields a pair per output
// element: the maximum and the linear index of its source position (as float).
// The trailing return yields an empty pointer; NOTE(review): the opening of
// the HAVE_HALIDE conditional is not visible in this listing.
496 virtual Ptr<BackendNode> initMaxPoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
499 Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
500 const int inWidth = inputBuffer.width();
501 const int inHeight = inputBuffer.height();
503 Halide::Var x("x"), y("y"), c("c"), n("n");
// The function is given the layer name when that name is not empty.
504 Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
// Reduction domain over every cell of the kernel window.
505 Halide::RDom r(0, kernel.width, 0, kernel.height);
// Source coordinates of each window cell. With padding they are clamped to
// the image on both sides; without padding only the upper bound is enforced.
// NOTE(review): the declarations of kx and ky are not visible here.
507 if (pad.width || pad.height)
509 kx = clamp(x * stride.width + r.x - pad.width, 0, inWidth - 1);
510 ky = clamp(y * stride.height + r.y - pad.height, 0, inHeight - 1);
514 kx = min(x * stride.width + r.x, inWidth - 1);
515 ky = min(y * stride.height + r.y, inHeight - 1);
518 // Halide::argmax returns tuple (r.x, r.y, max).
519 Halide::Tuple res = argmax(inputBuffer(kx, ky, c, n));
521 // Compute offset from argmax in range [0, kernel_size).
// The argmax position inside the window is mapped back to image coordinates,
// with the same clamping as above, and flattened as row * inWidth + column.
522 Halide::Expr max_index;
523 if (pad.width || pad.height)
525 max_index = clamp(y * stride.height + res[1] - pad.height,
526 0, inHeight - 1) * inWidth +
527 clamp(x * stride.width + res[0] - pad.width,
532 max_index = min(y * stride.height + res[1], inHeight - 1) * inWidth +
533 min(x * stride.width + res[0], inWidth - 1);
// Two-valued definition: the maximum and its index cast to float.
535 top(x, y, c, n) = { res[2], Halide::cast<float>(max_index) };
536 return Ptr<BackendNode>(new HalideBackendNode(top));
537 #endif // HAVE_HALIDE
538 return Ptr<BackendNode>();
// Builds the Halide function for average pooling.
//   inputs - backend wrappers; only inputs[0] is used.
// Returns a HalideBackendNode for the averaging function; the trailing return
// yields an empty pointer. Raises Error::StsNotImplemented (through CV_Error)
// when the windows do not tile the input exactly.
// NOTE(review): the opening of the HAVE_HALIDE conditional is not visible.
541 virtual Ptr<BackendNode> initAvePoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
544 Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
// Partial windows are rejected: (input size - kernel size) must be a multiple
// of the stride in both directions.
546 const int inW = inputBuffer.width(), inH = inputBuffer.height();
547 if ((inW - kernel.width) % stride.width || (inH - kernel.height) % stride.height)
549 CV_Error(cv::Error::StsNotImplemented,
550 "Halide backend for average pooling with partial "
551 "kernels is not implemented");
// Every window is full, so one constant factor normalises all sums.
554 const float norm = 1.0f / (kernel.width * kernel.height);
556 Halide::Var x("x"), y("y"), c("c"), n("n");
557 Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
558 Halide::RDom r(0, kernel.width, 0, kernel.height);
// Sum over the window cells, scaled by the inverse kernel area. No padding
// offset is applied to the source coordinates.
559 top(x, y, c, n) = sum(
560 inputBuffer(x * stride.width + r.x,
561 y * stride.height + r.y, c, n)) * norm;
562 return Ptr<BackendNode>(new HalideBackendNode(top));
563 #endif // HAVE_HALIDE
564 return Ptr<BackendNode>();
// Applies a scheduling strategy to the Halide function of this layer.
//   node    - backend node; cast to HalideBackendNode and its last function
//             is the one scheduled.
//   inputs  - layer inputs, forwarded to the base scheduler only.
//   outputs - layer outputs; outputs[0] provides the output dimensions.
// NOTE(review): the remaining parameter (targetId), the tails of the schedule
// chains and several branch conditions are not visible in this listing.
567 virtual void applyHalideScheduler(Ptr<BackendNode>& node,
568 const std::vector<Mat*> &inputs,
569 const std::vector<Mat> &outputs,
// Non-CPU targets are delegated to the base class scheduler.
573 if (targetId != DNN_TARGET_CPU)
575 Layer::applyHalideScheduler(node, inputs, outputs, targetId);
578 Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
579 xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
580 Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
582 int outW, outH, outC, outN;
583 getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
// Small outputs (either spatial side below 8): no spatial tiling. One
// variant splits the channels by 8 and fuses the rest into tile; the other
// fuses y, c and n without a channel split.
585 if (outW < 8 || outH < 8)
588 top.split(c, co, ci, 8)
589 .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
594 top.fuse(y, c, tile).fuse(n, tile, tile)
// Presumably the larger-output case: x and y are split into 8x8 tiles, with
// or without an additional channel split by 8.
603 top.split(x, xo, xi, 8).split(y, yo, yi, 8).split(c, co, ci, 8)
604 .fuse(xo, yo, tile).fuse(co, tile, tile).fuse(n, tile, tile)
608 top.split(x, xo, xi, 8).split(y, yo, yi, 8)
609 .fuse(xo, yo, tile).fuse(c, tile, tile).fuse(n, tile, tile)
613 #endif // HAVE_HALIDE
// Computes the output blob shapes for the given input shapes.
//   inputs          - input shapes; must not be empty. The spatial size is
//                     taken from inputs[0] (index 3 as width, 2 as height).
//   requiredOutputs - not referenced in the lines visible here.
//   outputs         - receives the shapes: two per input for MAX pooling,
//                     otherwise one per input.
//   internals       - not referenced in the lines visible here.
// NOTE(review): the return statement and the first branch of the size
// computation are not visible in this listing.
616 bool getMemoryShapes(const std::vector<MatShape> &inputs,
617 const int requiredOutputs,
618 std::vector<MatShape> &outputs,
619 std::vector<MatShape> &internals) const
621 CV_Assert(inputs.size() != 0);
622 Size in(inputs[0][3], inputs[0][2]), out;
// Explicit padding (empty padMode):
//   out = 1 + round((in + 2*pad - kernel) / stride)
// where round is ceil when ceilMode is set and floor otherwise.
629 else if (padMode.empty())
631 float height = (float)(in.height + 2 * pad.height - kernel.height) / stride.height;
632 float width = (float)(in.width + 2 * pad.width - kernel.width) / stride.width;
633 out.height = 1 + (ceilMode ? ceil(height) : floor(height));
634 out.width = 1 + (ceilMode ? ceil(width) : floor(width));
636 if (pad.height || pad.width)
638 // If we have padding, ensure that the last pooling starts strictly
639 // inside the image (instead of at the padding); otherwise clip the last.
// NOTE(review): the statements executed by the two checks below are not
// visible; presumably they reduce out.height and out.width by one.
640 if ((out.height - 1) * stride.height >= in.height + pad.height)
642 if ((out.width - 1) * stride.width >= in.width + pad.width)
644 CV_Assert((out.height - 1) * stride.height < in.height + pad.height);
645 CV_Assert((out.width - 1) * stride.width < in.width + pad.width);
// Presumably the non-empty padMode case: the shared helper receives out.
650 getConvPoolOutParams(in, kernel, stride, padMode, Size(1, 1), out);
// MAX pooling reserves two consecutive slots per input; other types one.
653 outputs.resize(type == MAX ? 2 * inputs.size() : inputs.size());
654 for (size_t i = 0; i < inputs.size(); i++)
656 size_t index = type == MAX ? 2*i : i;
// Dimensions 0 and 1 are copied from the input; the spatial size is out.
657 int dims[] = {inputs[i][0], inputs[i][1], out.height, out.width};
658 outputs[index] = shape(dims);
// The slot after the value blob gets the same shape.
// NOTE(review): the guard is not visible; presumably type == MAX.
661 outputs[index + 1] = shape(dims);
// Estimates the operation count of the layer from its output shapes.
//   inputs  - unused (explicitly silenced below).
//   outputs - output shapes; each visited entry adds to the estimate.
// NOTE(review): the declaration of flops, the conditions that choose between
// the two formulas and the return statement are not visible in this listing.
667 virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
668 const std::vector<MatShape> &outputs) const
670 (void)inputs; // suppress unused variable warning
// Note: i is a signed int compared against the unsigned outputs.size().
673 for(int i = 0; i < outputs.size(); i++)
// One formula counts kernel.area() operations per output element ...
678 flops += total(outputs[i])*kernel.area();
// ... the other counts kernel.area() + 1 per output element.
682 flops += total(outputs[i])*(kernel.area() + 1);
// Factory: creates a pooling layer configured from params.
//   params - layer parameters passed to the PoolingLayerImpl constructor.
// Returns a Ptr<PoolingLayer> that owns the new PoolingLayerImpl.
689 Ptr<PoolingLayer> PoolingLayer::create(const LayerParams& params)
691 return Ptr<PoolingLayer>(new PoolingLayerImpl(params));