Merge pull request #16472 from l-bat:cp_vton
[platform/upstream/opencv.git] / modules / dnn / src / onnx / onnx_importer.cpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4
5 // Copyright (C) 2018, Intel Corporation, all rights reserved.
6 // Third party copyrights are property of their respective owners.
7
8 #include "../precomp.hpp"
9 #include <opencv2/dnn/shape_utils.hpp>
10
11 #ifdef HAVE_PROTOBUF
12
13 #include <iostream>
14 #include <fstream>
15 #include <string>
16 #include <limits>
17 #include <algorithm>
18
19
20 #if defined(__GNUC__) && __GNUC__ >= 5
21 #pragma GCC diagnostic push
22 #pragma GCC diagnostic ignored "-Wsuggest-override"
23 #endif
24 #include "opencv-onnx.pb.h"
25 #if defined(__GNUC__) && __GNUC__ >= 5
26 #pragma GCC diagnostic pop
27 #endif
28
29 #include "onnx_graph_simplifier.hpp"
30
31 namespace cv {
32 namespace dnn {
33 CV__DNN_EXPERIMENTAL_NS_BEGIN
34
35
36 class ONNXImporter
37 {
38     opencv_onnx::ModelProto model_proto;
39     struct LayerInfo {
40         int layerId;
41         int outputId;
42         LayerInfo(int _layerId, int _outputId) : layerId(_layerId), outputId(_outputId) {}
43     };
44
45     std::map<std::string, Mat> getGraphTensors(
46                                     const opencv_onnx::GraphProto& graph_proto);
47     Mat getBlob(const opencv_onnx::NodeProto& node_proto, const std::map<std::string, Mat>& constBlobs, int index);
48
49     LayerParams getLayerParams(const opencv_onnx::NodeProto& node_proto);
50     bool isCeilMode(const LayerParams& layerParams);
51
52 public:
53
54     ONNXImporter(const char *onnxFile)
55     {
56         std::fstream input(onnxFile, std::ios::in | std::ios::binary);
57
58         if (!model_proto.ParseFromIstream(&input))
59             CV_Error(Error::StsUnsupportedFormat, "Failed to parse onnx model");
60     }
61
62     ONNXImporter(const char* buffer, size_t sizeBuffer)
63     {
64         struct _Buf : public std::streambuf
65         {
66             _Buf(const char* buffer, size_t sizeBuffer)
67             {
68                 char* p = const_cast<char*>(buffer);
69                 setg(p, p, p + sizeBuffer);
70             }
71         };
72
73         _Buf buf(buffer, sizeBuffer);
74         std::istream input(&buf);
75
76         if (!model_proto.ParseFromIstream(&input))
77             CV_Error(Error::StsUnsupportedFormat, "Failed to parse onnx model from in-memory byte array.");
78     }
79
80     void populateNet(Net dstNet);
81 };
82
83 inline void replaceLayerParam(LayerParams& layerParams, const String& oldKey, const String& newKey)
84 {
85     if (layerParams.has(oldKey)) {
86         layerParams.set(newKey, layerParams.get(oldKey));
87         layerParams.erase(oldKey);
88     }
89 }
90
91 void releaseONNXTensor(opencv_onnx::TensorProto& tensor_proto)
92 {
93     if (!tensor_proto.raw_data().empty()) {
94         delete tensor_proto.release_raw_data();
95     }
96 }
97
98 template<typename T1, typename T2>
99 void convertInt64ToInt32(const T1& src, T2& dst, int size)
100 {
101     for (int i = 0; i < size; i++) {
102         if (src[i] < std::numeric_limits<int32_t>::min() || src[i] > std::numeric_limits<int32_t>::max()) {
103             CV_Error(Error::StsOutOfRange, "Input is out of OpenCV 32S range");
104         }
105         dst[i] = saturate_cast<int32_t>(src[i]);
106     }
107 }
108
109 Mat getMatFromTensor(opencv_onnx::TensorProto& tensor_proto)
110 {
111     CV_Assert(!tensor_proto.raw_data().empty() || !tensor_proto.float_data().empty()
112                     || !tensor_proto.double_data().empty() || !tensor_proto.int64_data().empty());
113
114     opencv_onnx::TensorProto_DataType datatype = tensor_proto.data_type();
115     Mat blob;
116     std::vector<int> sizes;
117     for (int i = 0; i < tensor_proto.dims_size(); i++) {
118             sizes.push_back(tensor_proto.dims(i));
119     }
120     if (sizes.empty())
121         sizes.assign(1, 1);
122     if (datatype == opencv_onnx::TensorProto_DataType_FLOAT) {
123
124         if (!tensor_proto.float_data().empty()) {
125             const ::google::protobuf::RepeatedField<float> field = tensor_proto.float_data();
126             Mat(sizes, CV_32FC1, (void*)field.data()).copyTo(blob);
127         }
128         else {
129             char* val = const_cast<char*>(tensor_proto.raw_data().c_str());
130             Mat(sizes, CV_32FC1, val).copyTo(blob);
131         }
132     }
133     else if (datatype == opencv_onnx::TensorProto_DataType_DOUBLE)
134     {
135         const ::google::protobuf::RepeatedField<double> field = tensor_proto.double_data();
136         CV_Assert(!field.empty());
137         Mat(sizes, CV_64FC1, (void*)field.data()).convertTo(blob, CV_32FC1);
138     }
139     else if (datatype == opencv_onnx::TensorProto_DataType_INT64)
140     {
141         blob.create(sizes, CV_32SC1);
142         int32_t* dst = reinterpret_cast<int32_t*>(blob.data);
143
144         if (!tensor_proto.int64_data().empty()) {
145             ::google::protobuf::RepeatedField< ::google::protobuf::int64> src = tensor_proto.int64_data();
146             convertInt64ToInt32(src, dst, blob.total());
147         }
148         else
149         {
150             const char* val = tensor_proto.raw_data().c_str();
151 #if CV_STRONG_ALIGNMENT
152             // Aligned pointer is required: https://github.com/opencv/opencv/issues/16373
153             // this doesn't work: typedef int64_t CV_DECL_ALIGNED(1) unaligned_int64_t;
154             AutoBuffer<int64_t, 16> aligned_val;
155             if (!isAligned<sizeof(int64_t)>(val))
156             {
157                 size_t sz = tensor_proto.raw_data().size();
158                 aligned_val.allocate(divUp(sz, sizeof(int64_t)));
159                 memcpy(aligned_val.data(), val, sz);
160                 val = (const char*)aligned_val.data();
161             }
162 #endif
163             const int64_t* src = reinterpret_cast<const int64_t*>(val);
164             convertInt64ToInt32(src, dst, blob.total());
165         }
166     }
167     else
168         CV_Error(Error::StsUnsupportedFormat, "Unsupported data type: " +
169                         opencv_onnx::TensorProto_DataType_Name(datatype));
170     if (tensor_proto.dims_size() == 0)
171         blob.dims = 1;  // To force 1-dimensional cv::Mat for scalars.
172     return blob;
173 }
174
175 void runLayer(LayerParams& params, const std::vector<Mat>& inputs,
176               std::vector<Mat>& outputs)
177 {
178     Ptr<Layer> layer = LayerFactory::createLayerInstance(params.type, params);
179     CV_Assert((bool)layer);
180
181     std::vector<MatShape> inpShapes(inputs.size());
182     int ddepth = CV_32F;
183     for (size_t i = 0; i < inputs.size(); ++i)
184     {
185         inpShapes[i] = shape(inputs[i]);
186         if (i > 0 && ddepth != inputs[i].depth())
187             CV_Error(Error::StsNotImplemented, "Mixed input data types.");
188         ddepth = inputs[i].depth();
189     }
190
191     std::vector<MatShape> outShapes, internalShapes;
192     layer->getMemoryShapes(inpShapes, 0, outShapes, internalShapes);
193
194     std::vector<Mat> internals(internalShapes.size());
195     outputs.resize(outShapes.size());
196     for (size_t i = 0; i < outShapes.size(); ++i)
197         outputs[i].create(outShapes[i], ddepth);
198     for (size_t i = 0; i < internalShapes.size(); ++i)
199         internals[i].create(internalShapes[i], ddepth);
200
201     layer->finalize(inputs, outputs);
202     layer->forward(inputs, outputs, internals);
203 }
204
205 std::map<std::string, Mat> ONNXImporter::getGraphTensors(
206                                         const opencv_onnx::GraphProto& graph_proto)
207 {
208   opencv_onnx::TensorProto tensor_proto;
209   std::map<std::string, Mat> layers_weights;
210
211   for (int i = 0; i < graph_proto.initializer_size(); i++)
212   {
213     tensor_proto = graph_proto.initializer(i);
214     Mat mat = getMatFromTensor(tensor_proto);
215     releaseONNXTensor(tensor_proto);
216     layers_weights.insert(std::make_pair(tensor_proto.name(), mat));
217   }
218   return layers_weights;
219 }
220
221 static DictValue parse(const ::google::protobuf::RepeatedField< ::google::protobuf::int64>& src) {
222     std::vector<int32_t> dst(src.size());
223     convertInt64ToInt32(src, dst, src.size());
224     return DictValue::arrayInt(&dst[0], src.size());
225 }
226
227 LayerParams ONNXImporter::getLayerParams(const opencv_onnx::NodeProto& node_proto)
228 {
229     LayerParams lp;
230     for(int i = 0; i < node_proto.attribute_size(); i++)
231     {
232         opencv_onnx::AttributeProto attribute_proto = node_proto.attribute(i);
233         std::string attribute_name = attribute_proto.name();
234
235         if(attribute_name == "kernel_shape")
236         {
237             CV_Assert(attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3);
238             lp.set("kernel_size", parse(attribute_proto.ints()));
239         }
240         else if(attribute_name == "strides")
241         {
242             CV_Assert(attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3);
243             lp.set("stride", parse(attribute_proto.ints()));
244         }
245         else if(attribute_name == "pads")
246         {
247             if (node_proto.op_type() == "Pad")
248             {
249                 // Padding layer.
250                 // Paddings are in order begin0, begin1, .. beginN, end0, end1, ..., endN.
251                 // We need to shuffle it to begin0, end0, begin1, end1, ...
252                 CV_Assert(attribute_proto.ints_size() % 2 == 0);
253                 const int dims = attribute_proto.ints_size() / 2;
254                 std::vector<int32_t> paddings;
255                 paddings.reserve(attribute_proto.ints_size());
256                 for (int i = 0; i < dims; ++i)
257                 {
258                     paddings.push_back(attribute_proto.ints(i));
259                     paddings.push_back(attribute_proto.ints(dims + i));
260                 }
261                 lp.set("paddings", DictValue::arrayInt(&paddings[0], paddings.size()));
262             }
263             else
264             {
265                 // Convolution or pooling.
266                 CV_Assert(attribute_proto.ints_size() == 4 || attribute_proto.ints_size() == 6);
267                 lp.set("pad", parse(attribute_proto.ints()));
268             }
269         }
270         else if(attribute_name == "auto_pad")
271         {
272             if (attribute_proto.s() == "SAME_UPPER" || attribute_proto.s() == "SAME_LOWER") {
273                 lp.set("pad_mode",  "SAME");
274             }
275             else if (attribute_proto.s() == "VALID") {
276                 lp.set("pad_mode", "VALID");
277             }
278         }
279         else if(attribute_name == "dilations")
280         {
281             CV_Assert(attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3);
282             lp.set("dilation", parse(attribute_proto.ints()));
283         }
284         else if (attribute_proto.has_i())
285         {
286             ::google::protobuf::int64 src = attribute_proto.i();
287             if (src < std::numeric_limits<int32_t>::min() || src > std::numeric_limits<int32_t>::max())
288                 CV_Error(Error::StsOutOfRange, "Input is out of OpenCV 32S range");
289             else
290                 lp.set(attribute_name, saturate_cast<int32_t>(src));
291         }
292         else if (attribute_proto.has_f())
293         {
294             lp.set(attribute_name, attribute_proto.f());
295         }
296         else if (attribute_proto.has_s())
297         {
298             lp.set(attribute_name, attribute_proto.s());
299         }
300         else if (attribute_proto.floats_size() > 0)
301         {
302             lp.set(attribute_name, DictValue::arrayReal(
303                 attribute_proto.floats().data(), attribute_proto.floats_size()));
304         }
305         else if (attribute_proto.ints_size() > 0)
306         {
307             lp.set(attribute_proto.name(), parse(attribute_proto.ints()));
308         }
309         else if (attribute_proto.has_t())
310         {
311             opencv_onnx::TensorProto tensor = attribute_proto.t();
312             Mat blob = getMatFromTensor(tensor);
313             lp.blobs.push_back(blob);
314         }
315         else if (attribute_proto.has_g() || attribute_proto.strings_size() > 0 ||
316                     attribute_proto.tensors_size() > 0 || attribute_proto.graphs_size() > 0)
317         {
318                 CV_Error(Error::StsNotImplemented, "Unexpected attribute type");
319         }
320         else
321             CV_Error(Error::StsNotImplemented, "Unsupported attribute type");
322     }
323     return lp;
324 }
325
326 Mat ONNXImporter::getBlob(const opencv_onnx::NodeProto& node_proto,
327                     const std::map<std::string, Mat>& constBlobs, int index)
328 {
329     CV_Assert(index < node_proto.input_size());
330     std::map<std::string, Mat>::const_iterator constBlob;
331     constBlob = constBlobs.find(node_proto.input(index));
332     if (constBlob == constBlobs.end()) {
333         CV_Error(Error::StsObjectNotFound,
334              "Blob " + node_proto.input(index) + " not found in const blobs");
335     }
336     return constBlob->second;
337 }
338
339 void ONNXImporter::populateNet(Net dstNet)
340 {
341     CV_Assert(model_proto.has_graph());
342     opencv_onnx::GraphProto graph_proto = model_proto.graph();
343
344     simplifySubgraphs(graph_proto);
345
346     std::map<std::string, Mat> constBlobs = getGraphTensors(graph_proto);
347     // List of internal blobs shapes.
348     std::map<std::string, MatShape> outShapes;
349     // Add all the inputs shapes. It includes as constant blobs as network's inputs shapes.
350     for (int i = 0; i < graph_proto.input_size(); ++i)
351     {
352         opencv_onnx::ValueInfoProto valueInfoProto = graph_proto.input(i);
353         CV_Assert(valueInfoProto.has_type());
354         opencv_onnx::TypeProto typeProto = valueInfoProto.type();
355         CV_Assert(typeProto.has_tensor_type());
356         opencv_onnx::TypeProto::Tensor tensor = typeProto.tensor_type();
357         CV_Assert(tensor.has_shape());
358         opencv_onnx::TensorShapeProto tensorShape = tensor.shape();
359
360         MatShape inpShape(tensorShape.dim_size());
361         for (int j = 0; j < inpShape.size(); ++j)
362         {
363             inpShape[j] = tensorShape.dim(j).dim_value();
364         }
365         outShapes[valueInfoProto.name()] = inpShape;
366     }
367
368     std::string framework_name;
369     if (model_proto.has_producer_name()) {
370         framework_name = model_proto.producer_name();
371     }
372
373     // create map with network inputs (without const blobs)
374     std::map<std::string, LayerInfo> layer_id;
375     std::map<std::string, LayerInfo>::iterator layerId;
376     std::map<std::string, MatShape>::iterator shapeIt;
377     // fill map: push layer name, layer id and output id
378     std::vector<String> netInputs;
379     for (int j = 0; j < graph_proto.input_size(); j++)
380     {
381         const std::string& name = graph_proto.input(j).name();
382         if (constBlobs.find(name) == constBlobs.end()) {
383             netInputs.push_back(name);
384             layer_id.insert(std::make_pair(name, LayerInfo(0, netInputs.size() - 1)));
385         }
386     }
387     dstNet.setInputsNames(netInputs);
388
389     int layersSize = graph_proto.node_size();
390     LayerParams layerParams;
391     opencv_onnx::NodeProto node_proto;
392
393     for(int li = 0; li < layersSize; li++)
394     {
395         node_proto = graph_proto.node(li);
396         layerParams = getLayerParams(node_proto);
397         CV_Assert(node_proto.output_size() >= 1);
398         layerParams.name = node_proto.output(0);
399
400         std::string layer_type = node_proto.op_type();
401         layerParams.type = layer_type;
402
403
404         if (layer_type == "MaxPool")
405         {
406             layerParams.type = "Pooling";
407             layerParams.set("pool", "MAX");
408             layerParams.set("ceil_mode", layerParams.has("pad_mode"));
409         }
410         else if (layer_type == "AveragePool")
411         {
412             layerParams.type = "Pooling";
413             layerParams.set("pool", "AVE");
414             layerParams.set("ceil_mode", layerParams.has("pad_mode"));
415             layerParams.set("ave_pool_padded_area", framework_name == "pytorch");
416         }
417         else if (layer_type == "GlobalAveragePool" || layer_type == "GlobalMaxPool" || layer_type == "ReduceMean")
418         {
419             CV_Assert(node_proto.input_size() == 1);
420             layerParams.type = "Pooling";
421             layerParams.set("pool", layer_type == "GlobalMaxPool"? "MAX" : "AVE");
422             layerParams.set("global_pooling", layer_type == "GlobalAveragePool" || layer_type == "GlobalMaxPool");
423
424             if (layer_type == "ReduceMean")
425             {
426                 if (layerParams.get<int>("keepdims") == 0 || !layerParams.has("axes"))
427                     CV_Error(Error::StsNotImplemented, "Unsupported mode of ReduceMean operation.");
428
429                 MatShape inpShape = outShapes[node_proto.input(0)];
430                 if (inpShape.size() != 4 && inpShape.size() != 5)
431                     CV_Error(Error::StsNotImplemented, "Unsupported input shape of reduce_mean operation.");
432
433                 DictValue axes = layerParams.get("axes");
434                 CV_Assert(axes.size() <= inpShape.size() - 2);
435                 std::vector<int> kernel_size(inpShape.size() - 2, 1);
436                 for (int i = 0; i < axes.size(); i++) {
437                     int axis = axes.get<int>(i);
438                     CV_Assert_N(axis >= 2 + i, axis < inpShape.size());
439                     kernel_size[axis - 2] = inpShape[axis];
440                 }
441
442                 layerParams.set("kernel_size", DictValue::arrayInt(&kernel_size[0], kernel_size.size()));
443             }
444         }
445         else if (layer_type == "Slice")
446         {
447             if (layerParams.has("steps")) {
448                 DictValue steps = layerParams.get("steps");
449                 for (int i = 0; i < steps.size(); ++i) {
450                     if (steps.get<int>(i) != 1)
451                         CV_Error(Error::StsNotImplemented,
452                                  "Slice layer only supports steps = 1");
453                 }
454             }
455
456             int axis = 0;
457             if (layerParams.has("axes")) {
458                 DictValue axes = layerParams.get("axes");
459                 for (int i = 1; i < axes.size(); ++i) {
460                     CV_Assert(axes.get<int>(i - 1) == axes.get<int>(i) - 1);
461                 }
462                 axis = axes.get<int>(0);
463             }
464             layerParams.set("axis", axis);
465
466             DictValue starts = layerParams.get("starts");
467             DictValue ends = layerParams.get("ends");
468             CV_Assert(starts.size() == ends.size());
469
470             std::vector<int> begin;
471             std::vector<int> end;
472             if (axis > 0) {
473                 begin.resize(axis, 0);
474                 end.resize(axis, -1);
475             }
476
477             for (int i = 0; i < starts.size(); ++i)
478             {
479                 begin.push_back(starts.get<int>(i));
480                 int finish = ends.get<int>(i);
481                 end.push_back((finish < 0) ? --finish : finish); // numpy doesn't include last dim
482             }
483             layerParams.set("begin", DictValue::arrayInt(&begin[0], begin.size()));
484             layerParams.set("end", DictValue::arrayInt(&end[0], end.size()));
485          }
486         else if (layer_type == "Split")
487         {
488             if (layerParams.has("split"))
489             {
490                 DictValue splits = layerParams.get("split");
491                 const int numSplits = splits.size();
492                 CV_Assert(numSplits > 1);
493
494                 std::vector<int> slicePoints(numSplits - 1, splits.get<int>(0));
495                 for (int i = 1; i < splits.size() - 1; ++i)
496                 {
497                     slicePoints[i] = slicePoints[i - 1] + splits.get<int>(i - 1);
498                 }
499                 layerParams.set("slice_point", DictValue::arrayInt(&slicePoints[0], slicePoints.size()));
500             }
501             else
502             {
503                 layerParams.set("num_split", node_proto.output_size());
504             }
505             layerParams.type = "Slice";
506         }
507         else if (layer_type == "Add" || layer_type == "Sum")
508         {
509             if (layer_id.find(node_proto.input(1)) == layer_id.end())
510             {
511                 Mat blob = getBlob(node_proto, constBlobs, 1);
512                 blob = blob.reshape(1, 1);
513                 if (blob.total() == 1) {
514                     layerParams.type = "Power";
515                     layerParams.set("shift", blob.at<float>(0));
516                 }
517                 else {
518                     layerParams.type = "Scale";
519                     layerParams.set("bias_term", true);
520                     layerParams.blobs.push_back(blob);
521                 }
522             }
523             else {
524                 layerParams.type = "Eltwise";
525             }
526         }
527         else if (layer_type == "Max")
528         {
529             layerParams.type = "Eltwise";
530             layerParams.set("operation", "max");
531         }
532         else if (layer_type == "Sub")
533         {
534             Mat blob = getBlob(node_proto, constBlobs, 1);
535             if (blob.total() == 1) {
536                 layerParams.type = "Power";
537                 layerParams.set("shift", -blob.at<float>(0));
538             }
539             else {
540                 layerParams.type = "Scale";
541                 layerParams.set("has_bias", true);
542                 layerParams.blobs.push_back(-1.0f * blob.reshape(1, 1));
543             }
544         }
545         else if (layer_type == "Div")
546         {
547             if (constBlobs.find(node_proto.input(1)) == constBlobs.end())
548             {
549                 layerParams.type = "Eltwise";
550                 layerParams.set("operation", "div");
551             }
552             else
553             {
554                 Mat blob = getBlob(node_proto, constBlobs, 1);
555                 CV_Assert_N(blob.type() == CV_32F, blob.total());
556                 if (blob.total() == 1)
557                 {
558                     layerParams.set("scale", 1.0f / blob.at<float>(0));
559                     layerParams.type = "Power";
560                 }
561                 else
562                 {
563                     layerParams.type = "Scale";
564                     divide(1.0, blob, blob);
565                     layerParams.blobs.push_back(blob);
566                     layerParams.set("bias_term", false);
567                 }
568             }
569         }
570         else if (layer_type == "Neg")
571         {
572             layerParams.type = "Power";
573             layerParams.set("scale", -1);
574         }
575         else if (layer_type == "Constant")
576         {
577             CV_Assert(node_proto.input_size() == 0);
578             CV_Assert(layerParams.blobs.size() == 1);
579             constBlobs.insert(std::make_pair(layerParams.name, layerParams.blobs[0]));
580             continue;
581         }
582         else if (layer_type == "ImageScaler")
583         {
584             const float scale = layerParams.has("scale") ? layerParams.get<float>("scale") : 1.0f;
585             layerParams.erase("scale");
586
587             if (layerParams.has("bias"))
588             {
589                 layerParams.type = "Scale";
590                 layerParams.blobs.push_back(
591                     Mat(Size(1,  layerParams.get("bias").size()), CV_32FC1, scale));
592
593                 layerParams.set("bias_term", true);
594                 Mat bias(1, layerParams.get("bias").size(), CV_32FC1);
595                 for (int j = 0; j < bias.total(); j++) {
596                     bias.at<float>(0, j) = layerParams.get("bias").getRealValue(j);
597                 }
598                 layerParams.blobs.push_back(bias);
599                 layerParams.erase("bias");
600             }
601             else {
602                 layerParams.set("scale", scale);
603                 layerParams.type = "Power";
604             }
605         }
606         else if (layer_type == "Clip")
607         {
608             layerParams.type = "ReLU6";
609             replaceLayerParam(layerParams, "min", "min_value");
610             replaceLayerParam(layerParams, "max", "max_value");
611
612         }
613         else if (layer_type == "LeakyRelu")
614         {
615             layerParams.type = "ReLU";
616             replaceLayerParam(layerParams, "alpha", "negative_slope");
617         }
618         else if (layer_type == "LRN")
619         {
620             replaceLayerParam(layerParams, "size", "local_size");
621         }
622         else if (layer_type == "InstanceNormalization")
623         {
624             if (node_proto.input_size() != 3)
625                 CV_Error(Error::StsNotImplemented,
626                          "Expected input, scale, bias");
627
628             layerParams.blobs.resize(4);
629             layerParams.blobs[2] = getBlob(node_proto, constBlobs, 1);  // weightData
630             layerParams.blobs[3] = getBlob(node_proto, constBlobs, 2);  // biasData
631             layerParams.set("has_bias", true);
632             layerParams.set("has_weight", true);
633
634             // Get number of channels in input
635             int size = layerParams.blobs[2].total();
636             layerParams.blobs[0] = Mat::zeros(size, 1, CV_32F); // mean
637             layerParams.blobs[1] = Mat::ones(size, 1, CV_32F); // std
638
639             LayerParams mvnParams;
640             mvnParams.name = layerParams.name + "/MVN";
641             mvnParams.type = "MVN";
642             mvnParams.set("eps", layerParams.get<float>("epsilon"));
643             layerParams.erase("epsilon");
644
645             //Create MVN layer
646             int id = dstNet.addLayer(mvnParams.name, mvnParams.type, mvnParams);
647             //Connect to input
648             layerId = layer_id.find(node_proto.input(0));
649             CV_Assert(layerId != layer_id.end());
650             dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0);
651             //Add shape
652             layer_id.insert(std::make_pair(mvnParams.name, LayerInfo(id, 0)));
653             outShapes[mvnParams.name] = outShapes[node_proto.input(0)];
654
655             //Replace Batch Norm's input to MVN
656             node_proto.set_input(0, mvnParams.name);
657             layerParams.type = "BatchNorm";
658         }
659         else if (layer_type == "BatchNormalization")
660         {
661             if (node_proto.input_size() != 5)
662                 CV_Error(Error::StsNotImplemented,
663                          "Expected input, scale, bias, mean and var");
664
665             layerParams.type = "BatchNorm";
666             replaceLayerParam(layerParams, "epsilon", "eps");
667             replaceLayerParam(layerParams, "spatial", "use_global_stats");
668
669             Mat meanData = getBlob(node_proto, constBlobs, 3);
670             Mat stdData =  getBlob(node_proto, constBlobs, 4);
671
672             layerParams.blobs.push_back(meanData);
673             layerParams.blobs.push_back(stdData);
674
675             if (!node_proto.input(1).empty()) {
676                 layerParams.set("has_weight", true);
677                 layerParams.blobs.push_back(getBlob(node_proto, constBlobs, 1));  // weightData
678             } else {
679                 layerParams.set("has_weight", false);
680             }
681
682             if (!node_proto.input(2).empty()) {
683                 layerParams.set("has_bias", true);
684                 layerParams.blobs.push_back(getBlob(node_proto, constBlobs, 2)); // biasData
685             } else {
686                 layerParams.set("has_bias", false);
687             }
688         }
689         else if (layer_type == "Gemm")
690         {
691             CV_Assert(node_proto.input_size() >= 2);
692             layerParams.type = "InnerProduct";
693             Mat weights = getBlob(node_proto, constBlobs, 1);
694             int ind_num_out = 0;
695             if (layerParams.has("transB") && !layerParams.get<int>("transB")) {
696                 transpose(weights, weights);
697                 ind_num_out = 1;
698             }
699             layerParams.blobs.push_back(weights);
700
701             if (node_proto.input_size() == 3) {
702                 Mat bias = getBlob(node_proto, constBlobs, 2);
703                 layerParams.blobs.push_back(bias);
704             }
705
706             layerParams.set("num_output", layerParams.blobs[0].size[ind_num_out]);
707             layerParams.set("bias_term", node_proto.input_size() == 3);
708         }
709         else if (layer_type == "MatMul")
710         {
711             CV_Assert(node_proto.input_size() == 2);
712             layerParams.type = "InnerProduct";
713             Mat blob = getBlob(node_proto, constBlobs, 1);
714             layerParams.blobs.push_back(blob.t());
715             layerParams.set("bias_term", false);
716             layerParams.set("num_output", layerParams.blobs[0].size[0]);
717         }
718         else if (layer_type == "Mul")
719         {
720             CV_Assert(node_proto.input_size() == 2);
721             if (layer_id.find(node_proto.input(1)) == layer_id.end()) {
722                 Mat blob = getBlob(node_proto, constBlobs, 1);
723                 blob = blob.reshape(1, 1);
724                 if (blob.total() == 1) {
725                     layerParams.set("scale", blob.at<float>(0));
726                     layerParams.type = "Power";
727                 }
728                 else {
729                     layerParams.blobs.push_back(blob);
730                     layerParams.type = "Scale";
731                 }
732             }
733             else {
734                 layerParams.type = "Eltwise";
735                 layerParams.set("operation", "prod");
736             }
737         }
738         else if (layer_type == "Conv")
739         {
740             CV_Assert(node_proto.input_size() >= 2);
741             layerParams.type = "Convolution";
742             for (int j = 1; j < node_proto.input_size(); j++) {
743                 layerParams.blobs.push_back(getBlob(node_proto, constBlobs, j));
744             }
745             layerParams.set("num_output", layerParams.blobs[0].size[0]);
746             layerParams.set("bias_term", node_proto.input_size() == 3);
747         }
748         else if (layer_type == "ConvTranspose")
749         {
750             CV_Assert(node_proto.input_size() >= 2);
751             layerParams.type = "Deconvolution";
752             for (int j = 1; j < node_proto.input_size(); j++) {
753                 layerParams.blobs.push_back(getBlob(node_proto, constBlobs, j));
754             }
755             layerParams.set("num_output", layerParams.blobs[0].size[1] * layerParams.get<int>("group", 1));
756             layerParams.set("bias_term", node_proto.input_size() == 3);
757
758             if (!layerParams.has("kernel_size"))
759                 CV_Error(Error::StsNotImplemented,
760                          "Required attribute 'kernel_size' is not present.");
761
762             if (layerParams.has("output_shape"))
763             {
764                 const DictValue& outShape = layerParams.get("output_shape");
765                 DictValue strides = layerParams.get("stride");
766                 DictValue kernel = layerParams.get("kernel_size");
767
768                 String padMode;
769                 std::vector<int> adjust_pads;
770                 if (layerParams.has("pad_mode"))
771                 {
772                     padMode = toUpperCase(layerParams.get<String>("pad_mode"));
773                     if (padMode != "SAME" && padMode != "VALID")
774                         CV_Error(Error::StsError, "Unsupported padding mode " + padMode);
775
776                     for (int i = 0; i < strides.size(); i++)
777                     {
778                         int sz = outShape.get<int>(2 + i);
779                         int stride = strides.get<int>(i);
780                         adjust_pads.push_back(padMode == "SAME"? (sz - 1) % stride :
781                                                                  (sz - kernel.get<int>(i)) % stride);
782                     }
783                     layerParams.set("adj", DictValue::arrayInt(&adjust_pads[0], adjust_pads.size()));
784                 }
785             }
786             else if (layerParams.has("output_padding"))
787             {
788                 replaceLayerParam(layerParams, "output_padding", "adj");
789             }
790         }
791         else if (layer_type == "Transpose")
792         {
793             layerParams.type = "Permute";
794             replaceLayerParam(layerParams, "perm", "order");
795
796             CV_Assert(node_proto.input_size() == 1);
797             if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
798             {
799                 std::vector<Mat> inputs(1, getBlob(node_proto, constBlobs, 0)), transposed;
800                 runLayer(layerParams, inputs, transposed);
801                 CV_Assert(transposed.size() == 1);
802                 constBlobs.insert(std::make_pair(layerParams.name, transposed[0]));
803                 continue;
804             }
805         }
806         else if (layer_type == "ReduceL2")
807         {
808             CV_Assert_N(node_proto.input_size() == 1, layerParams.has("axes"));
809             CV_Assert(graph_proto.node_size() > li + 1 && graph_proto.node(li + 1).op_type() == "Div");
810             ++li;
811             node_proto = graph_proto.node(li);
812             layerParams.name = node_proto.output(0);
813             layerParams.type = "Normalize";
814
815             DictValue axes_dict = layerParams.get("axes");
816             if (axes_dict.size() != 1)
817                 CV_Error(Error::StsNotImplemented, "Multidimensional reduceL2");
818             int axis = axes_dict.getIntValue(0);
819             layerParams.set("axis",axis);
820             layerParams.set("end_axis", axis);
821         }
822         else if (layer_type == "Squeeze")
823         {
824             CV_Assert_N(node_proto.input_size() == 1, layerParams.has("axes"));
825             DictValue axes_dict = layerParams.get("axes");
826             if (axes_dict.size() != 1)
827                 CV_Error(Error::StsNotImplemented, "Multidimensional squeeze");
828
829             int axis = axes_dict.getIntValue(0);
830             layerParams.set("axis", axis - 1);
831             layerParams.set("end_axis", axis);
832             layerParams.type = "Flatten";
833         }
834         else if (layer_type == "Unsqueeze")
835         {
836             CV_Assert(node_proto.input_size() == 1);
837             DictValue axes = layerParams.get("axes");
838             if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
839             {
840                 // Constant input.
841                 Mat input = getBlob(node_proto, constBlobs, 0);
842
843                 std::vector<int> dims;
844                 for (int j = 0; j < input.dims; j++) {
845                     dims.push_back(input.size[j]);
846                 }
847                 CV_Assert(axes.getIntValue(axes.size()-1) <= dims.size());
848                 for (int j = 0; j < axes.size(); j++) {
849                     dims.insert(dims.begin() + axes.getIntValue(j), 1);
850                 }
851
852                 Mat out = input.reshape(0, dims);
853                 constBlobs.insert(std::make_pair(layerParams.name, out));
854                 continue;
855             }
856
857             // Variable input.
858             if (axes.size() != 1)
859                 CV_Error(Error::StsNotImplemented, "Multidimensional unsqueeze");
860
861             MatShape inpShape = outShapes[node_proto.input(0)];
862             int axis = axes.getIntValue(0);
863             CV_Assert(0 <= axis && axis <= inpShape.size());
864             std::vector<int> outShape = inpShape;
865             outShape.insert(outShape.begin() + axis, 1);
866             layerParams.type = "Reshape";
867             layerParams.set("dim", DictValue::arrayInt(&outShape[0], outShape.size()));
868         }
869         else if (layer_type == "Reshape")
870         {
871             CV_Assert(node_proto.input_size() == 2 || layerParams.has("shape"));
872
873             if (node_proto.input_size() == 2) {
874                 Mat blob = getBlob(node_proto, constBlobs, 1);
875                 CV_Assert(blob.type() == CV_32SC1);
876
877                 layerParams.set("dim", DictValue::arrayInt<int*>(
878                             blob.ptr<int>(), blob.total() ));
879
880                 if (layer_id.find(node_proto.input(0)) == layer_id.end()) {
881                     std::vector<Mat> inputs(1, getBlob(node_proto, constBlobs, 0)), outputs;
882                     runLayer(layerParams, inputs, outputs);
883                     constBlobs.insert(std::make_pair(layerParams.name, outputs[0]));
884                     continue;
885                 }
886             }
887             else {
888                 DictValue shape = layerParams.get("shape");
889                 std::vector<int> dim;
890                 for (int j = 0; j < shape.size(); j++) {
891                     dim.push_back(shape.getIntValue(j));
892                 }
893
894                 if (layer_id.find(node_proto.input(0)) == layer_id.end()) {
895                     Mat input = getBlob(node_proto, constBlobs, 0);
896                     Mat out = input.reshape(0, dim);
897                     constBlobs.insert(std::make_pair(layerParams.name, out));
898                     continue;
899                 }
900                 replaceLayerParam(layerParams, "shape", "dim");
901             }
902         }
903         else if (layer_type == "Pad")
904         {
905             layerParams.type = "Padding";
906         }
907         else if (layer_type == "Shape")
908         {
909             CV_Assert(node_proto.input_size() == 1);
910             shapeIt = outShapes.find(node_proto.input(0));
911             CV_Assert(shapeIt != outShapes.end());
912             MatShape inpShape = shapeIt->second;
913
914             Mat shapeMat(inpShape.size(), 1, CV_32S);
915             for (int j = 0; j < inpShape.size(); ++j)
916                 shapeMat.at<int>(j) = inpShape[j];
917             shapeMat.dims = 1;
918
919             constBlobs.insert(std::make_pair(layerParams.name, shapeMat));
920             continue;
921         }
922         else if (layer_type == "Gather")
923         {
924             CV_Assert(node_proto.input_size() == 2);
925             CV_Assert(layerParams.has("axis"));
926             Mat input = getBlob(node_proto, constBlobs, 0);
927             Mat indexMat = getBlob(node_proto, constBlobs, 1);
928             CV_Assert_N(indexMat.type() == CV_32S, indexMat.total() == 1);
929             int index = indexMat.at<int>(0);
930             int axis = layerParams.get<int>("axis");
931
932             std::vector<cv::Range> ranges(input.dims, Range::all());
933             ranges[axis] = Range(index, index + 1);
934
935             Mat out = input(ranges);
936             constBlobs.insert(std::make_pair(layerParams.name, out));
937             continue;
938         }
939         else if (layer_type == "Concat")
940         {
941             bool hasVariableInps = false;
942             for (int i = 0; i < node_proto.input_size(); ++i)
943             {
944                 if (layer_id.find(node_proto.input(i)) != layer_id.end())
945                 {
946                     hasVariableInps = true;
947                     break;
948                 }
949             }
950
951             if (!hasVariableInps)
952             {
953                 std::vector<Mat> inputs(node_proto.input_size()), concatenated;
954                 for (size_t i = 0; i < inputs.size(); ++i)
955                 {
956                     inputs[i] = getBlob(node_proto, constBlobs, i);
957                 }
958                 runLayer(layerParams, inputs, concatenated);
959
960                 CV_Assert(concatenated.size() == 1);
961                 constBlobs.insert(std::make_pair(layerParams.name, concatenated[0]));
962                 continue;
963             }
964         }
965         else if (layer_type == "Upsample")
966         {
967             layerParams.type = "Resize";
968             if (layerParams.has("scales"))
969             {
970                 // Pytorch layer
971                 DictValue scales = layerParams.get("scales");
972                 CV_Assert(scales.size() == 4);
973                 layerParams.set("zoom_factor_y", scales.getIntValue(2));
974                 layerParams.set("zoom_factor_x", scales.getIntValue(3));
975             }
976             else
977             {
978                 // Caffe2 layer
979                 replaceLayerParam(layerParams, "height_scale", "zoom_factor_y");
980                 replaceLayerParam(layerParams, "width_scale", "zoom_factor_x");
981             }
982             replaceLayerParam(layerParams, "mode", "interpolation");
983
984             if (layerParams.get<String>("interpolation") == "linear" && framework_name == "pytorch") {
985                 layerParams.type = "Resize";
986                 Mat scales = getBlob(node_proto, constBlobs, 1);
987                 CV_Assert(scales.total() == 4);
988                 layerParams.set("interpolation", "opencv_linear");
989                 layerParams.set("zoom_factor_y", scales.at<float>(2));
990                 layerParams.set("zoom_factor_x", scales.at<float>(3));
991             }
992         }
993         else if (layer_type == "LogSoftmax")
994         {
995             layerParams.type = "Softmax";
996             layerParams.set("log_softmax", true);
997         }
998         else
999         {
1000             for (int j = 0; j < node_proto.input_size(); j++) {
1001                 if (layer_id.find(node_proto.input(j)) == layer_id.end())
1002                     layerParams.blobs.push_back(getBlob(node_proto, constBlobs, j));
1003             }
1004         }
1005
1006         int id = dstNet.addLayer(layerParams.name, layerParams.type, layerParams);
1007         for (int i = 0; i < node_proto.output_size(); ++i)
1008         {
1009             layer_id.insert(std::make_pair(node_proto.output(i), LayerInfo(id, i)));
1010         }
1011
1012         std::vector<MatShape> layerInpShapes, layerOutShapes, layerInternalShapes;
1013         for (int j = 0; j < node_proto.input_size(); j++) {
1014             layerId = layer_id.find(node_proto.input(j));
1015             if (layerId != layer_id.end()) {
1016                 dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, j);
1017                 // Collect input shapes.
1018                 shapeIt = outShapes.find(node_proto.input(j));
1019                 CV_Assert(shapeIt != outShapes.end());
1020                 layerInpShapes.push_back(shapeIt->second);
1021             }
1022         }
1023
1024         // Compute shape of output blob for this layer.
1025         Ptr<Layer> layer = dstNet.getLayer(id);
1026         layer->getMemoryShapes(layerInpShapes, 0, layerOutShapes, layerInternalShapes);
1027         for (int i = 0; i < node_proto.output_size() && i < (int)layerOutShapes.size(); ++i)
1028         {
1029             outShapes[node_proto.output(i)] = layerOutShapes[i];
1030         }
1031     }
1032 }
1033
1034 Net readNetFromONNX(const String& onnxFile)
1035 {
1036     ONNXImporter onnxImporter(onnxFile.c_str());
1037     Net net;
1038     onnxImporter.populateNet(net);
1039     return net;
1040 }
1041
1042 Net readNetFromONNX(const char* buffer, size_t sizeBuffer)
1043 {
1044     ONNXImporter onnxImporter(buffer, sizeBuffer);
1045     Net net;
1046     onnxImporter.populateNet(net);
1047     return net;
1048 }
1049
1050 Net readNetFromONNX(const std::vector<uchar>& buffer)
1051 {
1052     return readNetFromONNX(reinterpret_cast<const char*>(buffer.data()), buffer.size());
1053 }
1054
1055 Mat readTensorFromONNX(const String& path)
1056 {
1057     opencv_onnx::TensorProto tensor_proto = opencv_onnx::TensorProto();
1058     std::fstream input(path.c_str(), std::ios::in | std::ios::binary);
1059     if (!tensor_proto.ParseFromIstream(&input)) {
1060         CV_Error(Error::StsUnsupportedFormat, "Failed to parse data");
1061     }
1062     Mat mat = getMatFromTensor(tensor_proto);
1063     releaseONNXTensor(tensor_proto);
1064     return mat;
1065 }
1066
1067 CV__DNN_EXPERIMENTAL_NS_END
1068 }} // namespace
1069
1070 #endif