[CPU] Generic JIT Eltwise implementation (#1464)
[platform/upstream/dldt.git] / inference-engine / src / mkldnn_plugin / mkldnn_plugin.cpp
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ie_metric_helpers.hpp"
#include "mkldnn_plugin.h"
#include "mkldnn_extension_mngr.h"
#include "mkldnn_weights_cache.hpp"
#include "mkldnn_itt.h"

#include <legacy/net_pass.h>
#include <threading/ie_executor_manager.hpp>
#include <memory>
#include <ie_plugin_config.hpp>
#include <vector>
#include <tuple>
#include <ie_system_conf.h>
#include <generic_ie.hpp>
#include <nodes/list.hpp>
#include <legacy/ie_util_internal.hpp>
#include <legacy/graph_transformer.h>
#include <legacy/ie_ngraph_utils.hpp>

#include <legacy/convert_function_to_cnn_network.hpp>
#include <legacy/transformations/convert_opset1_to_legacy/convert_opset1_to_legacy.hpp>
#include <legacy/transformations/convert_opset1_to_legacy/convert_prior_to_ie_prior.hpp>
#include <legacy/transformations/convert_opset1_to_legacy/reshape_fully_connected.hpp>
#include <legacy/ngraph_ops/fully_connected.hpp>

#include <transformations/opset_conversions/convert_opset3_to_opset2.hpp>
#include <transformations/opset_conversions/convert_opset2_to_opset1.hpp>

#include <transformations/common_optimizations/common_optimizations.hpp>
#include <transformations/common_optimizations/depth_to_space_fusion.hpp>
#include <transformations/op_conversions/convert_depth_to_space.hpp>
#include <transformations/op_conversions/convert_space_to_depth.hpp>
#include <transformations/op_conversions/convert_gelu.hpp>
#include <transformations/op_conversions/hswish_decomposition.hpp>
#include <transformations/op_conversions/hsigmoid_decomposition.hpp>
#include <transformations/op_conversions/reduce_l1_decomposition.hpp>
#include <transformations/op_conversions/reduce_l2_decomposition.hpp>
#include <transformations/op_conversions/convert_pad_to_group_conv.hpp>
#include <transformations/op_conversions/softplus_decomposition.hpp>
#include <transformations/op_conversions/convert_space_to_batch.hpp>
#include <transformations/op_conversions/convert_batch_to_space.hpp>
#include <transformations/op_conversions/convert_mod.hpp>
#include <transformations/convert_precision.hpp>
#include <transformations/init_node_info.hpp>
#include <transformations/rt_info/fused_names_attribute.hpp>

#include <ngraph/opsets/opset2.hpp>
#include <ngraph/opsets/opset3.hpp>
#include <ngraph/opsets/opset4.hpp>
#include <ngraph/op/util/op_types.hpp>
#include <ngraph/pass/manager.hpp>

#include <transformations/common_optimizations/lin_op_sequence_fusion.hpp>

#ifndef USE_CNNNETWORK_LPT
# include <low_precision/transformer.hpp>
# include <low_precision/convolution.hpp>
# include <low_precision/group_convolution.hpp>
# include <low_precision/multiply_to_group_convolution.hpp>
#endif

#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64)
#if defined(_WIN32) || defined(WIN32)
#include <intrin.h>
#include <windows.h>
#else
#include <cpuid.h>
#endif
#endif

using namespace MKLDNNPlugin;
using namespace InferenceEngine;

Engine::Engine() {
    _pluginName = "CPU";
    extensionManager->AddExtension(std::make_shared<Extensions::Cpu::MKLDNNExtensions>());
}

Engine::~Engine() {
    ExecutorManager::getInstance()->clear("CPUStreamsExecutor");
    ExecutorManager::getInstance()->clear("CPUCallbackExecutor");
}

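// Runs the nGraph transformation pipeline on the cloned network: common optimizations,
// opset 3/2 -> opset 1 conversions, precision normalization, optional low-precision (INT8)
// transformations, and finally conversion to the legacy CNNNetwork representation used by
// the MKLDNN executable network.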
static void Transformation(ICNNNetwork::Ptr& clonedNetwork, const Config& conf) {
    OV_ITT_SCOPED_TASK(MKLDNNPlugin::itt::domains::MKLDNNPlugin, "Transformation");

    auto nGraphFunc = clonedNetwork->getFunction();
    // Disable shape inference (WA for generic operations)
    ngraph::op::GenericIE::DisableReshape noReshape(nGraphFunc);

    ngraph::pass::Manager manager;
    manager.register_pass<ngraph::pass::InitNodeInfo>();
    // WA: ConvertPriorBox must be executed before the 1st ConstantFolding pass
    manager.register_pass<ngraph::pass::ConvertPriorBox>();
    manager.register_pass<ngraph::pass::CommonOptimizations>();
    manager.register_pass<ngraph::pass::ConvertOpSet3ToOpSet2>();
    manager.register_pass<ngraph::pass::ConvertOpSet2ToOpSet1>();

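    // Normalize precisions the CPU plugin does not compute natively: wide and unsigned
    // integer types run as I32, FP16 runs as FP32, and BOOL is stored as U8.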
    std::vector<std::pair<ngraph::element::Type, ngraph::element::Type>> convert_precision_list {
            {ngraph::element::i64, ngraph::element::i32},
            {ngraph::element::u64, ngraph::element::i32},
            {ngraph::element::u16, ngraph::element::i32},
            {ngraph::element::u32, ngraph::element::i32},
            {ngraph::element::f16, ngraph::element::f32},
            {ngraph::element::boolean, ngraph::element::u8},
    };

    for (auto & precision : convert_precision_list) {
        manager.register_pass<ngraph::pass::ConvertPrecision>(precision.first, precision.second);
    }

    auto pass_config = manager.get_pass_config();

    using const_node_ptr = const std::shared_ptr<const ngraph::Node>;

    // SpaceToDepth / DepthToSpace node implementations support only input and output tensors of equal rank, with rank <= 5
    pass_config->set_callback<ngraph::pass::ConvertSpaceToDepth,
                              ngraph::pass::ConvertDepthToSpace>(
            [](const_node_ptr &node) -> bool {
                return node->input_value(0).get_shape().size() <= 5lu &&
                       node->input_value(0).get_shape().size() == node->get_output_shape(0).size();
            });

    // Disable FC reshaping for 3D case
    pass_config->set_callback<ngraph::pass::ReshapeFullyConnected>(
            [](const_node_ptr &node) -> bool {
                return node->input_value(0).get_shape().size() == 3ul;
            });

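    // BatchToSpace / SpaceToBatch are kept as-is (not decomposed) for 4D and 5D inputs,
    // which the CPU plugin executes natively.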
    pass_config->set_callback<ngraph::pass::ConvertBatchToSpace,
                              ngraph::pass::ConvertSpaceToBatch>(
            [](const_node_ptr &node) -> bool {
                const auto & rank = node->input(0).get_partial_shape().rank().get_length();
                return rank == 4lu || rank == 5lu;
            });

    // List of enabled/disabled transformations
    pass_config->disable<ngraph::pass::ConvertGELU>();
    pass_config->disable<ngraph::pass::HSwishDecomposition>();
    pass_config->disable<ngraph::pass::ReduceL1Decomposition>();
    pass_config->disable<ngraph::pass::ReduceL2Decomposition>();
    pass_config->disable<ngraph::pass::SoftPlusDecomposition>();
    pass_config->disable<ngraph::pass::HSigmoidDecomposition>();
    pass_config->disable<ngraph::pass::ConvertMod>();

    pass_config->enable<ngraph::pass::ConvertPadToGroupConvolution>();

    manager.run_passes(nGraphFunc);

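    // Optional low-precision (INT8) transformations on the nGraph function,
    // applied only when LPT is enabled in the plugin configuration.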
#ifndef USE_CNNNETWORK_LPT
    using namespace ngraph::pass::low_precision;
    if (conf.lpTransformsMode == Config::LPTransformsMode::On) {
        auto params = LayerTransformation::Params(
            true,  // updatePrecisions
            LayerTransformation::QuantizedTensorAlignment::UpdateLevel,  // quantizedTensorAlignmentOnActivations
            LayerTransformation::QuantizedTensorAlignment::None,  // quantizedTensorAlignmentOnWeights
            true);  // supportAsymmetricQuantization
        LowPrecisionTransformer transformer(LowPrecisionTransformer::getAllTransformations(params)
            .add<ConvolutionTransformation, ngraph::opset1::Convolution>(
                LayerTransformation::Params(params).setPrecisionsOnActivations({ngraph::element::u8}).setSupportAsymmetricQuantization(true))
            .add<GroupConvolutionTransformation, ngraph::opset1::GroupConvolution>(
                LayerTransformation::Params(params).setPrecisionsOnActivations({ ngraph::element::u8 }).setSupportAsymmetricQuantization(true))
            .addStandaloneCleanup<MultiplyToGroupConvolutionTransformation, ngraph::opset1::Multiply>(
                LayerTransformation::Params(params).setPrecisionsOnActivations({ ngraph::element::u8 })));

        transformer.transform(nGraphFunc);
    }
#endif

    ngraph::pass::Manager legacyManager;
    legacyManager.register_pass<ngraph::pass::ConvertOpSet1ToLegacy>();
    legacyManager.register_pass<ngraph::pass::ConvertPrecision>(ngraph::element::i64, ngraph::element::i32);

    auto legacyPassConfig = legacyManager.get_pass_config();
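    // Identifies a dequantization Multiply whose Add input follows a Convolution / GroupConvolution / MatMul;
    // for this pattern AddMultiplyFusion is skipped so the bias Add stays adjacent to the convolution.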
    legacyPassConfig->set_callback<ngraph::pass::AddMultiplyFusion>([](const_node_ptr &node) -> bool {
        if (auto mul_op = std::dynamic_pointer_cast<const ngraph::opset1::Multiply>(node)) {
            auto add_op = std::dynamic_pointer_cast<const ngraph::opset1::Add>(mul_op->get_input_node_shared_ptr(0));
            auto constant = std::dynamic_pointer_cast<const ngraph::opset1::Constant>(mul_op->get_input_node_shared_ptr(1));
            bool is_dequantization = mul_op->get_rt_info().count("DEQUANTIZATION") != 0;
            if (add_op && constant && is_dequantization) {
                return ngraph::is_type<ngraph::opset1::Convolution>(add_op->get_input_node_shared_ptr(0)) ||
                    ngraph::is_type<ngraph::opset1::GroupConvolution>(add_op->get_input_node_shared_ptr(0)) ||
                    ngraph::is_type<ngraph::opset1::MatMul>(add_op->get_input_node_shared_ptr(0));
            }
        }
        return false;
    });

    legacyManager.run_passes(nGraphFunc);

    clonedNetwork = InferenceEngine::details::convertFunctionToICNNNetwork(nGraphFunc, *clonedNetwork);

    // WA: after conversion to CNNNetwork, user-specified precisions can redefine input/output precisions,
    // so we need to apply an additional precision conversion, but only for inputs and outputs
    for (auto & precision : convert_precision_list) {
        NetPass::ConvertIOPrecision(*clonedNetwork, convertPrecision(precision.first), convertPrecision(precision.second));
    }
}

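// Creates an executable network for the CPU device: validates input precisions, applies the
// transformation pipeline to a cloned copy of the network, and builds the MKLDNN executable network.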
InferenceEngine::ExecutableNetworkInternal::Ptr
Engine::LoadExeNetworkImpl(const InferenceEngine::ICNNNetwork &network, const std::map<std::string, std::string> &config) {
    OV_ITT_SCOPED_TASK(itt::domains::MKLDNNPlugin, "Engine::LoadExeNetworkImpl");

    // verification of supported input
    InferenceEngine::InputsDataMap _networkInputs;
    network.getInputsInfo(_networkInputs);
    for (const auto &ii : _networkInputs) {
        auto input_precision = ii.second->getPrecision();
        if (input_precision != InferenceEngine::Precision::FP32 &&
            input_precision != InferenceEngine::Precision::I32 &&
            input_precision != InferenceEngine::Precision::U16 &&
            input_precision != InferenceEngine::Precision::I16 &&
            input_precision != InferenceEngine::Precision::I8 &&
            input_precision != InferenceEngine::Precision::U8 &&
            input_precision != InferenceEngine::Precision::BOOL) {
            THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str
                               << "Input image format " << input_precision << " is not supported yet...";
        }
    }

    // TODO: handle input precision differently - per input and not one per network...

    // TODO: Clarify the behavior of SetConfig method. Skip eng_config or not?
    Config conf = engConfig;
    conf.readProperties(config);

    if (conf.enableDynamicBatch) {
        conf.batchLimit = static_cast<int>(network.getBatchSize());
    }

    std::shared_ptr<ICNNNetwork> clonedNetwork = cloneNetwork(network);
    bool is_transformed = false;
    if (clonedNetwork->getFunction()) {
        Transformation(clonedNetwork, conf);
        is_transformed = true;
    }
    auto implNetwork = std::dynamic_pointer_cast<details::CNNNetworkImpl>(clonedNetwork);
    if (implNetwork) {
        // valid for CNNNetworkImpl only, while there's no API in ICNNNetwork to change network
        ConstTransformer transformator(implNetwork.get());
        transformator.fullTrim();
        if (!is_transformed) {
            NetPass::ConvertPrecision(*implNetwork, Precision::I64, Precision::I32);
            NetPass::ConvertPrecision(*implNetwork, Precision::U64, Precision::I32);
            NetPass::ConvertPrecision(*implNetwork, Precision::U32, Precision::I32);
            NetPass::ConvertPrecision(*implNetwork, Precision::FP16, Precision::FP32);
            NetPass::ConvertPrecision(*implNetwork, Precision::BOOL, Precision::U8);
            NetPass::ConvertPrecision(*implNetwork, Precision::U16, Precision::I32);
        }
    }

    return std::make_shared<MKLDNNExecNetwork>(*clonedNetwork, conf, extensionManager, weightsSharing);
}

void Engine::SetConfig(const std::map<std::string, std::string> &config) {
    // accumulate config parameters on engine level
    engConfig.readProperties(config);
}

Parameter Engine::GetConfig(const std::string& name, const std::map<std::string, Parameter>& /*options*/) const {
    Parameter result;
    auto option = engConfig._config.find(name);
    if (option != engConfig._config.end()) {
        result = option->second;
    } else {
        THROW_IE_EXCEPTION << "Unsupported config key " << name;
    }
    return result;
}

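// Queries CPUID leaf 7 (structured extended feature flags); EBX bit 16 reports AVX-512 Foundation support.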
static bool hasAVX512() {
#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64)
    unsigned int regs[4] = {7, 0, 0, 0};
#if defined(_WIN32) || defined(WIN32)
    __cpuid(reinterpret_cast<int*>(regs), regs[0]);
#else
    __cpuid_count(regs[0], regs[1], regs[0], regs[1], regs[2], regs[3]);
#endif
    if (regs[1] & (1U << 16))
        return true;
#endif
    return false;
}

Parameter Engine::GetMetric(const std::string& name, const std::map<std::string, Parameter>& /*options*/) const {
    if (name == METRIC_KEY(SUPPORTED_METRICS)) {
        std::vector<std::string> metrics;
        metrics.push_back(METRIC_KEY(AVAILABLE_DEVICES));
        metrics.push_back(METRIC_KEY(SUPPORTED_METRICS));
        metrics.push_back(METRIC_KEY(FULL_DEVICE_NAME));
        metrics.push_back(METRIC_KEY(OPTIMIZATION_CAPABILITIES));
        metrics.push_back(METRIC_KEY(SUPPORTED_CONFIG_KEYS));
        metrics.push_back(METRIC_KEY(RANGE_FOR_ASYNC_INFER_REQUESTS));
        metrics.push_back(METRIC_KEY(RANGE_FOR_STREAMS));
        IE_SET_METRIC_RETURN(SUPPORTED_METRICS, metrics);
    } else if (name == METRIC_KEY(FULL_DEVICE_NAME)) {
        std::string brand_string;
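        // The processor brand string is assembled from CPUID leaves 0x80000002-0x80000004 (16 bytes each).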
#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64)
        unsigned int addr_list[3] = { 0x80000002, 0x80000003, 0x80000004 };
        unsigned int regs[4];
        for (auto addr : addr_list) {
            regs[0] = addr;
#if defined(_WIN32) || defined(WIN32)
            __cpuid(reinterpret_cast<int*>(regs), regs[0]);
#else
            __get_cpuid(regs[0], &regs[0], &regs[1], &regs[2], &regs[3]);
#endif
            char *ch = reinterpret_cast<char*>(&regs[0]);
            for (size_t j = 0; j < sizeof(regs); j++)
                brand_string += ch[j];
        }
#else
        brand_string = "Non Intel Architecture";
#endif
        IE_SET_METRIC_RETURN(FULL_DEVICE_NAME, brand_string);
    } else if (name == METRIC_KEY(AVAILABLE_DEVICES)) {
        std::vector<std::string> availableDevices = { "" };
        IE_SET_METRIC_RETURN(AVAILABLE_DEVICES, availableDevices);
    } else if (name == METRIC_KEY(OPTIMIZATION_CAPABILITIES)) {
        std::vector<std::string> capabilities;
        if (with_cpu_x86_bfloat16())
            capabilities.push_back(METRIC_VALUE(BF16));
        if (hasAVX512())
            capabilities.push_back(METRIC_VALUE(WINOGRAD));
        capabilities.push_back(METRIC_VALUE(FP32));
        capabilities.push_back(METRIC_VALUE(FP16));
        capabilities.push_back(METRIC_VALUE(INT8));
        capabilities.push_back(METRIC_VALUE(BIN));
        IE_SET_METRIC_RETURN(OPTIMIZATION_CAPABILITIES, capabilities);
    } else if (name == METRIC_KEY(SUPPORTED_CONFIG_KEYS)) {
        std::vector<std::string> configKeys;
        for (auto && opt : engConfig._config)
            configKeys.push_back(opt.first);
        IE_SET_METRIC_RETURN(SUPPORTED_CONFIG_KEYS, configKeys);
    } else if (name == METRIC_KEY(RANGE_FOR_ASYNC_INFER_REQUESTS)) {
        std::tuple<unsigned int, unsigned int, unsigned int> range = std::make_tuple(1, 1, 1);
        IE_SET_METRIC_RETURN(RANGE_FOR_ASYNC_INFER_REQUESTS, range);
    } else if (name == METRIC_KEY(RANGE_FOR_STREAMS)) {
        std::tuple<unsigned int, unsigned int> range = std::make_tuple(1, parallel_get_max_threads());
        IE_SET_METRIC_RETURN(RANGE_FOR_STREAMS, range);
    } else {
        THROW_IE_EXCEPTION << "Unsupported metric key " << name;
    }
}

void Engine::AddExtension(InferenceEngine::IExtensionPtr extension) {
    extensionManager->AddExtension(extension);
}

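// Reports which operations the CPU plugin can execute. For nGraph functions, a cloned and
// transformed copy of the network is probed layer by layer: a layer is supported if the
// corresponding MKLDNN node can be constructed, and results are mapped back to the original
// operations via their fused-names run-time info.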
QueryNetworkResult Engine::QueryNetwork(const ICNNNetwork& network, const std::map<std::string, std::string>& config) const {
    QueryNetworkResult res;
    MKLDNNWeightsSharing::Ptr fake_w_cache;
    auto function = network.getFunction();
    if (function != nullptr) {
        std::unordered_set<std::string> originalOps;
        for (auto&& node : function->get_ops()) {
            originalOps.emplace(node->get_friendly_name());
        }

        // TODO: Clarify the behavior of SetConfig method. Skip eng_config or not?
        Config conf = engConfig;
        conf.readProperties(config);

        if (conf.enableDynamicBatch) {
            conf.batchLimit = static_cast<int>(network.getBatchSize());
        }

        auto clonedNetwork = cloneNetwork(network);
        Transformation(clonedNetwork, conf);
        std::unordered_set<std::string> supported;
        std::unordered_set<std::string> unsupported;
        for (details::CNNNetworkIterator itLayer{clonedNetwork.get()}; itLayer != details::CNNNetworkIterator(); itLayer++) {
            auto layerIsSupported = [&] {
                std::unique_ptr<MKLDNNNode> ptr;
                try {
                    ptr.reset(MKLDNNNode::factory().create(*itLayer, {mkldnn::engine::kind::cpu, 0}, extensionManager, fake_w_cache));
                } catch (InferenceEngine::details::InferenceEngineException&) {
                    return false;
                }
                return true;
            } ();
            for (auto&& fusedLayerName : ngraph::getFusedNamesVector((*itLayer)->getNode())) {
                if (contains(originalOps, fusedLayerName)) {
                    if (layerIsSupported) {
                        supported.emplace(fusedLayerName);
                    } else {
                        unsupported.emplace(fusedLayerName);
                    }
                }
            }
        }

        for (auto&& node : function->get_ops()) {
            if (!contains(unsupported, node->get_friendly_name())) {
                for (auto&& inputNodeOutput : node->input_values()) {
                    if (ngraph::op::is_constant(inputNodeOutput.get_node())) {
                        supported.emplace(inputNodeOutput.get_node()->get_friendly_name());
                    }
                }
                for (auto&& outputs : node->outputs()) {
                    for (auto&& outputNodeInput : outputs.get_target_inputs()) {
                        if (ngraph::op::is_output(outputNodeInput.get_node())) {
                            supported.emplace(outputNodeInput.get_node()->get_friendly_name());
                        }
                    }
                }
            }
        }

        for (auto&& layerName : supported) {
            if (!contains(unsupported, layerName)) {
                res.supportedLayersMap.emplace(layerName, GetName());
            }
        }
    } else {
        details::CNNNetworkIterator i(&network);
        while (i != details::CNNNetworkIterator()) {
            try {
                mkldnn::engine eng(mkldnn::engine(mkldnn::engine::kind::cpu, 0));
                // if we can create the node and no exception was thrown, then the layer is supported
                std::unique_ptr <MKLDNNNode>(MKLDNNNode::factory().create(*i, eng, extensionManager, fake_w_cache));
                res.supportedLayersMap.insert({ (*i)->name, GetName() });
            } catch (InferenceEngine::details::InferenceEngineException&) {
            }
            i++;
        }
    }

    return res;
}

static const Version version = {{2, 1}, CI_BUILD_NUMBER, "MKLDNNPlugin"};
IE_DEFINE_PLUGIN_CREATE_FUNCTION(Engine, version)