Publishing 2019 R1.1 content and Myriad plugin sources (#162)
author Alexey Suhov <asuhov@users.noreply.github.com>
Mon, 27 May 2019 18:18:32 +0000 (21:18 +0300)
committer openvino-pushbot <44090433+openvino-pushbot@users.noreply.github.com>
Mon, 27 May 2019 18:18:32 +0000 (21:18 +0300)
* Publishing 2019 R1.1 content and Myriad plugin sources

573 files changed:
README.md
get-started-linux.md [new file with mode: 0644]
inference-engine/CMakeLists.txt
inference-engine/README.md
inference-engine/cmake/check_features.cmake
inference-engine/cmake/dependencies.cmake
inference-engine/cmake/features.cmake
inference-engine/ie_bridges/python/sample/benchmark_app/README.md
inference-engine/ie_bridges/python/sample/classification_sample/README.md
inference-engine/ie_bridges/python/sample/classification_sample_async/README.md
inference-engine/ie_bridges/python/sample/style_transfer_sample/README.md
inference-engine/include/builders/ie_concat_layer.hpp
inference-engine/include/builders/ie_eltwise_layer.hpp
inference-engine/include/builders/ie_lrn_layer.hpp
inference-engine/include/builders/ie_pooling_layer.hpp
inference-engine/include/cpp/ie_plugin_cpp.hpp
inference-engine/include/details/ie_inetwork_iterator.hpp
inference-engine/include/details/os/win_shared_object_loader.h
inference-engine/include/ie_blob.h
inference-engine/include/ie_device.hpp
inference-engine/include/ie_layers.h
inference-engine/include/ie_parallel.hpp
inference-engine/include/ie_precision.hpp
inference-engine/include/vpu/vpu_plugin_config.hpp
inference-engine/install_dependencies.sh
inference-engine/samples/CMakeLists.txt
inference-engine/samples/benchmark_app/README.md
inference-engine/samples/benchmark_app/main.cpp
inference-engine/samples/benchmark_app/statistics_report.hpp
inference-engine/samples/calibration_tool/README.md
inference-engine/samples/calibration_tool/calibrator_processors.cpp
inference-engine/samples/calibration_tool/calibrator_processors.h
inference-engine/samples/calibration_tool/main.cpp
inference-engine/samples/classification_sample/README.md
inference-engine/samples/classification_sample_async/README.md
inference-engine/samples/hello_autoresize_classification/README.md
inference-engine/samples/hello_request_classification/README.md
inference-engine/samples/hello_shape_infer_ssd/README.md
inference-engine/samples/lenet_network_graph_builder/README.md
inference-engine/samples/object_detection_sample_ssd/README.md
inference-engine/samples/perfcheck/main.cpp
inference-engine/samples/sample_data/car.png [new file with mode: 0644]
inference-engine/samples/sample_data/squeezenet1.1.labels [new file with mode: 0644]
inference-engine/samples/speech_sample/main.cpp
inference-engine/samples/speech_sample/speech_sample.hpp
inference-engine/samples/style_transfer_sample/README.md
inference-engine/samples/thirdparty/gflags/doc/.nojekyll [new file with mode: 0644]
inference-engine/samples/thirdparty/gflags/doc/designstyle.css [new file with mode: 0644]
inference-engine/samples/thirdparty/gflags/doc/index.html [new file with mode: 0644]
inference-engine/samples/validation_app/README.md
inference-engine/samples/validation_app/image_decoder.cpp
inference-engine/samples/validation_app/main.cpp
inference-engine/src/CMakeLists.txt
inference-engine/src/cldnn_engine/cldnn_graph.cpp
inference-engine/src/cldnn_engine/cldnn_graph.h
inference-engine/src/cldnn_engine/cldnn_infer_request.cpp
inference-engine/src/cldnn_engine/cldnn_infer_request.h
inference-engine/src/extension/ext_depth_to_space.cpp
inference-engine/src/extension/ext_detectionoutput_onnx.cpp
inference-engine/src/extension/ext_expand.cpp
inference-engine/src/extension/ext_fill.cpp
inference-engine/src/extension/ext_pad.cpp
inference-engine/src/extension/ext_priorgridgenerator_onnx.cpp
inference-engine/src/extension/ext_proposal.cpp
inference-engine/src/extension/ext_proposal_onnx.cpp
inference-engine/src/extension/ext_range.cpp
inference-engine/src/extension/ext_reverse_sequence.cpp
inference-engine/src/extension/ext_roifeatureextractor_onnx.cpp
inference-engine/src/extension/ext_shuffle_channels.cpp
inference-engine/src/extension/ext_space_to_depth.cpp
inference-engine/src/extension/ext_squeeze.cpp
inference-engine/src/extension/ext_strided_slice.cpp
inference-engine/src/extension/ext_topkrois_onnx.cpp
inference-engine/src/extension/ext_unsqueeze.cpp
inference-engine/src/gna_plugin/dnn.cpp
inference-engine/src/gna_plugin/dnn.h
inference-engine/src/gna_plugin/dnn_memory.cpp
inference-engine/src/gna_plugin/dnn_memory.hpp
inference-engine/src/gna_plugin/dnn_traits.hpp
inference-engine/src/gna_plugin/floatmath.cpp
inference-engine/src/gna_plugin/gna_api_wrapper.hpp
inference-engine/src/gna_plugin/gna_device.cpp
inference-engine/src/gna_plugin/gna_helper.cpp
inference-engine/src/gna_plugin/gna_layer_info.hpp
inference-engine/src/gna_plugin/gna_mem_requests.hpp
inference-engine/src/gna_plugin/gna_model_serial.hpp
inference-engine/src/gna_plugin/gna_plugin.cpp
inference-engine/src/gna_plugin/gna_plugin.hpp
inference-engine/src/gna_plugin/gna_plugin_passes.cpp
inference-engine/src/gna_plugin/lstm.cpp
inference-engine/src/gna_plugin/pwl_design.cpp
inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp
inference-engine/src/gna_plugin/quantization/model_quantizer.hpp
inference-engine/src/gna_plugin/quantization/quantization.cpp
inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp
inference-engine/src/gna_plugin/util.cpp
inference-engine/src/hetero_plugin/fallback_policy.cpp
inference-engine/src/hetero_plugin/fallback_policy.h
inference-engine/src/hetero_plugin/hetero_async_infer_request.cpp
inference-engine/src/hetero_plugin/hetero_async_infer_request.h
inference-engine/src/hetero_plugin/hetero_device_loader.cpp
inference-engine/src/hetero_plugin/hetero_device_loader.h
inference-engine/src/hetero_plugin/hetero_executable_network.cpp
inference-engine/src/hetero_plugin/hetero_executable_network.h
inference-engine/src/hetero_plugin/hetero_infer_request.cpp
inference-engine/src/hetero_plugin/hetero_infer_request.h
inference-engine/src/hetero_plugin/hetero_plugin.cpp
inference-engine/src/hetero_plugin/hetero_plugin.h
inference-engine/src/hetero_plugin/hetero_plugin_base.hpp
inference-engine/src/inference_engine/builders/ie_detection_output_layer.cpp
inference-engine/src/inference_engine/builders/ie_eltwise_layer.cpp
inference-engine/src/inference_engine/builders/ie_lrn_layer.cpp
inference-engine/src/inference_engine/builders/ie_network_builder.cpp
inference-engine/src/inference_engine/cnn_network_impl.cpp
inference-engine/src/inference_engine/cnn_network_int8_normalizer.cpp
inference-engine/src/inference_engine/cpp_interfaces/ie_task.cpp
inference-engine/src/inference_engine/cpp_interfaces/ie_task.hpp
inference-engine/src/inference_engine/cpp_interfaces/ie_task_with_stages.cpp
inference-engine/src/inference_engine/cpp_interfaces/ie_task_with_stages.hpp
inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_internal.hpp
inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_thread_safe_default.hpp
inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_request_internal.hpp
inference-engine/src/inference_engine/dll_main.hpp
inference-engine/src/inference_engine/exec_graph_info.hpp
inference-engine/src/inference_engine/ie_cnn_net_reader_impl.cpp
inference-engine/src/inference_engine/ie_context.cpp
inference-engine/src/inference_engine/ie_device.cpp
inference-engine/src/inference_engine/ie_layer_validators.cpp
inference-engine/src/inference_engine/ie_preprocess_gapi_kernels.cpp
inference-engine/src/inference_engine/ie_util_internal.cpp
inference-engine/src/inference_engine/ie_utils.cpp
inference-engine/src/inference_engine/net_pass.cpp
inference-engine/src/inference_engine/network_serializer.cpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_bin_conv_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_depth_to_space_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_expand_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_fill_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_quantize_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_range_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_reverse_sequence_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_shuffle_channels_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_space_to_depth_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_squeeze_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_strided_slice_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_tensor_iterator_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_unsqueeze_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_add_const_infer.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_concat_const_infer.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_const_infer.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_holder.cpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_holder.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_impl.cpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_impl.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_div_const_infer.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_fill_const_infer.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_gather_const_infer.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_in_place_const_infer.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_mul_const_infer.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_power_const_infer.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_range_const_infer.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_reshape_const_infer.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_shape_const_infer.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_split_const_infer.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_strided_slice_const_infer.hpp
inference-engine/src/inference_engine/shape_infer/const_infer/ie_tile_const_infer.hpp
inference-engine/src/inference_engine/transform/transform_network.cpp
inference-engine/src/inference_engine/transform/transform_network.hpp
inference-engine/src/inference_engine/transform/transformation.cpp
inference-engine/src/inference_engine/transform/transformation.hpp
inference-engine/src/inference_engine/transform/transformations/eltwise_broadcast.cpp
inference-engine/src/inference_engine/transform/transformations/eltwise_broadcast.hpp
inference-engine/src/inference_engine/transform/transformations/lrn.cpp
inference-engine/src/inference_engine/transform/transformations/lrn.hpp
inference-engine/src/inference_engine/transform/transformations/sub.cpp
inference-engine/src/inference_engine/transform/transformations/sub.hpp
inference-engine/src/inference_engine/xml_parse_utils.cpp
inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp
inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h
inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp
inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h
inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp
inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp
inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp
inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h
inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp
inference-engine/src/mkldnn_plugin/utils/blob_dump.h
inference-engine/src/vpu/CMakeLists.txt [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/CMakeLists.txt [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/allocator.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/allocator/structs.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/allocator_shaves.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/backend/backend.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/backend/blob_format.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/backend/blob_serializer.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/blob_reader.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/compile_env.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/custom_layer.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/frontend/frontend.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/frontend/parse_network.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/frontend/stage_builder.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/graph_transformer.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/hw/mx_stage.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/hw/tiling.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/hw/utility.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/base.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_desc.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/edges.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/model.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/stage.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/network_config.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/parsed_config.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/pass_manager.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/private_plugin_config.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/stub_stage.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/sw/post_op_stage.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/sw/utility.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/any.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/attributes_map.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/auto_scope.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/checked_cast.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/containers.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/dot_io.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/enums.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/extra.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/file_system.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/func_ref.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/handle.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/ie_helpers.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/io.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/logger.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/numeric.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/optional.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/perf_report.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/range.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/simple_math.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/utils/string.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/allocator.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/allocator_shaves.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/backend/backend.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/backend/dump_to_dot.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/backend/get_meta_data.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/backend/serialize.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/blob_reader.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/custom_layer.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/frontend/detect_network_batch.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/frontend/frontend.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/frontend/in_out_convert.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/frontend/parse_data.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/frontend/parse_network.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/frontend/pre_process.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/frontend/remove_const_layers.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/graph_transformer.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/hw/mx_stage.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/hw/tiling.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/hw/utility.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_desc.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/model.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/stage.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/network_config.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/parsed_config.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/pass_manager.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/adjust_data_batch.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/adjust_data_layout.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/adjust_data_location.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/allocate_resources.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/eliminate_copy.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/final_check.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/finalize_hw_ops.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/find_subgraphs.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/hw_conv_tiling.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/hw_fc_tiling.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/hw_padding.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/hw_pooling_tiling.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/inject_sw.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/merge_hw_stages.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/merge_relu_and_bias.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/process_special_stages.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/propagate_data_scale.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/replace_deconv_by_conv.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/replace_fc_by_conv.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/split_grouped_conv.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/split_hw_conv_and_pool.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/split_hw_depth_convolution.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/sw_conv_adaptation.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/sw_deconv_adaptation.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/sw_fc_adaptation.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/sw_pooling_adaptation.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/swap_concat_and_hw_ops.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/passes/weights_analysis.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/argmax.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/batch_norm.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/bias.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/clamp.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/concat.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/convolution.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/copy.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/crop.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/ctc_decoder.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/custom.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/deconvolution.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/detection_output.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/eltwise.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/elu.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/expand.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/fc.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/grn.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/interp.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/mtcnn.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/mvn.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/none.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/norm.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/normalize.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/pad.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/permute.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/pooling.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/power.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/prelu.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/priorbox.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/priorbox_clustered.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/proposal.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/psroipooling.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/region_yolo.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/relu.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/reorg_yolo.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/resample.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/reshape.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/rnn.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/roipooling.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/scale.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/shrink.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/sigmoid.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/softmax.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/split.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/tanh.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/tile.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stub_stage.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/sw/post_op_stage.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/sw/utility.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/utils/dot_io.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/utils/enums.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/utils/file_system.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/utils/ie_helpers.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/utils/io.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/utils/logger.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/utils/perf_report.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/utils/simple_math.cpp [new file with mode: 0644]
inference-engine/src/vpu/myriad_plugin/CMakeLists.txt [new file with mode: 0644]
inference-engine/src/vpu/myriad_plugin/myriad_async_infer_request.cpp [new file with mode: 0644]
inference-engine/src/vpu/myriad_plugin/myriad_async_infer_request.h [new file with mode: 0644]
inference-engine/src/vpu/myriad_plugin/myriad_config.cpp [new file with mode: 0644]
inference-engine/src/vpu/myriad_plugin/myriad_config.h [new file with mode: 0644]
inference-engine/src/vpu/myriad_plugin/myriad_executable_network.cpp [new file with mode: 0644]
inference-engine/src/vpu/myriad_plugin/myriad_executable_network.h [new file with mode: 0644]
inference-engine/src/vpu/myriad_plugin/myriad_executor.cpp [new file with mode: 0644]
inference-engine/src/vpu/myriad_plugin/myriad_executor.h [new file with mode: 0644]
inference-engine/src/vpu/myriad_plugin/myriad_infer_request.cpp [new file with mode: 0644]
inference-engine/src/vpu/myriad_plugin/myriad_infer_request.h [new file with mode: 0644]
inference-engine/src/vpu/myriad_plugin/myriad_plugin.cpp [new file with mode: 0644]
inference-engine/src/vpu/myriad_plugin/myriad_plugin.h [new file with mode: 0644]
inference-engine/tests/helpers/CMakeLists.txt
inference-engine/tests/helpers/ir_gen_helper.cpp
inference-engine/tests/helpers/test_model_repo.hpp.in
inference-engine/tests/mock_engine/dllmain.cpp
inference-engine/tests/mock_engine/stub_inference_engine.xpp
inference-engine/tests/unit/CMakeLists.txt
inference-engine/tests/unit/builders/argmax_layer_test.cpp
inference-engine/tests/unit/builders/clamp_layer_test.cpp
inference-engine/tests/unit/builders/concat_layer_test.cpp
inference-engine/tests/unit/builders/const_layer_test.cpp
inference-engine/tests/unit/builders/crop_layer_test.cpp
inference-engine/tests/unit/builders/ctc_greedy_decoder_layer_test.cpp
inference-engine/tests/unit/builders/detection_output_layer_test.cpp
inference-engine/tests/unit/builders/eltwise_layer_test.cpp
inference-engine/tests/unit/builders/elu_layer_test.cpp
inference-engine/tests/unit/builders/mvn_layer_test.cpp
inference-engine/tests/unit/builders/norm_layer_test.cpp
inference-engine/tests/unit/builders/normalize_layer_test.cpp
inference-engine/tests/unit/builders/output_layer_test.cpp
inference-engine/tests/unit/builders/relu6_layer_test.cpp
inference-engine/tests/unit/builders/relu_layer_test.cpp
inference-engine/tests/unit/builders/tanh_layer_test.cpp
inference-engine/tests/unit/builders/transform_network_test.cpp
inference-engine/tests/unit/cnn_network/cnn_layer_validation_tests.cpp
inference-engine/tests/unit/cnn_network/layer_builder.h
inference-engine/tests/unit/cnn_network/parameters.h
inference-engine/tests/unit/cnn_network/shapes.h
inference-engine/tests/unit/engines/gna/I8_quantisation_test.cpp
inference-engine/tests/unit/engines/gna/configuration_test.cpp
inference-engine/tests/unit/engines/gna/fp32_non_quantized_tests.cpp
inference-engine/tests/unit/engines/gna/gna_allocator_test.cpp
inference-engine/tests/unit/engines/gna/gna_aminteldnn_test.cpp
inference-engine/tests/unit/engines/gna/gna_api_stub.cpp
inference-engine/tests/unit/engines/gna/gna_cppwraper_test.cpp
inference-engine/tests/unit/engines/gna/gna_graph_aot_test.cpp
inference-engine/tests/unit/engines/gna/gna_hardware_precision_test.cpp
inference-engine/tests/unit/engines/gna/gna_input_precision_test.cpp
inference-engine/tests/unit/engines/gna/gna_matcher.cpp
inference-engine/tests/unit/engines/gna/gna_matcher.hpp
inference-engine/tests/unit/engines/gna/gna_memory_test.cpp
inference-engine/tests/unit/engines/gna/gna_mock_api.hpp
inference-engine/tests/unit/engines/gna/gna_proc_type_test.cpp
inference-engine/tests/unit/engines/gna/gna_pwl_test.cpp
inference-engine/tests/unit/engines/gna/gna_query_state_tests.cpp
inference-engine/tests/unit/engines/gna/i16_quantisation_test.cpp
inference-engine/tests/unit/engines/gna/matchers/conv_matcher.hpp
inference-engine/tests/unit/engines/gna/matchers/copy_matcher.hpp
inference-engine/tests/unit/engines/gna/matchers/diag_matcher.hpp
inference-engine/tests/unit/engines/gna/matchers/fill_with_data.hpp
inference-engine/tests/unit/engines/gna/matchers/input_data_matcher.hpp
inference-engine/tests/unit/engines/gna/matchers/nnet_base_matcher.hpp
inference-engine/tests/unit/engines/gna/matchers/pool_matcher.hpp
inference-engine/tests/unit/engines/gna/matchers/precision_matcher.hpp
inference-engine/tests/unit/engines/gna/matchers/pwl_matcher.hpp
inference-engine/tests/unit/engines/gna/matchers/pwl_quantization_metrics_matcher.hpp
inference-engine/tests/unit/engines/gna/matchers/weights_matcher.hpp
inference-engine/tests/unit/engines/gna/test_irs.cpp
inference-engine/tests/unit/engines/gna/test_irs.hpp
inference-engine/tests/unit/engines/mkldnn/dump_test.cpp
inference-engine/tests/unit/engines/mkldnn/dumper_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/depth_to_space_tests.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/expand_tests.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/fill_tests.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/range_tests.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/reverse_sequence_tests.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/shuffle_channels_tests.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/squeeze_tests.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/strided_slice_tests.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/unsqueeze_tests.cpp
inference-engine/tests/unit/engines/vpu/adjust_data_location_tests.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/vpu/containers_tests.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/vpu/eliminate_copy_tests.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/vpu/find_subgraphs_tests.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/vpu/graph_transformer_tests.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/vpu/graph_transformer_tests.hpp [new file with mode: 0644]
inference-engine/tests/unit/engines/vpu/graph_transformer_tests_constructs.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/vpu/mvnc/watchdog_tests.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/vpu/range_tests.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/vpu/replace_deconv_by_conv_tests.cpp [new file with mode: 0644]
inference-engine/tests/unit/inference_engine_tests/device_tests.cpp
inference-engine/tests/unit/inference_engine_tests/normalization/latest_in_fuse_test.cpp
inference-engine/tests/unit/inference_engine_tests/parameter_tests.cpp
inference-engine/tests/unit/inference_engine_tests/plugin_dispatcher_tests.cpp
inference-engine/tests/unit/opencv_test_gapi/CMakeLists.txt
inference-engine/tests/unit/opencv_test_gapi/fluid_test_computations/CMakeLists.txt
inference-engine/tests/unit/opencv_test_gapi/fluid_test_computations/fluid_test_computations.cpp
inference-engine/tests/unit/opencv_test_gapi/fluid_test_computations/fluid_test_computations.hpp
inference-engine/tests/unit/samples/config_register.cpp [new file with mode: 0644]
inference-engine/tests/unit/samples/samples_core.cpp [new file with mode: 0644]
inference-engine/tests/unit/shape_infer/adult_test.hpp
inference-engine/tests/unit/shape_infer/adult_test_utils.hpp
inference-engine/tests/unit/transformations/eltwise_broadcast_test.cpp
inference-engine/tests/unit/transformations/sub_test.cpp
inference-engine/tests/validation_app/CMakeLists.txt
inference-engine/thirdparty/clDNN/docs/img/bf8_xy16.jpg
inference-engine/thirdparty/clDNN/docs/img/bfyx.jpg
inference-engine/thirdparty/clDNN/docs/img/bs_x_bsv16.jpg
inference-engine/thirdparty/clDNN/docs/img/bs_xs_xsv8_bsv16.jpg
inference-engine/thirdparty/clDNN/docs/img/bs_xs_xsv8_bsv8.jpg
inference-engine/thirdparty/clDNN/docs/img/byxf.jpg
inference-engine/thirdparty/clDNN/docs/img/crop_no_offset.jpg
inference-engine/thirdparty/clDNN/docs/img/crop_w_offset.jpg
inference-engine/thirdparty/clDNN/docs/img/image_2d_weights_c1_b_fyx.jpg
inference-engine/thirdparty/clDNN/docs/img/image_2d_weights_c4_fyx_b.jpg
inference-engine/thirdparty/clDNN/docs/img/layout_memory_representation.jpg
inference-engine/thirdparty/clDNN/docs/img/os_iyx_osv16.jpg
inference-engine/thirdparty/clDNN/docs/img/workflow.jpg
inference-engine/thirdparty/clDNN/docs/img/yxfb.jpg
inference-engine/thirdparty/clDNN/utils/codegen/generate_api_wrappers.py
inference-engine/thirdparty/mkl-dnn/include/mkldnn.h
inference-engine/thirdparty/mkl-dnn/include/mkldnn.hpp
inference-engine/thirdparty/mkl-dnn/include/mkldnn_types.h
inference-engine/thirdparty/mkl-dnn/src/common/binarization.cpp
inference-engine/thirdparty/mkl-dnn/src/common/binarization_pd.hpp
inference-engine/thirdparty/mkl-dnn/src/common/primitive_attr.cpp
inference-engine/thirdparty/mkl-dnn/src/common/primitive_attr.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_binarization_pd.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_engine.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_generator.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_bin_conv_kernel.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_bin_conv_kernel.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binarization.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_depthwise.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_depthwise.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binarization.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binarization.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binary_convolution.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/simple_reorder.hpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binarization.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_binarization_forward.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_depthwise_forward.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_binarization_forward.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_depthwise_forward.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_eltwise_forward.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_forward.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_forward_common.hpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_sum_forward.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_eltwise_forward.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_forward.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_forward_common.hpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_sum_forward.cpp
inference-engine/thirdparty/mkldnn.cmake
inference-engine/thirdparty/movidius/CMakeLists.txt [new file with mode: 0644]
inference-engine/thirdparty/movidius/MovidiusDriver/Movidius_VSC_Device.inf [new file with mode: 0644]
inference-engine/thirdparty/movidius/MovidiusDriver/amd64/WdfCoInstaller01011.dll [new file with mode: 0644]
inference-engine/thirdparty/movidius/MovidiusDriver/amd64/winusbcoinstaller2.dll [new file with mode: 0644]
inference-engine/thirdparty/movidius/MovidiusDriver/movidius_vsc_device.cat [new file with mode: 0644]
inference-engine/thirdparty/movidius/USB_WIN/gettime.c [new file with mode: 0644]
inference-engine/thirdparty/movidius/USB_WIN/gettime.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/USB_WIN/usb_winusb.c [new file with mode: 0644]
inference-engine/thirdparty/movidius/USB_WIN/usb_winusb.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/WinPthread/win_pthread.c [new file with mode: 0644]
inference-engine/thirdparty/movidius/WinPthread/win_pthread.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/WinPthread/win_semaphore.c [new file with mode: 0644]
inference-engine/thirdparty/movidius/WinPthread/win_semaphore.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/XLink/CMakeLists.txt [new file with mode: 0644]
inference-engine/thirdparty/movidius/XLink/pc/XLinkPlatform.c [new file with mode: 0644]
inference-engine/thirdparty/movidius/XLink/pc/pcie_host.c [new file with mode: 0644]
inference-engine/thirdparty/movidius/XLink/pc/pcie_host.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/XLink/pc/usb_boot.c [new file with mode: 0644]
inference-engine/thirdparty/movidius/XLink/pc/usb_boot.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/XLink/shared/XLink.c [new file with mode: 0644]
inference-engine/thirdparty/movidius/XLink/shared/XLink.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/XLink/shared/XLinkDispatcher.c [new file with mode: 0644]
inference-engine/thirdparty/movidius/XLink/shared/XLinkDispatcher.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/XLink/shared/XLinkPlatform.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/XLink/shared/XLinkPrivateDefines.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/XLink/shared/XLinkPublicDefines.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/XLink/shared/XLinkVersion.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/mvnc/CMakeLists.txt [new file with mode: 0644]
inference-engine/thirdparty/movidius/mvnc/include/mvnc.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/mvnc/include/mvnc_ext.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/mvnc/include/ncCommPrivate.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/mvnc/include/ncPrivateTypes.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/mvnc/src/97-myriad-usbboot.rules [new file with mode: 0644]
inference-engine/thirdparty/movidius/mvnc/src/mvnc_api.c [new file with mode: 0644]
inference-engine/thirdparty/movidius/shared/include/mvLog.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/shared/include/mvMacros.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/watchdog/watchdog.cpp [new file with mode: 0644]
inference-engine/thirdparty/movidius/watchdog/watchdog.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/watchdog/watchdogPrivate.hpp [new file with mode: 0644]
inference-engine/tools/CMakeLists.txt [new file with mode: 0644]
inference-engine/tools/calibration_tool/README.md
inference-engine/tools/vpu/CMakeLists.txt [new file with mode: 0644]
inference-engine/tools/vpu/common/vpu_tools_common.cpp [new file with mode: 0644]
inference-engine/tools/vpu/common/vpu_tools_common.hpp [new file with mode: 0644]
inference-engine/tools/vpu/vpu_compile/CMakeLists.txt [new file with mode: 0644]
inference-engine/tools/vpu/vpu_compile/README.md [new file with mode: 0644]
inference-engine/tools/vpu/vpu_compile/main.cpp [new file with mode: 0644]
inference-engine/tools/vpu/vpu_profile/CMakeLists.txt [new file with mode: 0644]
inference-engine/tools/vpu/vpu_profile/README.md [new file with mode: 0644]
inference-engine/tools/vpu/vpu_profile/main.cpp [new file with mode: 0644]
model-optimizer/extensions/middle/MulQuantizeFuse.py
model-optimizer/extensions/middle/MulQuantizeFuse_test.py [new file with mode: 0644]

index a082023..9270518 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,12 @@ Deep Learning Deployment Toolkit is licensed under [Apache License Version 2.0](
 
 ## Documentation
 * [OpenVINO™ Release Notes](https://software.intel.com/en-us/articles/OpenVINO-RelNotes)
-* Inference Engine [build instructions](inference-engine/README.md)
+* [Inference Engine build instructions](inference-engine/README.md)
+* [Get Started with Deep Learning Deployment Toolkit on Linux*](get-started-linux.md)
+* [Introduction to Deep Learning Deployment Toolkit](https://docs.openvinotoolkit.org/latest/_docs_IE_DG_Introduction.html)
+* [Inference Engine Developer Guide](https://docs.openvinotoolkit.org/latest/_docs_IE_DG_Deep_Learning_Inference_Engine_DevGuide.html)
+* [Model Optimizer Developer Guide](https://docs.openvinotoolkit.org/latest/_docs_MO_DG_Deep_Learning_Model_Optimizer_DevGuide.html)
+
 
 ## How to Contribute
 We welcome community contributions to the Deep Learning Deployment Toolkit repository. If you have an idea how to improve the product, please share it with us by doing the following steps:
diff --git a/get-started-linux.md b/get-started-linux.md
new file mode 100644
index 0000000..4d481c2
--- /dev/null
@@ -0,0 +1,203 @@
+# Get Started with OpenVINO™ Deep Learning Deployment Toolkit (DLDT) on Linux*
+
+This guide provides the information you need to start using the DLDT on Linux*. With this guide, you will learn how to:
+
+1. [Configure the Model Optimizer](#configure-the-model-optimizer)
+2. [Prepare a model for sample inference:](#prepare-a-model-for-sample-inference)
+   1. [Download a pre-trained model](#download-a-trained-model)
+   2. [Convert the model to an Intermediate Representation (IR) with the Model Optimizer](#convert-the-model-to-an-intermediate-representation-with-the-model-optimizer)
+3. [Run the Image Classification Sample Application with the model](#run-the-image-classification-sample-application)
+
+## Prerequisites
+1. This guide assumes that you have already cloned the `dldt` repo and successfully built the Inference Engine and Samples using the [build instructions](inference-engine/README.md).
+2. The original structure of the repository directories is unchanged.
+
+> **NOTE**: Below, the directory to which the `dldt` repository is cloned is referred to as `<DLDT_DIR>`.  
+
+## Configure the Model Optimizer
+
+The Model Optimizer is a Python\*-based command-line tool for importing trained models from popular deep learning frameworks such as Caffe\*, TensorFlow\*, Apache MXNet\*, ONNX\*, and Kaldi\*.
+
+You cannot perform inference on your trained model without running the model through the Model Optimizer. When you run a pre-trained model through the Model Optimizer, your output is an Intermediate Representation (IR) of the network. The Intermediate Representation is a pair of files that describe the whole model:
+
+- `.xml`: Describes the network topology
+- `.bin`: Contains the weights and biases binary data
+
+For more information about the Model Optimizer, refer to the [Model Optimizer Developer Guide](https://docs.openvinotoolkit.org/latest/_docs_MO_DG_Deep_Learning_Model_Optimizer_DevGuide.html). 
+
+### Model Optimizer Configuration Steps
+
+You can choose to either configure all supported frameworks at once **OR** configure one framework at a time. Choose the option that best suits your needs. If you see error messages, make sure you installed all dependencies.
+
+> **NOTE**: Since the TensorFlow framework is not officially supported on CentOS*, the Model Optimizer for TensorFlow cannot be configured and run on those systems.
+
+> **IMPORTANT**: Internet access is required to execute the following steps successfully. If you can access the Internet only through a proxy server, make sure that the proxy is configured in your OS environment.
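+
+If you need to set up a proxy, a minimal sketch of such a configuration in the shell environment is shown below (the proxy host and port are placeholders, not real values):
+```sh
+# Placeholder proxy address -- replace with your actual proxy host and port
+export http_proxy=http://proxy.example.com:8080
+export https_proxy=http://proxy.example.com:8080
+```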
+
+**Option 1: Configure all supported frameworks at the same time**
+
+1.  Go to the Model Optimizer prerequisites directory:
+```sh
+cd <DLDT_DIR>/model-optimizer/install_prerequisites
+```
+2.  Run the script to configure the Model Optimizer for Caffe,
+    TensorFlow, MXNet, Kaldi\*, and ONNX:
+```sh
+sudo ./install_prerequisites.sh
+```
+
+**Option 2: Configure each framework separately**
+
+Configure individual frameworks separately **ONLY** if you did not select **Option 1** above.
+
+1.  Go to the Model Optimizer prerequisites directory:
+```sh
+cd <DLDT_DIR>/model-optimizer/install_prerequisites
+```
+2.  Run the script for your model framework. You can run more than one script:
+
+   - For **Caffe**:
+   ```sh
+   sudo ./install_prerequisites_caffe.sh
+   ```
+
+   - For **TensorFlow**:
+   ```sh
+   sudo ./install_prerequisites_tf.sh
+   ```
+
+   - For **MXNet**:
+   ```sh
+   sudo ./install_prerequisites_mxnet.sh
+   ```
+
+   - For **ONNX**:
+   ```sh
+   sudo ./install_prerequisites_onnx.sh
+   ```
+
+   - For **Kaldi**:
+   ```sh
+   sudo ./install_prerequisites_kaldi.sh
+   ```
+The Model Optimizer is now configured for one or more frameworks. Continue to the next section to download and prepare a model for running a sample inference.
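+
+To spot-check the configuration, you can print the Model Optimizer version (assuming the `--version` flag is available in this release):
+```sh
+python3 <DLDT_DIR>/model-optimizer/mo.py --version
+```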
+
+## Prepare a Model for Sample Inference
+
+This section describes the steps to get a pre-trained model for sample inference and to prepare the model's optimized Intermediate Representation (IR) that the Inference Engine uses.
+
+### Download a Trained Model
+
+To run the Image Classification Sample, you need a pre-trained model to run inference on. This guide uses the public SqueezeNet 1.1 Caffe* model. You can find and download this model manually, or use the OpenVINO™ [Model Downloader](https://github.com/opencv/open_model_zoo/tree/master/model_downloader).
+
+With the Model Downloader, you can download other popular public deep learning topologies and the [OpenVINO™ pre-trained models](https://github.com/opencv/open_model_zoo/tree/master/intel_models), which are prepared for running inference in a wide range of scenarios: object detection, object recognition, object re-identification, human pose estimation, action recognition, and others.
+
+To download the SqueezeNet 1.1 Caffe* model to a models folder with the Model Downloader:
+1. Install the [prerequisites](https://github.com/opencv/open_model_zoo/tree/master/model_downloader#prerequisites).
+2. Run `downloader.py`, specifying the topology name and a `<models_dir>` path. For example, to download the model to the `~/public_models` directory:
+   ```sh
+   ./downloader.py --name squeezenet1.1 --output_dir ~/public_models
+   ```
+   When the model files are successfully downloaded, output similar to the following is printed:
+   ```sh
+   ###############|| Downloading topologies ||###############
+
+   ========= Downloading /home/username/public_models/classification/squeezenet/1.1/caffe/squeezenet1.1.prototxt
+   
+   ========= Downloading /home/username/public_models/classification/squeezenet/1.1/caffe/squeezenet1.1.caffemodel
+   ... 100%, 4834 KB, 3157 KB/s, 1 seconds passed
+
+   ###############|| Post processing ||###############
+
+   ========= Changing input dimensions in squeezenet1.1.prototxt =========
+   ```
+
+### Convert the model to an Intermediate Representation with the Model Optimizer
+
+> **NOTE**: This section assumes that you have configured the Model Optimizer using the instructions from the [Configure the Model Optimizer](#configure-the-model-optimizer) section.
+
+1. Create a `<ir_dir>` directory that will contain the Intermediate Representation (IR) of the model.
+
+2. Inference Engine can perform inference on a [list of supported devices](https://docs.openvinotoolkit.org/latest/_docs_IE_DG_supported_plugins_Supported_Devices.html) using specific device plugins. Different plugins support models of [different precision formats](https://docs.openvinotoolkit.org/latest/_docs_IE_DG_supported_plugins_Supported_Devices.html#supported_model_formats), such as FP32, FP16, and INT8. To prepare an IR to run inference on particular hardware, run the Model Optimizer with the appropriate `--data_type` option:
+
+   **For CPU (FP32):**
+   ```sh  
+   python3 <DLDT_DIR>/model-optimizer/mo.py --input_model <models_dir>/classification/squeezenet/1.1/caffe/squeezenet1.1.caffemodel --data_type FP32 --output_dir <ir_dir>
+   ```
+
+   **For GPU and MYRIAD (FP16):**
+   ```sh  
+   python3 <DLDT_DIR>/model-optimizer/mo.py --input_model <models_dir>/classification/squeezenet/1.1/caffe/squeezenet1.1.caffemodel --data_type FP16 --output_dir <ir_dir>
+   ``` 
+   After the Model Optimizer script completes, the produced IR files (`squeezenet1.1.xml`, `squeezenet1.1.bin`) are in the specified `<ir_dir>` directory.
+
+3. Copy the `squeezenet1.1.labels` file from the `<DLDT_DIR>/inference-engine/samples/sample_data/` directory to the model IR directory. This file contains the ImageNet class names, so the inference results show text labels instead of class numbers:
+   ```sh
+   cp <DLDT_DIR>/inference-engine/samples/sample_data/squeezenet1.1.labels <ir_dir>
+   ```
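+   At this point, `<ir_dir>` should contain the three files the sample needs; an illustrative listing:
+   ```sh
+   ls <ir_dir>
+   # squeezenet1.1.bin  squeezenet1.1.labels  squeezenet1.1.xml
+   ```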
+
+Now you are ready to run the Image Classification Sample Application.
+
+## Run the Image Classification Sample Application
+
+The Inference Engine sample applications were automatically compiled when you built the Inference Engine using the [build instructions](inference-engine/README.md). The binary files are located in the `<DLDT_DIR>/inference-engine/bin/intel64/Release` directory.
+
+Follow the steps below to run the Image Classification sample application on the prepared IR and with an input image: 
+
+1. Go to the samples build directory:
+   ```sh
+   cd <DLDT_DIR>/inference-engine/bin/intel64/Release
+   ```
+2. Run the sample executable, specifying the `car.png` file from the `<DLDT_DIR>/inference-engine/samples/sample_data/` directory as the input image, the IR of your model, and the plugin for the hardware device to perform inference on:
+
+   **For CPU:**
+   ```sh
+   ./classification_sample -i <DLDT_DIR>/inference-engine/samples/sample_data/car.png -m <ir_dir>/squeezenet1.1.xml -d CPU
+   ```
+
+   **For GPU:**
+   ```sh
+   ./classification_sample -i <DLDT_DIR>/inference-engine/samples/sample_data/car.png -m <ir_dir>/squeezenet1.1.xml -d GPU
+   ```
+   
+   **For MYRIAD:** 
+   >**NOTE**: Running inference on VPU devices (Intel® Movidius™ Neural Compute Stick or Intel® Neural Compute Stick 2) with the MYRIAD plugin requires performing [additional hardware configuration steps](inference-engine/README.md#optional-additional-installation-steps-for-the-intel-movidius-neural-compute-stick-and-neural-compute-stick-2).
+   ```sh
+   ./classification_sample -i <DLDT_DIR>/inference-engine/samples/sample_data/car.png -m <ir_dir>/squeezenet1.1.xml -d MYRIAD
+   ```
+
+When the sample application completes, the labels and confidence values for the top 10 categories are printed on the screen. Below is a sample output with inference results on CPU:
+```sh
+Top 10 results:
+
+Image /home/user/dldt/inference-engine/samples/sample_data/car.png
+
+classid probability label
+------- ----------- -----
+817     0.8363345   sports car, sport car
+511     0.0946488   convertible
+479     0.0419131   car wheel
+751     0.0091071   racer, race car, racing car
+436     0.0068161   beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon
+656     0.0037564   minivan
+586     0.0025741   half track
+717     0.0016069   pickup, pickup truck
+864     0.0012027   tow truck, tow car, wrecker
+581     0.0005882   grille, radiator grille
+
+
+total inference time: 2.6642941
+Average running time of one iteration: 2.6642941 ms
+
+Throughput: 375.3339402 FPS
+
+[ INFO ] Execution successful
+```
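+
+By default, the sample prints the top 10 results. Assuming the `-nt` (number of top results) option is available in this release's classification sample, you can request a different count, for example, the top 5:
+```sh
+./classification_sample -i <DLDT_DIR>/inference-engine/samples/sample_data/car.png -m <ir_dir>/squeezenet1.1.xml -d CPU -nt 5
+```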
+
+## Additional Resources
+
+* [OpenVINO™ Release Notes](https://software.intel.com/en-us/articles/OpenVINO-RelNotes)
+* [Inference Engine build instructions](inference-engine/README.md)
+* [Introduction to Intel® Deep Learning Deployment Toolkit](https://docs.openvinotoolkit.org/latest/_docs_IE_DG_Introduction.html)
+* [Inference Engine Developer Guide](https://docs.openvinotoolkit.org/latest/_docs_IE_DG_Deep_Learning_Inference_Engine_DevGuide.html)
+* [Model Optimizer Developer Guide](https://docs.openvinotoolkit.org/latest/_docs_MO_DG_Deep_Learning_Model_Optimizer_DevGuide.html)
+* [Inference Engine Samples Overview](https://docs.openvinotoolkit.org/latest/_docs_IE_DG_Samples_Overview.html)
index 9e639ff..1c3d6ea 100644
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
-cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
 
 project(InferenceEngine)
 
index 36053cd..2bbd7e8 100644 (file)
@@ -1,5 +1,34 @@
-## Repository components
-
+# Build Inference Engine
+
+## Contents
+
+- [Introduction](#introduction)
+- [Build on Linux* Systems](#build-on-linux-systems)
+  - [Software Requirements](#software-requirements)
+  - [Build Steps](#build-steps)
+  - [Additional Build Options](#additional-build-options)
+- [Build for Raspbian* Stretch OS](#build-for-raspbian-stretch-os)
+  - [Hardware Requirements](#hardware-requirements)
+  - [Native Compilation](#native-compilation)
+  - [Cross Compilation Using Docker*](#cross-compilation-using-docker)
+  - [Additional Build Options](#additional-build-options-1)
+- [Build on Windows* Systems](#build-on-windows-systems)
+  - [Software Requirements](#software-requirements-1)
+  - [Build Steps](#build-steps-1)
+  - [Additional Build Options](#additional-build-options-2)
+  - [Building Inference Engine with Ninja* Build System](#building-inference-engine-with-ninja-build-system)
+- [Build on macOS* Systems](#build-on-macos-systems)
+  - [Software Requirements](#software-requirements-2)
+  - [Build Steps](#build-steps-2)
+  - [Additional Build Options](#additional-build-options-3)
+- [Use Custom OpenCV Builds for Inference Engine](#use-custom-opencv-builds-for-inference-engine)
+- [(Optional) Additional Installation Steps for the Intel® Movidius™ Neural Compute Stick and Neural Compute Stick 2](#optional-additional-installation-steps-for-the-intel-movidius-neural-compute-stick-and-neural-compute-stick-2)
+  - [For Linux, Raspbian Stretch* OS](#for-linux-raspbian-stretch-os)
+  - [For Windows](#for-windows-1)
+- [Next Steps](#next-steps)
+- [Additional Resources](#additional-resources)
+
+## Introduction
 The Inference Engine can run inference on models in different formats, with various input and output data formats.
 
 The open source version of Inference Engine includes the following plugins:
@@ -9,21 +38,22 @@ The open source version of Inference Engine includes the following plugins:
 | CPU plugin           | Intel® Xeon® with Intel® AVX2 and AVX512, Intel® Core™ Processors with Intel® AVX2, Intel® Atom® Processors with Intel® SSE |
 | GPU plugin           | Intel® Processor Graphics, including Intel® HD Graphics and Intel® Iris® Graphics |
 | GNA plugin           | Intel® Speech Enabling Developer Kit, Amazon Alexa* Premium Far-Field Developer Kit, Intel® Pentium® Silver processor J5005, Intel® Celeron® processor J4005, Intel® Core™ i3-8121U processor |
+| MYRIAD plugin        | Intel® Movidius™ Neural Compute Stick powered by the Intel® Movidius™ Myriad™ 2, Intel® Neural Compute Stick 2 powered by the Intel® Movidius™ Myriad™ X |
 | Heterogeneous plugin | Heterogeneous plugin enables inference of a single network across several Intel® devices. |
 
-Inference Engine plugins for Intel® FPGA and Intel® Movidius™ Neural Compute Stick are distributed only in a binary form as a part of [Intel® Distribution of OpenVINO™](https://software.intel.com/en-us/openvino-toolkit).
+Inference Engine plugin for Intel® FPGA is distributed only in binary form as part of the [Intel® Distribution of OpenVINO™](https://software.intel.com/en-us/openvino-toolkit).
 
-## Build on Linux\* Systems
+## Build on Linux* Systems
 
 The software was validated on:
 - Ubuntu\* 16.04 (64-bit) with default GCC\* 5.4.0
 - CentOS\* 7.4 (64-bit) with default GCC\* 4.8.5
-- [Intel® Graphics Compute Runtime for OpenCL™ Driver package 18.28.11080](https://github.com/intel/compute-runtime/releases/tag/18.28.11080).
 
 ### Software Requirements
-- [CMake\*](https://cmake.org/download/) 3.9 or higher
+- [CMake\*](https://cmake.org/download/) 3.5 or higher
 - GCC\* 4.8 or higher to build the Inference Engine
 - Python 2.7 or higher for Inference Engine Python API wrapper
+- (Optional) [Install Intel® Graphics Compute Runtime for OpenCL™ Driver package 19.04.12237](https://github.com/intel/compute-runtime/releases/tag/19.04.12237).
 
 ### Build Steps
 1. Clone submodules:
@@ -33,34 +63,42 @@ The software was validated on:
     git submodule update --recursive
     ```
 2. Install build dependencies using the `install_dependencies.sh` script in the project root folder.
-3. Create a build folder:
+3. By default, the build enables the Inference Engine GPU plugin to infer models on your Intel® Processor Graphics. This requires you to [install the Intel® Graphics Compute Runtime for OpenCL™ Driver package 19.04.12237](https://github.com/intel/compute-runtime/releases/tag/19.04.12237) before running the build. If you don't want to use the GPU plugin, use the `-DENABLE_CLDNN=OFF` CMake build option and skip the installation of the Intel® Graphics Compute Runtime for OpenCL™ Driver.
+4. Create a build folder:
 ```sh
-  mkdir build
+  mkdir build && cd build
 ```
-4. Inference Engine uses a CMake-based build system. In the created `build` directory, run `cmake` to fetch project dependencies and create Unix makefiles, then run `make` to build the project:
+5. Inference Engine uses a CMake-based build system. In the created `build` directory, run `cmake` to fetch project dependencies and create Unix makefiles, then run `make` to build the project:
 ```sh
   cmake -DCMAKE_BUILD_TYPE=Release ..
-  make -j16
+  make --jobs=$(nproc --all)
 ```
+
+### Additional Build Options
+
 You can use the following additional build options:
+
 - Internal JIT GEMM implementation is used by default.
-- To switch to OpenBLAS\* implementation, use `GEMM=OPENBLAS` option and `BLAS_INCLUDE_DIRS` and `BLAS_LIBRARIES` cmake options to specify path to OpenBLAS headers and library, for example use the following options on CentOS\*: `-DGEMM=OPENBLAS -DBLAS_INCLUDE_DIRS=/usr/include/openblas -DBLAS_LIBRARIES=/usr/lib64/libopenblas.so.0`
 
-- To switch to the optimized MKL-ML\* GEMM implementation, use `-DGEMM=MKL` and `-DMKLROOT=<path_to_MKL>` cmake options to specify a path to unpacked MKL-ML with the `include` and `lib` folders. MKL-ML\* package can be downloaded [here](https://github.com/intel/mkl-dnn/releases/download/v0.17/mklml_lnx_2019.0.1.20180928.tgz)
+- To switch to the OpenBLAS\* implementation, use the `GEMM=OPENBLAS` option and the `BLAS_INCLUDE_DIRS` and `BLAS_LIBRARIES` CMake options to specify the path to the OpenBLAS headers and library. For example, use the following options on CentOS\*: `-DGEMM=OPENBLAS -DBLAS_INCLUDE_DIRS=/usr/include/openblas -DBLAS_LIBRARIES=/usr/lib64/libopenblas.so.0`.
+
+- To switch to the optimized MKL-ML\* GEMM implementation, use the `-DGEMM=MKL` and `-DMKLROOT=<path_to_MKL>` CMake options to specify a path to unpacked MKL-ML with the `include` and `lib` folders. The MKL-ML\* package can be downloaded from the [MKL-DNN repository](https://github.com/intel/mkl-dnn/releases/download/v0.17/mklml_lnx_2019.0.1.20180928.tgz).
 
 - Threading Building Blocks (TBB) is used by default. To build the Inference Engine with OpenMP* threading, set the `-DTHREADING=OMP` option.
 
-- Required versions of TBB and OpenCV packages are downloaded automatically by the CMake-based script. If you already have installed TBB or OpenCV packages configured in your environment, you may need to clean the `TBBROOT` and `OpenCV_DIR` environment variables before running the `cmake` command, otherwise they won't be downloaded and the build may fail if incompatible versions were installed.
+- Required versions of TBB and OpenCV packages are downloaded automatically by the CMake-based script. If you want to use the automatically downloaded packages but already have TBB or OpenCV packages configured in your environment, you may need to unset the `TBBROOT` and `OpenCV_DIR` environment variables before running the `cmake` command; otherwise the packages won't be downloaded, and the build may fail if incompatible versions are installed.
 
-- To build the Python API wrapper, use the `-DENABLE_PYTHON=ON` option. To specify an exact Python version, use the following options:
-```sh
-  -DPYTHON_EXECUTABLE=`which python3.7` \
-  -DPYTHON_LIBRARY=/usr/lib/x86_64-linux-gnu/libpython3.7m.so \
-  -DPYTHON_INCLUDE_DIR=/usr/include/python3.7
-```
-
-- To switch on/off the CPU and GPU plugins, use `cmake` options `-DENABLE_MKL_DNN=ON/OFF` and `-DENABLE_CLDNN=ON/OFF`.
+- If the CMake-based build script cannot find and download the OpenCV package that is supported on your platform, or if you want to use a custom build of the OpenCV library, refer to the [Use Custom OpenCV Builds](#use-custom-opencv-builds-for-inference-engine) section for details.
 
+- To build the Python API wrapper, use the `-DENABLE_PYTHON=ON` option. To specify an exact Python version, use the following options:
+   ```sh
+   -DPYTHON_EXECUTABLE=`which python3.7` \
+   -DPYTHON_LIBRARY=/usr/lib/x86_64-linux-gnu/libpython3.7m.so \
+   -DPYTHON_INCLUDE_DIR=/usr/include/python3.7
+   ```
+
+- To switch the CPU and GPU plugins off or on, use the `cmake` options `-DENABLE_MKL_DNN=ON/OFF` and `-DENABLE_CLDNN=ON/OFF`, respectively.
+  
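+For instance, several of these options can be combined in a single configuration command. The following is only a sketch with illustrative CentOS paths, not a required invocation:
+```sh
+cmake -DCMAKE_BUILD_TYPE=Release \
+      -DGEMM=OPENBLAS \
+      -DBLAS_INCLUDE_DIRS=/usr/include/openblas \
+      -DBLAS_LIBRARIES=/usr/lib64/libopenblas.so.0 \
+      -DENABLE_PYTHON=ON ..
+```
+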
-5. Adding to your project
+6. Adding to your project
 
     For CMake projects, set an environment variable `InferenceEngine_DIR`:
@@ -79,16 +117,179 @@ You can use the following additional build options:
     target_link_libraries(${PROJECT_NAME} ${InferenceEngine_LIBRARIES} dl)
     ```
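+
+    For example (a sketch; the path below is a placeholder for your actual build folder, which contains `InferenceEngineConfig.cmake`):
+    ```sh
+    export InferenceEngine_DIR=/path/to/dldt/inference-engine/build
+    ```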
 
-## Build on Windows\* Systems:
+## Build for Raspbian* Stretch OS
+
+> **NOTE**: Only the MYRIAD plugin is supported.
+
+### Hardware Requirements
+* Raspberry Pi\* 2 or 3 with Raspbian\* Stretch OS (32-bit). Check that its CPU supports the ARMv7 instruction set (the `uname -m` command returns `armv7l`).
+
+  > **NOTE**: Even though the Raspberry Pi\* CPU is ARMv8, the 32-bit OS reports the ARMv7 CPU instruction set. The default `gcc` compiler applies the ARMv6 architecture flag for compatibility with older board revisions. For more information, run the `gcc -Q --help=target` command and refer to the description of the `-march=` option.
+
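+  A quick way to verify both points, using the commands mentioned above:
+
+  ```bash
+  uname -m                           # expect: armv7l
+  gcc -Q --help=target | grep march  # shows the default -march value
+  ```
+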
+You can compile the Inference Engine for Raspberry Pi\* in one of two ways:
+* [Native Compilation](#native-compilation), which is the simplest way, but time-consuming
+* [Cross Compilation Using Docker*](#cross-compilation-using-docker), which is the recommended way
+
+### Native Compilation
+Native compilation of the Inference Engine is the most straightforward solution. However, it might take at least one hour to complete on Raspberry Pi\* 3.
+
+1. Install dependencies:
+
+  ```bash
+  sudo apt-get update
+  sudo apt-get install -y git cmake libusb-1.0-0-dev
+  ```
+
+2. Go to the `inference-engine` directory of the cloned `dldt` repository:
+
+  ```bash
+  cd dldt/inference-engine
+  ```
+
+3. Initialize submodules:
+
+  ```bash
+  git submodule init
+  git submodule update --recursive
+  ```
+
+4. Create a build folder:
+
+  ```bash
+  mkdir build && cd build
+  ```
+
+5. Build the Inference Engine:
+
+  ```bash
+  cmake -DCMAKE_BUILD_TYPE=Release \
+        -DENABLE_SSE42=OFF \
+        -DTHREADING=SEQ \
+        -DENABLE_GNA=OFF .. && make -j2
+  ```
+
+### Cross Compilation Using Docker*
+
+  This compilation was tested on the following configuration:
+
+  * Host: Ubuntu\* 16.04 (64-bit, Intel® Core™ i7-6700K CPU @ 4.00GHz × 8)
+  * Target: Raspbian\* Stretch (32-bit, ARMv7, Raspberry Pi\* 3)
+
+1. Install Docker\*:
+
+  ```bash
+  sudo apt-get install -y docker.io
+  ```
+
+2. Add the current user to the `docker` group:
+
+  ```bash
+  sudo usermod -a -G docker $USER
+  ```
+
+  Log out and log in for this to take effect.
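+
+  To confirm the membership after you log back in (an optional check, not part of the original steps):
+
+  ```bash
+  groups | grep docker
+  ```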
+
+3. Create a directory named `ie_cross_armhf` and add a text file named `Dockerfile`
+with the following content:
+
+  ```docker
+  FROM debian:stretch
+
+  USER root
+
+  RUN dpkg --add-architecture armhf && \
+      apt-get update && \
+      apt-get install -y --no-install-recommends \
+      build-essential \
+      crossbuild-essential-armhf \
+      git \
+      wget \
+      libusb-1.0-0-dev:armhf \
+      libgtk-3-dev:armhf \
+      libavcodec-dev:armhf \
+      libavformat-dev:armhf \
+      libswscale-dev:armhf \
+      libgstreamer1.0-dev:armhf \
+      libgstreamer-plugins-base1.0-dev:armhf \
+      libpython3-dev:armhf \
+      python3-pip
+      
+  RUN wget https://www.cmake.org/files/v3.14/cmake-3.14.3.tar.gz && \
+      tar xf cmake-3.14.3.tar.gz && \
+      (cd cmake-3.14.3 && ./bootstrap --parallel=$(nproc --all) && make --jobs=$(nproc --all) && make install) && \
+      rm -rf cmake-3.14.3 cmake-3.14.3.tar.gz
+
+  ```
+
+  It uses the Debian\* Stretch (Debian 9) OS for compilation because it is the base of Raspbian\* Stretch.
+
+4. Build a Docker\* image:
+
+  ```bash
+  docker image build -t ie_cross_armhf ie_cross_armhf
+  ```
+
+5. Run the Docker\* container, mounting the source code folder from the host:
+
+  ```bash
+  docker run -it -v /absolute/path/to/dldt:/dldt ie_cross_armhf /bin/bash
+  ```
+
+6. While in the container:
+
+    1. Go to the `inference-engine` directory of the cloned `dldt` repository:
+
+      ```bash
+      cd dldt/inference-engine
+      ```
+
+    2. Create a build folder:
+
+      ```bash
+      mkdir build && cd build
+      ```
+
+    3. Build the Inference Engine:
+
+      ```bash
+      cmake -DCMAKE_BUILD_TYPE=Release \
+          -DCMAKE_TOOLCHAIN_FILE="../cmake/arm.toolchain.cmake" \
+          -DTHREADS_PTHREAD_ARG="-pthread" \
+          -DENABLE_SSE42=OFF \
+          -DTHREADING=SEQ \
+          -DENABLE_GNA=OFF .. && make --jobs=$(nproc --all)
+      ```
+
+7. Press "Ctrl"+"D" to exit the Docker\* container. You can find the resulting binaries in the `dldt/inference-engine/bin/armv7l/` directory and the OpenCV* installation in the `dldt/inference-engine/temp` directory.
+   
+> **NOTE**: Native applications that link to the cross-compiled Inference Engine library require the extra compilation flag `-march=armv7-a`.
+
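+A sketch of compiling such a native application against the cross-compiled library (the paths and the library name `inference_engine` are assumptions based on the output locations above):
+
+```bash
+g++ -march=armv7-a main.cpp \
+    -I dldt/inference-engine/include \
+    -L dldt/inference-engine/bin/armv7l/Release/lib \
+    -linference_engine \
+    -o my_app
+```
+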
+### Additional Build Options
+
+You can use the following additional build options:
+
+- Required versions of OpenCV packages are downloaded automatically by the CMake-based script. If you want to use the automatically downloaded packages but already have OpenCV packages configured in your environment, you may need to unset the `OpenCV_DIR` environment variable before running the `cmake` command; otherwise the packages won't be downloaded, and the build may fail if incompatible versions are installed.
+
+- If the CMake-based build script cannot find and download the OpenCV package that is supported on your platform, or if you want to use a custom build of the OpenCV library, refer to the [Use Custom OpenCV Builds](#use-custom-opencv-builds-for-inference-engine) section for details.
+
+- To build the Python API wrapper, install the `libpython3-dev:armhf` and `python3-pip` packages using `apt-get`, then install the `numpy` and `cython` Python modules using the `pip3` command, as shown below, and add the following CMake options:
+```sh
+  -DENABLE_PYTHON=ON \
+  -DPYTHON_EXECUTABLE=/usr/bin/python3.5 \
+  -DPYTHON_LIBRARY=/usr/lib/arm-linux-gnueabihf/libpython3.5m.so \
+  -DPYTHON_INCLUDE_DIR=/usr/include/python3.5
+```
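+
+The package installation mentioned in the option above might look like the following sketch:
+```sh
+sudo apt-get install -y libpython3-dev:armhf python3-pip
+pip3 install numpy cython
+```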
+
+## Build on Windows* Systems
 
 The software was validated on:
 - Microsoft\* Windows\* 10 (64-bit) with Visual Studio 2017 and Intel® C++ Compiler 2018 Update 3
-- [Intel® Graphics Driver for Windows* [24.20] driver package](https://downloadcenter.intel.com/download/27803/Graphics-Intel-Graphics-Driver-for-Windows-10?v=t).
 
 ### Software Requirements
-- [CMake\*](https://cmake.org/download/) 3.9 or higher
+- [CMake\*](https://cmake.org/download/) 3.5 or higher
 - [OpenBLAS\*](https://sourceforge.net/projects/openblas/files/v0.2.14/OpenBLAS-v0.2.14-Win64-int64.zip/download) and [mingw64\* runtime dependencies](https://sourceforge.net/projects/openblas/files/v0.2.14/mingw64_dll.zip/download).
 - [Intel® C++ Compiler](https://software.intel.com/en-us/intel-parallel-studio-xe) 18.0 to build the Inference Engine on Windows.
+- (Optional) [Intel® Graphics Driver for Windows* [25.20] driver package](https://downloadcenter.intel.com/download/28646/Intel-Graphics-Windows-10-DCH-Drivers?product=80939).
 - Python 3.4 or higher for Inference Engine Python API wrapper
 
 ### Build Steps
@@ -101,11 +302,12 @@ The software was validated on:
 3. Install OpenBLAS:
     1. Download [OpenBLAS\*](https://sourceforge.net/projects/openblas/files/v0.2.14/OpenBLAS-v0.2.14-Win64-int64.zip/download)
     2. Unzip the downloaded package to a directory on your machine. In this document, this directory is referred to as `<OPENBLAS_DIR>`.
-4. Create build directory:
+4. By default, the build enables the Inference Engine GPU plugin to infer models on your Intel® Processor Graphics. This requires you to [download and install the Intel® Graphics Driver for Windows* [25.20] driver package](https://downloadcenter.intel.com/download/28646/Intel-Graphics-Windows-10-DCH-Drivers?product=80939) before running the build. If you don't want to use the GPU plugin, use the `-DENABLE_CLDNN=OFF` CMake build option and skip the installation of the Intel® Graphics Driver.
+5. Create build directory:
     ```sh
     mkdir build
     ```
-5. In the `build` directory, run `cmake` to fetch project dependencies and generate a Visual Studio solution:
+6. In the `build` directory, run `cmake` to fetch project dependencies and generate a Visual Studio solution:
 ```sh
 cd build
 cmake -G "Visual Studio 15 2017 Win64" -T "Intel C++ Compiler 18.0" ^
@@ -113,26 +315,32 @@ cmake -G "Visual Studio 15 2017 Win64" -T "Intel C++ Compiler 18.0" ^
     -DICCLIB="C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2018\windows\compiler\lib" ..
 ```
 
+7. Build the generated solution in Visual Studio 2017, or run `cmake --build . --config Release` to build from the command line.
+
+8. Before running the samples, add the paths to the TBB and OpenCV binaries used for the build to the `%PATH%` environment variable. By default, the CMake-based script downloads TBB binaries to the `<dldt_repo>/inference-engine/temp/tbb/lib` folder and OpenCV binaries to the `<dldt_repo>/inference-engine/temp/opencv_4.1.0/bin` folder.
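+   For example, a sketch of extending `%PATH%` for the current Command Prompt session, assuming the default download locations named above:
+   ```sh
+   set PATH=<dldt_repo>\inference-engine\temp\tbb\lib;<dldt_repo>\inference-engine\temp\opencv_4.1.0\bin;%PATH%
+   ```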
+
+### Additional Build Options
+
 - Internal JIT GEMM implementation is used by default.
-- To switch to OpenBLAS GEMM implementation, use -DGEMM=OPENBLAS cmake option and specify path to OpenBLAS using `-DBLAS_INCLUDE_DIRS=<OPENBLAS_DIR>\include` and `-DBLAS_LIBRARIES=<OPENBLAS_DIR>\lib\libopenblas.dll.a` options. Prebuilt OpenBLAS\* package can be downloaded [here](https://sourceforge.net/projects/openblas/files/v0.2.14/OpenBLAS-v0.2.14-Win64-int64.zip/download), mingw64* runtime dependencies [here](https://sourceforge.net/projects/openblas/files/v0.2.14/mingw64_dll.zip/download)
-- To switch to the optimized MKL-ML\* GEMM implementation, use `-DGEMM=MKL` and `-DMKLROOT=<path_to_MKL>` cmake options to specify a path to unpacked MKL-ML with the `include` and `lib` folders. MKL-ML\* package can be downloaded [here](https://github.com/intel/mkl-dnn/releases/download/v0.17/mklml_win_2019.0.1.20180928.zip)
+- To switch to the OpenBLAS GEMM implementation, use the `-DGEMM=OPENBLAS` CMake option and specify the path to OpenBLAS using the `-DBLAS_INCLUDE_DIRS=<OPENBLAS_DIR>\include` and `-DBLAS_LIBRARIES=<OPENBLAS_DIR>\lib\libopenblas.dll.a` options. A prebuilt OpenBLAS\* package can be downloaded [here](https://sourceforge.net/projects/openblas/files/v0.2.14/OpenBLAS-v0.2.14-Win64-int64.zip/download); the mingw64* runtime dependencies can be downloaded [here](https://sourceforge.net/projects/openblas/files/v0.2.14/mingw64_dll.zip/download).
+- To switch to the optimized MKL-ML\* GEMM implementation, use the `-DGEMM=MKL` and `-DMKLROOT=<path_to_MKL>` CMake options to specify a path to unpacked MKL-ML with the `include` and `lib` folders. The MKL-ML\* package can be downloaded from the [MKL-DNN repository](https://github.com/intel/mkl-dnn/releases/download/v0.17/mklml_win_2019.0.1.20180928.zip).
 
 - Threading Building Blocks (TBB) is used by default. To build the Inference Engine with OpenMP* threading, set the `-DTHREADING=OMP` option.
 
-- Required versions of TBB and OpenCV packages are downloaded automatically by the CMake-based script. If you already have installed TBB or OpenCV packages configured in your environment, you may need to clean the `TBBROOT` and `OpenCV_DIR` environment variables before running the `cmake` command, otherwise they won't be downloaded and the build may fail if incompatible versions were installed.
+- Required versions of TBB and OpenCV packages are downloaded automatically by the CMake-based script. If you want to use the automatically downloaded packages but already have TBB or OpenCV packages configured in your environment, you may need to unset the `TBBROOT` and `OpenCV_DIR` environment variables before running the `cmake` command; otherwise the packages won't be downloaded, and the build may fail if incompatible versions are installed.
 
-- To build the Python API wrapper, use the `-DENABLE_PYTHON=ON` option. To specify an exact Python version, use the following options:
-```sh
-  -DPYTHON_EXECUTABLE="C:\Program Files\Python37\python.exe" ^
-  -DPYTHON_LIBRARY="C:\Program Files\Python37\libs\python37.lib" ^
-  -DPYTHON_INCLUDE_DIR="C:\Program Files\Python37\include"
-```
+- If the CMake-based build script cannot find and download the OpenCV package that is supported on your platform, or if you want to use a custom build of the OpenCV library, refer to the [Use Custom OpenCV Builds](#use-custom-opencv-builds-for-inference-engine) section for details.
 
-6. Build generated solution in Visual Studio 2017 or run `cmake --build . --config Release` to build from the command line.
+- To switch the CPU and GPU plugins off or on, use the `cmake` options `-DENABLE_MKL_DNN=ON/OFF` and `-DENABLE_CLDNN=ON/OFF`, respectively.
 
-7. Before running the samples, add paths to TBB and OpenCV binaries used for the build to the %PATH% environment variable. By default, TBB binaries are downloaded by the CMake-based script to the `<dldt_repo>/inference-engine/temp/tbb/lib` folder, OpenCV binaries - to the `<dldt_repo>/inference-engine/temp/opencv_4.1.0/bin` folder.
+- To build the Python API wrapper, use the `-DENABLE_PYTHON=ON` option. To specify an exact Python version, use the following options:
+   ```sh
+   -DPYTHON_EXECUTABLE="C:\Program Files\Python37\python.exe" ^
+   -DPYTHON_LIBRARY="C:\Program Files\Python37\libs\python37.lib" ^
+   -DPYTHON_INCLUDE_DIR="C:\Program Files\Python37\include"
+   ```
 
-### Building Inference Engine with Ninja
+### Building Inference Engine with Ninja* Build System
 
 ```sh
 call "C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2018\windows\bin\ipsxe-comp-vars.bat" intel64 vs2017
@@ -144,13 +352,15 @@ cmake -G Ninja -Wno-dev -DCMAKE_BUILD_TYPE=Release ..
 cmake --build . --config Release
 ```
 
-## Build on macOS\* Systems
+## Build on macOS* Systems
+
+> **NOTE**: The current version of the OpenVINO™ toolkit for macOS* supports inference on Intel CPUs only.
 
 The software was validated on:
 - macOS\* 10.14, 64-bit
 
 ### Software Requirements
-- [CMake\*](https://cmake.org/download/) 3.9 or higher
+- [CMake\*](https://cmake.org/download/) 3.5 or higher
 - Clang\* compiler from Xcode\* 10.1
 - Python\* 3.4 or higher for the Inference Engine Python API wrapper
 
@@ -169,14 +379,20 @@ The software was validated on:
 4. Inference Engine uses a CMake-based build system. In the created `build` directory, run `cmake` to fetch project dependencies and create Unix makefiles, then run `make` to build the project:
 ```sh
   cmake -DCMAKE_BUILD_TYPE=Release ..
-  make -j16
+  make --jobs=$(nproc --all)
 ```
+### Additional Build Options
+
 You can use the following additional build options:
 - Internal JIT GEMM implementation is used by default.
 - To switch to the optimized MKL-ML\* GEMM implementation, use `-DGEMM=MKL` and `-DMKLROOT=<path_to_MKL>` cmake options to specify a path to unpacked MKL-ML with the `include` and `lib` folders. MKL-ML\* package can be downloaded [here](https://github.com/intel/mkl-dnn/releases/download/v0.17.1/mklml_mac_2019.0.1.20180928.tgz)
 
 - Threading Building Blocks (TBB) is used by default. To build the Inference Engine with OpenMP* threading, set the `-DTHREADING=OMP` option.
 
+- Required versions of TBB and OpenCV packages are downloaded automatically by the CMake-based script. If you want to use the automatically downloaded packages but already have TBB or OpenCV packages configured in your environment, you may need to unset the `TBBROOT` and `OpenCV_DIR` environment variables before running the `cmake` command; otherwise the packages won't be downloaded, and the build may fail if incompatible versions are installed.
+
+- If the CMake-based build script cannot find and download the OpenCV package that is supported on your platform, or if you want to use a custom build of the OpenCV library, refer to the [Use Custom OpenCV Builds](#use-custom-opencv-builds-for-inference-engine) section for details.
+
 - To build the Python API wrapper, use the `-DENABLE_PYTHON=ON` option. To specify an exact Python version, use the following options:
 ```sh
   -DPYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3.7 \
@@ -184,6 +400,82 @@ You can use the following additional build options:
   -DPYTHON_INCLUDE_DIR=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m
 ```
 
+## Use Custom OpenCV Builds for Inference Engine
+
+> **NOTE**: The recommended and tested version of OpenCV is 4.1. The minimum supported version is 3.4.0.  
+
+Required versions of OpenCV packages are downloaded automatically during the Inference Engine build. If the build script cannot find and download the OpenCV package that is supported on your platform, you can use one of the following options:
+
+* Download the most suitable version from the list of available pre-built packages at [https://download.01.org/opencv/2019/openvinotoolkit](https://download.01.org/opencv/2019/openvinotoolkit), in the `<release_version>/inference_engine` directory.
+
+* Use a system-provided OpenCV package (for example, by running the `apt install libopencv-dev` command). The following modules must be enabled: `imgcodecs`, `videoio`, `highgui`.
+
+* Get the OpenCV package using a package manager such as pip, Conda, or Conan. The package must include the development components (header files and CMake scripts).
+
+* Build OpenCV from source using the [build instructions](https://docs.opencv.org/master/df/d65/tutorial_table_of_content_introduction.html) on the OpenCV site. 
+  
+After you get the OpenCV library build, perform the following preparation steps before running the Inference Engine build:
+  
+1. Set the `OpenCV_DIR` environment variable to the directory where the `OpenCVConfig.cmake` file of your custom OpenCV build is located.
+2. Disable automatic package downloading by passing the `-DENABLE_OPENCV=OFF` option to the CMake-based build script for the Inference Engine, as in the sketch below.
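+
+A minimal sketch of both steps on Linux (the path is a placeholder for your own OpenCV build):
+```sh
+export OpenCV_DIR=/path/to/custom-opencv-build   # directory that contains OpenCVConfig.cmake
+cmake -DENABLE_OPENCV=OFF -DCMAKE_BUILD_TYPE=Release ..
+```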
+
+## (Optional) Additional Installation Steps for the Intel® Movidius™ Neural Compute Stick and Neural Compute Stick 2
+
+> **NOTE**: These steps are required only if you want to perform inference on the Intel® Movidius™ Neural Compute Stick or the Intel® Neural Compute Stick 2 using the Inference Engine MYRIAD plugin. See also [Intel® Neural Compute Stick 2 Get Started](https://software.intel.com/en-us/neural-compute-stick/get-started).
+
+### For Linux, Raspbian\* Stretch OS
+
+1. Add the current Linux user to the `users` group:
+```sh
+sudo usermod -a -G users "$(whoami)"
+```
+Log out and log in for it to take effect.
+
+2. To perform inference on the Intel® Movidius™ Neural Compute Stick and Intel® Neural Compute Stick 2, install the USB rules as follows:
+```sh
+cat <<EOF > 97-myriad-usbboot.rules
+SUBSYSTEM=="usb", ATTRS{idProduct}=="2150", ATTRS{idVendor}=="03e7", GROUP="users", MODE="0666", ENV{ID_MM_DEVICE_IGNORE}="1"
+SUBSYSTEM=="usb", ATTRS{idProduct}=="2485", ATTRS{idVendor}=="03e7", GROUP="users", MODE="0666", ENV{ID_MM_DEVICE_IGNORE}="1"
+SUBSYSTEM=="usb", ATTRS{idProduct}=="f63b", ATTRS{idVendor}=="03e7", GROUP="users", MODE="0666", ENV{ID_MM_DEVICE_IGNORE}="1"
+EOF
+```
+```sh
+sudo cp 97-myriad-usbboot.rules /etc/udev/rules.d/
+```
+```sh
+sudo udevadm control --reload-rules
+```
+```sh
+sudo udevadm trigger
+```
+```sh
+sudo ldconfig
+```
+```sh
+rm 97-myriad-usbboot.rules
+```
+
+### For Windows
+
+For Intel® Movidius™ Neural Compute Stick and Intel® Neural Compute Stick 2, install the Movidius™ VSC driver:
+1. Go to the `<DLDT_ROOT_DIR>/inference-engine/thirdparty/movidius/MovidiusDriver` directory, where `<DLDT_ROOT_DIR>` is the directory where the DLDT repository was cloned.
+2. Right-click the `Movidius_VSC_Device.inf` file and choose **Install** from the pop-up menu.
+
+You have installed the driver for your Intel® Movidius™ Neural Compute Stick or Intel® Neural Compute Stick 2.
+
+## Next Steps
+
+Congratulations, you have built the Inference Engine. To get started with the OpenVINO™ DLDT, proceed to the Get Started guides:
+
+* [Get Started with Deep Learning Deployment Toolkit on Linux*](../get-started-linux.md)
+
+## Additional Resources
+
+* [OpenVINO™ Release Notes](https://software.intel.com/en-us/articles/OpenVINO-RelNotes)
+* [Introduction to Intel® Deep Learning Deployment Toolkit](https://docs.openvinotoolkit.org/latest/_docs_IE_DG_Introduction.html)
+* [Inference Engine Samples Overview](https://docs.openvinotoolkit.org/latest/_docs_IE_DG_Samples_Overview.html)
+* [Inference Engine Developer Guide](https://docs.openvinotoolkit.org/latest/_docs_IE_DG_Deep_Learning_Inference_Engine_DevGuide.html)
+* [Model Optimizer Developer Guide](https://docs.openvinotoolkit.org/latest/_docs_MO_DG_Deep_Learning_Model_Optimizer_DevGuide.html)
 
 ---
-\* Other names and brands may be claimed as the property of others.
+\* Other names and brands may be claimed as the property of others.
\ No newline at end of file
index 00861fa..0b137b6 100644 (file)
@@ -30,6 +30,7 @@ endif()
 if (APPLE)
     set(ENABLE_GNA OFF)
     set(ENABLE_CLDNN OFF)
+    set(ENABLE_MYRIAD OFF)
 endif()
 
 
@@ -60,6 +61,14 @@ if (NOT ENABLE_MKL_DNN)
     set(ENABLE_MKL OFF)
 endif()
 
+if (NOT ENABLE_VPU)
+    set(ENABLE_MYRIAD OFF)
+endif()
+
+if (NOT ENABLE_MYRIAD)
+    set(ENABLE_VPU OFF)
+endif()
+
 # next section sets defines to be accessible in C++/C code for certain features
 if (ENABLE_PROFILING_RAW)
     add_definitions(-DENABLE_PROFILING_RAW=1)
@@ -69,6 +78,22 @@ if (ENABLE_CLDNN)
     add_definitions(-DENABLE_CLDNN=1)
 endif()
 
+if (ENABLE_MYRIAD)
+    add_definitions(-DENABLE_MYRIAD=1)
+endif()
+
+if (ENABLE_MYX_PCIE AND ENABLE_MYRIAD)
+    add_definitions(-DENABLE_MYX_PCIE=1)
+endif()
+
+if (ENABLE_MYRIAD_NO_BOOT AND ENABLE_MYRIAD)
+    add_definitions(-DENABLE_MYRIAD_NO_BOOT=1)
+endif()
+
+if (ENABLE_MYX_PCIE AND ENABLE_MYRIAD_NO_BOOT)
+    message(FATAL_ERROR "ENABLE_MYX_PCIE and ENABLE_MYRIAD_NO_BOOT can't be enabled at the same time")
+endif()
+
 if (ENABLE_MKL_DNN)
     add_definitions(-DENABLE_MKL_DNN=1)
 endif()
index d8ae9e1..d3cdf95 100644 (file)
@@ -37,6 +37,24 @@ else()
     set(MODELS_BRANCH "master")
 endif()
 
+if (ENABLE_MYRIAD)
+    RESOLVE_DEPENDENCY(VPU_FIRMWARE_MA2450
+            ARCHIVE_UNIFIED firmware_ma2450_491.zip
+            TARGET_PATH "${TEMP}/vpu/firmware/ma2450"
+            ENVIRONMENT "VPU_FIRMWARE_MA2450"
+            FOLDER)
+    debug_message(STATUS "ma2450=" ${VPU_FIRMWARE_MA2450})
+endif ()
+
+if (ENABLE_MYRIAD)
+    RESOLVE_DEPENDENCY(VPU_FIRMWARE_MA2480
+            ARCHIVE_UNIFIED firmware_ma2480_mdk_R7_9.zip
+            TARGET_PATH "${TEMP}/vpu/firmware/ma2480"
+            ENVIRONMENT "VPU_FIRMWARE_MA2480"
+            FOLDER)
+    debug_message(STATUS "ma2480=" ${VPU_FIRMWARE_MA2480})
+endif ()
+
 ## enable cblas_gemm from OpenBLAS package
 if (GEMM STREQUAL "OPENBLAS")
 if(NOT BLAS_LIBRARIES OR NOT BLAS_INCLUDE_DIRS)
index 054988c..9498744 100644 (file)
@@ -62,6 +62,14 @@ list (APPEND IE_OPTIONS IE_DEBUG_POSTFIX)
 set(IE_RELEASE_POSTFIX "${IE_RELEASE_POSTFIX}" CACHE STRING "Release postfix" FORCE)
 list (APPEND IE_OPTIONS IE_RELEASE_POSTFIX)
 
+ie_option (ENABLE_VPU "vpu targeted plugins for inference engine" ON)
+
+ie_option (ENABLE_MYRIAD "myriad targeted plugin for inference engine" ON)
+
+ie_option (ENABLE_MYX_PCIE "myriad plugin with support PCIE device" OFF)
+
+ie_option (ENABLE_MYRIAD_NO_BOOT "myriad plugin will skip device boot" OFF)
+
 ie_option (ENABLE_TESTS "unit and functional tests" OFF)
 
 ie_option (ENABLE_GAPI_TESTS "unit tests for GAPI kernels" OFF)
index f4a1f55..84a383a 100644 (file)
@@ -8,7 +8,7 @@ This topic demonstrates how to run the Benchmark Application demo, which perform
 
 Upon the start-up, the application reads command-line parameters and loads a network and images to the Inference Engine plugin. The number of infer requests and execution approach depend on a mode defined with the `-api` command-line parameter.
 
-> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 
 ### Synchronous API
 For synchronous mode, the primary metric is latency. The application creates one infer request and executes the `Infer` method. A number of executions is defined by one of the two values:
index a4eec40..2d0d95c 100644 (file)
@@ -3,13 +3,13 @@
 This topic demonstrates how to run the Image Classification sample application, which performs
 inference using image classification networks such as AlexNet and GoogLeNet.
 
-### How It Works
+## How It Works
 
 Upon the start-up, the sample application reads command line parameters and loads a network and an image to the Inference
 Engine plugin. When inference is done, the application creates an
 output image and outputs data to the standard output stream.
 
-> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 
 ## Running
 
@@ -62,18 +62,16 @@ For example, to perform inference of an AlexNet model (previously converted to t
     python3 classification_sample.py -i <path_to_image>/cat.bmp -m <path_to_model>/alexnet_fp32.xml
 ```
 
-### Sample Output
+## Sample Output
 
 By default the application outputs top-10 inference results.
 Add the `-nt` option to the previous command to modify the number of top output results.
 For example, to get the top-5 results on GPU, run the following command:
 ```
-    python3 classification_sample.py<path_to_image>/cat.bmp -m <path_to_model>/alexnet_fp32.xml -nt 5 -d GPU
+    python3 classification_sample.py -i <path_to_image>/cat.bmp -m <path_to_model>/alexnet_fp32.xml -nt 5 -d GPU
 ```
 
 ## See Also
 * [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
 * [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md)
 * [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader)
-
-
index e121f4a..4a91142 100644 (file)
@@ -16,7 +16,7 @@ Another required aspect of good throughput is a number of iterations. Only with
 
 The batch mode is an independent attribute on the pipelined mode. Pipelined mode works efficiently with any batch size.
 
-### How It Works
+## How It Works
 
 Upon the start-up, the sample application reads command line parameters and loads a network and an image to the Inference
 Engine plugin.
@@ -26,13 +26,13 @@ Then in a loop it starts inference for the current infer request and switches to
 
 When inference is done, the application outputs data to the standard output stream.
 
-> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 
 ## Running
 
 Running the application with the <code>-h</code> option yields the following usage message:
 ```
-python3 classification_sample_async.py -h 
+python3 classification_sample_async.py -h
 ```
 The command yields the following usage message:
 ```
@@ -80,7 +80,7 @@ You can do inference on an image using a trained AlexNet network on FPGA with fa
     python3 classification_sample_async.py -i <path_to_image>/cat.bmp -m <path_to_model>/alexnet_fp32.xml -nt 5 -d HETERO:FPGA,CPU -nireq 2 -ni 200
 ```
 
-### Sample Output
+## Sample Output
 
 By default, the application outputs top-10 inference results for each infer request.
 It also provides throughput value measured in frames per seconds.
index 2c5fa61..cdca581 100644 (file)
@@ -7,7 +7,7 @@ inference of style transfer models.
 
 ## How It Works
 
-> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 
 ## Running
 
index b138d3a..5746467 100644 (file)
@@ -74,7 +74,7 @@ public:
     ConcatLayer& setAxis(size_t axis);
 
 private:
-    size_t axis;
+    size_t axis = 1;
 };
 
 }  // namespace Builder
index 370cd68..59a2012 100644 (file)
@@ -98,7 +98,7 @@ public:
     EltwiseLayer& setScales(const std::vector<float>& scales);
 
 private:
-    EltwiseType type;
+    EltwiseType type = SUM;
 };
 
 }  // namespace Builder
index 625de12..fcf58af 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index b732a49..bc36ec4 100644 (file)
@@ -161,8 +161,8 @@ public:
     PoolingLayer& setExcludePad(bool exclude);
 
 private:
-    PoolingType type;
-    RoundingType roundingType;
+    PoolingType type = MAX;
+    RoundingType roundingType = CEIL;
 };
 
 }  // namespace Builder
index 0cef8cf..e70b778 100644 (file)
@@ -44,6 +44,9 @@ public:
     const Version *GetVersion() {
         const Version *versionInfo = nullptr;
         actual->GetVersion(versionInfo);
+        if (versionInfo == nullptr) {
+            THROW_IE_EXCEPTION << "Unknown device is used";
+        }
         return versionInfo;
     }
 
index 7d77bc8..70b5050 100644 (file)
@@ -23,8 +23,9 @@ namespace details {
 template<class NT, class LT>
 class INetworkIterator: public std::iterator<std::input_iterator_tag, std::shared_ptr<LT>> {
 public:
-    explicit INetworkIterator(NT * network, bool toEnd = false): network(network), currentIdx(0) {
-        if (!network || toEnd)
+    explicit INetworkIterator(NT * network, bool toEnd): network(network), currentIdx(0) {}
+    explicit INetworkIterator(NT * network): network(network), currentIdx(0) {
+        if (!network)
             return;
         const auto& inputs = network->getInputs();
 
index 269cba2..32f3e10 100644 (file)
@@ -30,7 +30,7 @@ class SharedObjectLoader {
 private:
     HMODULE shared_object;
 
- public:
+public:
     /**
      * @brief Loads a library with the name specified. The library is loaded according to the
      *        WinAPI LoadLibrary rules
@@ -38,6 +38,20 @@ private:
      */
     explicit SharedObjectLoader(LPCTSTR pluginName) {
         char cwd[1024];
+        // Exclude the current directory from the DLL search path, process-wide.
+        // If an application-specific path was configured before, the
+        // current directory is already excluded.
+        // GetDllDirectory does not distinguish whether the application-specific
+        // path was set to "" or NULL, so reset it to "" to keep the
+        // application safe.
+        if (GetDllDirectory(0, NULL) <= 1) {
+            SetDllDirectory(
+#if defined UNICODE
+                L"");
+#else
+                "");
+#endif
+        }
         shared_object = LoadLibrary(pluginName);
         if (!shared_object) {
             THROW_IE_EXCEPTION << "Cannot load library '"
index c96a01b..8e4cf7e 100644 (file)
@@ -82,7 +82,7 @@ public:
      * @brief Constructor. Creates an empty Blob object with the specified precision.
      * @param tensorDesc Defines the layout and dims of the blob
      */
-    explicit Blob(TensorDesc tensorDesc): tensorDesc(tensorDesc) {}
+    explicit Blob(const TensorDesc &tensorDesc): tensorDesc(tensorDesc) {}
 
     /**
      * @deprecated Please use TensorDesc for Blob initialization
@@ -126,17 +126,21 @@ public:
      * @return Total number of elements (a product of all the dimensions)
      */
     size_t Resize(const SizeVector &dims, Layout layout = Layout::ANY) noexcept {
-        bool bret = deallocate();
-
-        if (layout != Layout::ANY) {
-            tensorDesc = TensorDesc(tensorDesc.getPrecision(), SizeVector(dims.rbegin(), dims.rend()), layout);
-        } else {
-            tensorDesc.setDims(SizeVector(dims.rbegin(), dims.rend()));
-        }
-        if (!bret) {
-            allocate();
+        try {
+            bool bret = deallocate();
+
+            if (layout != Layout::ANY) {
+                tensorDesc = TensorDesc(tensorDesc.getPrecision(), SizeVector(dims.rbegin(), dims.rend()), layout);
+            } else {
+                tensorDesc.setDims(SizeVector(dims.rbegin(), dims.rend()));
+            }
+            if (!bret) {
+                allocate();
+            }
+            return product(tensorDesc.getDims());
+        } catch (...) {
+            return 0;
         }
-        return product(tensorDesc.getDims());
     }
 
     /**
@@ -147,16 +151,20 @@ public:
      * @return The total number of elements (a product of all the dims)
      */
     size_t Reshape(const SizeVector &dims, Layout layout = Layout::ANY) noexcept {
-        if (product(tensorDesc.getDims()) != product(dims)) {
+        try {
+            if (product(tensorDesc.getDims()) != product(dims)) {
+                return 0;
+            }
+
+            if (layout != Layout::ANY) {
+                tensorDesc = TensorDesc(tensorDesc.getPrecision(), SizeVector(dims.rbegin(), dims.rend()), layout);
+            } else {
+                tensorDesc.setDims(SizeVector(dims.rbegin(), dims.rend()));
+            }
+            return product(tensorDesc.getDims());
+        } catch (...) {
             return 0;
         }
-
-        if (layout != Layout::ANY) {
-            tensorDesc = TensorDesc(tensorDesc.getPrecision(), SizeVector(dims.rbegin(), dims.rend()), layout);
-        } else {
-            tensorDesc.setDims(SizeVector(dims.rbegin(), dims.rend()));
-        }
-        return product(tensorDesc.getDims());
     }
 
     /**
index 6dc7c4e..778d382 100644 (file)
@@ -27,10 +27,8 @@ enum class TargetDevice : uint8_t {
     eGPU = 3,
     eFPGA = 4,
     eMYRIAD = 5,
-    eHDDL = 6,
     eGNA = 7,
     eHETERO = 8,
-    eKMB = 9,
 };
 
 /**
@@ -52,10 +50,8 @@ class TargetDeviceInfo {
             DECL_DEVICE(GPU),
             DECL_DEVICE(FPGA),
             DECL_DEVICE(MYRIAD),
-            DECL_DEVICE(HDDL),
             DECL_DEVICE(GNA),
             DECL_DEVICE(HETERO),
-            DECL_DEVICE(KMB)
         };
 #undef DECLARE
         return g_allDeviceInfos;
@@ -68,11 +64,9 @@ class TargetDeviceInfo {
             { "GPU", InferenceEngine::TargetDevice::eGPU },
             { "FPGA", InferenceEngine::TargetDevice::eFPGA },
             { "MYRIAD", InferenceEngine::TargetDevice::eMYRIAD },
-            { "HDDL", InferenceEngine::TargetDevice::eHDDL },
             { "GNA", InferenceEngine::TargetDevice::eGNA },
             { "BALANCED", InferenceEngine::TargetDevice::eBalanced },
             { "HETERO", InferenceEngine::TargetDevice::eHETERO },
-            { "KMB", InferenceEngine::TargetDevice::eKMB }
         };
         auto val = deviceFromNameMap.find(deviceName);
         return val != deviceFromNameMap.end() ? val->second : InferenceEngine::TargetDevice::eDefault;
index 3e1b9bb..c3e867e 100644 (file)
@@ -701,7 +701,7 @@ public:
     /**
     * @brief A pad value which is used to fill pad area
     */
-    float _pad_value = -1.0f;
+    float _pad_value = 0.0f;
 
     /**
      * @brief A convolution kernel array [X, Y, Z, ...]
index af72214..01c0c0e 100644 (file)
@@ -11,6 +11,8 @@
 
 #pragma once
 
+#include <cstddef>
+
 #define IE_THREAD_TBB 0
 #define IE_THREAD_OMP 1
 #define IE_THREAD_SEQ 2
@@ -70,7 +72,7 @@ inline void parallel_set_num_threads(int n) { return; }
 namespace InferenceEngine {
 
 template <typename F>
-void parallel_nt(int nthr, F func) {
+void parallel_nt(int nthr, const F &func) {
 #if IE_THREAD == IE_THREAD_TBB
     if (nthr == 0) nthr = parallel_get_max_threads();
     if (nthr == 1) {
@@ -95,7 +97,7 @@ void parallel_nt(int nthr, F func) {
 }
 
 template <typename F>
-void parallel_nt_static(int nthr, F func) {
+void parallel_nt_static(int nthr, const F &func) {
 #if IE_THREAD == IE_THREAD_SEQ
     const bool serial = true;
 #else
@@ -124,7 +126,7 @@ void parallel_nt_static(int nthr, F func) {
 }
 
 template <typename T0, typename R, typename F>
-R parallel_sum(const T0 D0, R &input, F func) {
+R parallel_sum(const T0 &D0, const R &input, const F &func) {
 #if IE_THREAD == IE_THREAD_TBB
     return tbb::parallel_reduce(
         tbb::blocked_range<T0>(0, D0), input,
@@ -157,7 +159,7 @@ R parallel_sum(const T0 D0, R &input, F func) {
 }
 
 template <typename T0, typename T1, typename R, typename F>
-R parallel_sum2d(const T0 D0, const T1 D1, R input, F func) {
+R parallel_sum2d(const T0 &D0, const T1 &D1, const R &input, const F &func) {
 #if IE_THREAD == IE_THREAD_TBB
     return tbb::parallel_reduce(
         tbb::blocked_range2d<T0, T1>(0, D0, 0, D1), input,
@@ -196,7 +198,7 @@ R parallel_sum2d(const T0 D0, const T1 D1, R input, F func) {
 #endif
 }
 template <typename T0, typename T1, typename T2, typename R, typename F>
-R parallel_sum3d(const T0 D0, const T1 D1, const T2 D2, R input, F func) {
+R parallel_sum3d(const T0 &D0, const T1 &D1, const T2 &D2, const R &input, const F &func) {
 #if IE_THREAD == IE_THREAD_TBB
     return tbb::parallel_reduce(
         tbb::blocked_range3d<T0, T1, T2>(0, D0, 0, D1, 0, D2), input,
@@ -261,7 +263,7 @@ inline bool parallel_it_step(Q &x, const R &X, Args &&... tuple) {
 }
 
 template <typename T, typename Q>
-inline void splitter(T n, Q team, Q tid, T &n_start, T &n_end) {
+inline void splitter(const T &n, const Q &team, const Q &tid, T &n_start, T &n_end) {
     if (team <= 1 || n == 0) {
         n_start = 0;
         n_end = n;
@@ -278,14 +280,14 @@ inline void splitter(T n, Q team, Q tid, T &n_start, T &n_end) {
 
 
 template <typename T0, typename F>
-void for_1d(const int ithr, const int nthr, const T0 &D0, F func) {
+void for_1d(const int &ithr, const int &nthr, const T0 &D0, const F &func) {
     T0 d0{ 0 }, end{ 0 };
     splitter(D0, nthr, ithr, d0, end);
     for (; d0 < end; ++d0) func(d0);
 }
 
 template <typename T0, typename F>
-void parallel_for(const T0 &D0, F func) {
+void parallel_for(const T0 &D0, const F &func) {
 #if IE_THREAD == IE_THREAD_TBB
     const int nthr = parallel_get_max_threads();
     tbb::parallel_for(0, nthr, [&](int ithr) {
@@ -301,7 +303,7 @@ void parallel_for(const T0 &D0, F func) {
 
 
 template <typename T0, typename T1, typename F>
-void for_2d(const int ithr, const int nthr, const T0 &D0, const T1 &D1, F func) {
+void for_2d(const int &ithr, const int &nthr, const T0 &D0, const T1 &D1, const F &func) {
     const size_t work_amount = (size_t)D0 * D1;
     if (work_amount == 0) return;
     size_t start{ 0 }, end{ 0 };
@@ -316,7 +318,7 @@ void for_2d(const int ithr, const int nthr, const T0 &D0, const T1 &D1, F func)
 }
 
 template <typename T0, typename T1, typename F>
-void parallel_for2d(const T0 &D0, const T1 &D1, F func) {
+void parallel_for2d(const T0 &D0, const T1 &D1, const F &func) {
 #if IE_THREAD == IE_THREAD_TBB
     const int nthr = parallel_get_max_threads();
     tbb::parallel_for(0, nthr, [&](int ithr) {
@@ -332,8 +334,8 @@ void parallel_for2d(const T0 &D0, const T1 &D1, F func) {
 
 
 template <typename T0, typename T1, typename T2, typename F>
-void for_3d(const int ithr, const int nthr, const T0 &D0, const T1 &D1,
-    const T2 &D2, F func) {
+void for_3d(const int &ithr, const int &nthr, const T0 &D0, const T1 &D1,
+    const T2 &D2, const F &func) {
     const size_t work_amount = (size_t)D0 * D1 * D2;
     if (work_amount == 0) return;
     size_t start{ 0 }, end{ 0 };
@@ -348,7 +350,7 @@ void for_3d(const int ithr, const int nthr, const T0 &D0, const T1 &D1,
 }
 
 template <typename T0, typename T1, typename T2, typename F>
-void parallel_for3d(const T0 &D0, const T1 &D1, const T2 &D2, F func) {
+void parallel_for3d(const T0 &D0, const T1 &D1, const T2 &D2, const F &func) {
 #if IE_THREAD == IE_THREAD_TBB
     const int nthr = parallel_get_max_threads();
     tbb::parallel_for(0, nthr, [&](int ithr) {
@@ -363,8 +365,8 @@ void parallel_for3d(const T0 &D0, const T1 &D1, const T2 &D2, F func) {
 }
 
 template <typename T0, typename T1, typename T2, typename T3, typename F>
-void for_4d(const int ithr, const int nthr, const T0 &D0, const T1 &D1,
-    const T2 &D2, const T3 &D3, F func) {
+void for_4d(const int &ithr, const int &nthr, const T0 &D0, const T1 &D1,
+    const T2 &D2, const T3 &D3, const F &func) {
     const size_t work_amount = (size_t)D0 * D1 * D2 * D3;
     if (work_amount == 0) return;
     size_t start{ 0 }, end{ 0 };
@@ -379,7 +381,7 @@ void for_4d(const int ithr, const int nthr, const T0 &D0, const T1 &D1,
 }
 
 template <typename T0, typename T1, typename T2, typename T3, typename F>
-void parallel_for4d(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3, F func) {
+void parallel_for4d(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3, const F &func) {
 #if IE_THREAD == IE_THREAD_TBB
     const int nthr = parallel_get_max_threads();
     tbb::parallel_for(0, nthr, [&](int ithr) {
@@ -394,8 +396,8 @@ void parallel_for4d(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3, F fu
 }
 
 template <typename T0, typename T1, typename T2, typename T3, typename T4, typename F>
-void for_5d(const int ithr, const int nthr, const T0 &D0, const T1 &D1,
-        const T2 &D2, const T3 &D3, const T4 &D4, F func) {
+void for_5d(const int &ithr, const int &nthr, const T0 &D0, const T1 &D1,
+        const T2 &D2, const T3 &D3, const T4 &D4, const F &func) {
     const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4;
     if (work_amount == 0) return;
     size_t start{ 0 }, end{ 0 };
@@ -411,7 +413,7 @@ void for_5d(const int ithr, const int nthr, const T0 &D0, const T1 &D1,
 
 template <typename T0, typename T1, typename T2, typename T3, typename T4, typename F>
 void parallel_for5d(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3,
-                    const T4 &D4, F func) {
+                    const T4 &D4, const F &func) {
 #if IE_THREAD == IE_THREAD_TBB
     const int nthr = parallel_get_max_threads();
     tbb::parallel_for(0, nthr, [&](int ithr) {
@@ -427,7 +429,7 @@ void parallel_for5d(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3,
 
 
 template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename F>
-void for_6d(const int ithr, const int nthr, const T0 &D0, const T1 &D1,
+void for_6d(const int &ithr, const int &nthr, const T0 &D0, const T1 &D1,
         const T2 &D2, const T3 &D3, const T4 &D4, const T5 &D5, F func) {
     const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4 * D5;
     if (work_amount == 0) return;
index 8726ae6..32f4e98 100644 (file)
@@ -83,27 +83,32 @@ public:
     /** @brief checks whether given storage class T can be used to store objects of current precision */
     template <class T>
     bool hasStorageType(const char * typeName = nullptr) const noexcept {
-        if (precisionInfo.value != BIN) {
-            if (sizeof(T) != size()) {
-                return false;
+        try {
+            if (precisionInfo.value != BIN) {
+                if (sizeof(T) != size()) {
+                    return false;
+                }
             }
-        }
 #define CASE(x, y) case x: return std::is_same<T, y>()
 #define CASE2(x, y1, y2) case x: return std::is_same<T, y1>() || std::is_same<T, y2>()
 
-        switch (precisionInfo.value) {
-            CASE(FP32, float);
-            CASE2(FP16, int16_t, uint16_t);
-            CASE(I16, int16_t);
-            CASE(I32, int32_t);
-            CASE(U16, uint16_t);
-            CASE(U8, uint8_t);
-            CASE(I8, int8_t);
-            CASE2(Q78, int16_t, uint16_t);
-            CASE2(BIN, int8_t, uint8_t);
-            default : return areSameStrings(name(), typeName == nullptr ? typeid(T).name() : typeName);
+            switch (precisionInfo.value) {
+                CASE(FP32, float);
+                CASE2(FP16, int16_t, uint16_t);
+                CASE(I16, int16_t);
+                CASE(I32, int32_t);
+                CASE(U16, uint16_t);
+                CASE(U8, uint8_t);
+                CASE(I8, int8_t);
+                CASE2(Q78, int16_t, uint16_t);
+                CASE2(BIN, int8_t, uint8_t);
+                default :
+                    return areSameStrings(name(), typeName == nullptr ? typeid(T).name() : typeName);
 #undef CASE
 #undef CASE2
+            }
+        } catch (...) {
+            return false;
         }
     }
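
A hedged usage sketch for the guarded hasStorageType above, relying only on the FP32-to-float mapping visible in the switch:

    InferenceEngine::Precision p(InferenceEngine::Precision::FP32);
    if (p.hasStorageType<float>()) {
        // safe to view the underlying storage as float*
    }
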
 
@@ -172,7 +177,7 @@ public:
 
     /**
      * @brief Returns size in bytes of single element of that precision
-     * @deprecated : size of precision will be report in bits in future releases
+     * @deprecated : size of precision will be reported in bits in future releases
      */
     size_t size() const {
         if (precisionInfo.bitsSize == 0) {
@@ -182,7 +187,7 @@ public:
     }
 
     /** @brief Checks if it is a floating point */
-    bool is_float() const {
+    bool is_float() const noexcept {
         return precisionInfo.isFloat;
     }
 
@@ -306,7 +311,7 @@ inline Precision::PrecisionInfo Precision::makePrecisionInfo(const char *name) {
     Precision::PrecisionInfo info;
     info.name = name;
 
-    int nBits = precision == BIN ? 1 : 8;
+    size_t nBits = precision == BIN ? 1 : 8;
     info.bitsSize = nBits * type_size_or_zero<typename PrecisionTrait<precision>::value_type>();
     info.isFloat = is_floating<precision>();
     info.value = precision;
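
A small illustration of the relationship set up here, assuming size() converts bitsSize back to bytes as the deprecated byte-based API above implies: for FP32, bitsSize is 8 * sizeof(float) = 32, so size() reports 4.

    InferenceEngine::Precision fp32(InferenceEngine::Precision::FP32);
    const size_t bytes = fp32.size();  // 4 today; slated to be reported in bits in future releases
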
index c6cd1e9..e0ef157 100644 (file)
 #define DECLARE_VPU_CONFIG_KEY(name) DECLARE_CONFIG_KEY(VPU_##name)
 #define DECLARE_VPU_CONFIG_VALUE(name) DECLARE_CONFIG_VALUE(VPU_##name)
 
-#define VPU_HDDL_CONFIG_KEY(name) InferenceEngine::VPUConfigParams::_CONFIG_KEY(VPU_HDDL_##name)
-#define VPU_HDDL_CONFIG_VALUE(name) InferenceEngine::VPUConfigParams::VPU_HDDL_##name
-
-#define DECLARE_VPU_HDDL_CONFIG_KEY(name) DECLARE_CONFIG_KEY(VPU_HDDL_##name)
-#define DECLARE_VPU_HDDL_CONFIG_VALUE(name) DECLARE_CONFIG_VALUE(VPU_HDDL_##name)
-
 namespace InferenceEngine {
 namespace VPUConfigParams {
 
@@ -69,92 +63,6 @@ DECLARE_VPU_CONFIG_KEY(PRINT_RECEIVE_TENSOR_TIME);
 DECLARE_VPU_CONFIG_KEY(FORCE_RESET);
 
 /**
- * @brief [Only for HDDLPlugin]
- * Type: Arbitrary non-empty string. If empty (""), the option is treated as not set; default: "";
- * This option allows specifying the number of MYX devices used for inference of a specific executable network.
- * Note: Only one network would be allocated to one device.
- * The number of devices for the tag is specified in the hddl_service.config file.
- * Example:
- * "service_settings":
- * {
- *     "graph_tag_map":
- *     {
- *         "tagA":3
- *     }
- * }
- * It means that an executable network marked with tagA will be executed on 3 devices
- */
-DECLARE_VPU_HDDL_CONFIG_KEY(GRAPH_TAG);
-
-/**
- * @brief [Only for HDDLPlugin]
- * Type: Arbitrary non-empty string. If empty (""), the option is treated as not set; default: "";
- * This config makes the executable network be allocated on one certain device (instead of multiple devices),
- * and all inference through this executable network will be done on that device.
- * Note: Only one network would be allocated to one device.
- * The number of devices which will be used for stream-affinity must be specified in hddl_service.config file.
- * Example:
- * "service_settings":
- * {
- *     "stream_device_number":5
- * }
- * It means that 5 devices will be used for stream-affinity
- */
-DECLARE_VPU_HDDL_CONFIG_KEY(STREAM_ID);
-
-/**
- * @brief [Only for HDDLPlugin]
- * Type: Arbitrary non-empty string. If empty (""), the option is treated as not set; default: "";
- * This config allows the user to control devices flexibly. It gives a "tag" to a certain device while
- * allocating a network to it. Afterward, the user can allocate/deallocate networks to this device with this "tag".
- * Devices used for such a use case are controlled by a so-called "Bypass Scheduler" in the HDDL backend, and the number
- * of such devices needs to be specified in the hddl_service.config file.
- * Example:
- * "service_settings":
- * {
- *     "bypass_device_number": 5
- * }
- * It means that 5 devices will be used for the Bypass Scheduler.
- */
-DECLARE_VPU_HDDL_CONFIG_KEY(DEVICE_TAG);
-
-/**
- * @brief [Only for HDDLPlugin]
- * Type: "YES/NO", default is "NO".
- * This config is a sub-config of DEVICE_TAG, and is only available when "DEVICE_TAG" is set. After a user loads a
- * network, the user gets a handle for the network.
- * If "YES", the allocated network is bound to the device (with the specified "DEVICE_TAG"), which means all subsequent
- * inference through this network handle will be executed on this device only.
- * If "NO", the allocated network is not bound to the device (with the specified "DEVICE_TAG"). If the same network
- * is allocated on multiple other devices (also with BIND_DEVICE set to "False"), then inference through any handle of these
- * networks may be executed on any of the devices that have the network loaded.
- */
-DECLARE_VPU_HDDL_CONFIG_KEY(BIND_DEVICE);
-
-/**
- * @brief [Only for HDDLPlugin]
- * Type: A signed int wrapped in a string, default is "0".
- * This config is a sub-config of DEVICE_TAG, and is only available when "DEVICE_TAG" is set and "BIND_DEVICE" is "False".
- * When there are multiple devices running a certain network (the same network running on multiple devices in the Bypass Scheduler),
- * the device with a larger number has a higher priority, and more inference tasks will be fed to it preferentially.
- */
-DECLARE_VPU_HDDL_CONFIG_KEY(RUNTIME_PRIORITY);
-
-
-/**
- * @brief [Only for HDDLPlugin]
- * Type: "YES/NO", default is "NO". **Note: ONLY available when "DEVICE_TAG" is set.
- * This config should be used only when the network has already been loaded with the same network content, the same
- * "DEVICE_TAG" as used this time, and "BIND_DEVICE" of the loaded network set to "NO".
- * This config is only used to update the "RUNTIME_PRIORITY" of the previously loaded network, and the application should keep using
- * the previously allocated network handle to do inference.
- * - If "Yes": the "RUNTIME_PRIORITY" must be specified with an integer, and it will be set as the new runtime priority for that network on that device.
- * - If "No": load this network to the device.
- * **Note: If "BIND_DEVICE" of the previously loaded network was "Yes", the behavior of "update runtime priority" is undefined.
- */
-DECLARE_VPU_HDDL_CONFIG_KEY(UPDATE_RUNTIME_PRIORITY);
-
-/**
 * @brief This option allows passing extra configuration for the executable network.
 * By default, it is an empty string, which means no configuration.
  * String format:
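
A hedged sketch of passing one of the remaining (non-HDDL) VPU options, assuming the 2019-era InferencePlugin API used elsewhere in this commit; the device name and value are illustrative:

    InferencePlugin plugin = PluginDispatcher({""}).getPluginByDevice("MYRIAD");
    plugin.SetConfig({{ VPU_CONFIG_KEY(FORCE_RESET), CONFIG_VALUE(YES) }});
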
index 12dfaca..4f1849f 100755 (executable)
@@ -22,7 +22,6 @@ function yes_or_no {
 # install dependencies
 if [[ -f /etc/lsb-release ]]; then
     # Ubuntu
-    system_ver=`cat /etc/lsb-release | grep -i "DISTRIB_RELEASE" | cut -d "=" -f2`
     sudo -E apt update
     sudo -E apt-get install -y \
             build-essential \
@@ -52,12 +51,12 @@ if [[ -f /etc/lsb-release ]]; then
             gstreamer1.0-plugins-base \
             libusb-1.0-0-dev \
             libopenblas-dev
-    if [ $system_ver = "18.04" ]; then
-           sudo -E apt-get install -y libpng-dev
+    if apt-cache search --names-only '^libpng12' | grep -q libpng12; then
+        sudo -E apt-get install -y libpng12-dev
     else
-           sudo -E apt-get install -y libpng12-dev
+        sudo -E apt-get install -y libpng-dev
     fi
-else
+elif [[ -f /etc/redhat-release ]]; then
     # CentOS 7.x
     sudo -E yum install -y centos-release-scl epel-release
     sudo -E yum install -y \
@@ -125,5 +124,6 @@ else
         echo "FFmpeg installation skipped. You may build FFmpeg from sources as described here: https://trac.ffmpeg.org/wiki/CompilationGuide/Centos"
         echo
     fi
-
+else
+    echo "Unknown OS, please install build dependencies manually"
 fi
\ No newline at end of file
index da00b43..4e13c11 100644 (file)
@@ -59,6 +59,11 @@ if (WIN32)
     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_SCL_SECURE_NO_WARNINGS -DNOMINMAX")
     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc") #no asynchronous structured exception handling
     set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /LARGEADDRESSAWARE")
+
+    if (TREAT_WARNING_AS_ERROR)
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /WX") #treating warnings as errors
+    endif ()
+
     if (${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC)
         set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4251 /wd4275 /wd4267") #disable some warnings
     endif()
index 23c17e4..8cba50d 100644 (file)
@@ -14,7 +14,7 @@ Upon start-up, the application reads command-line parameters and loads a network
 plugin, which is chosen depending on a specified device. The number of infer requests and execution approach depend
 on the mode defined with the `-api` command-line parameter.
 
-> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 
 If you run the application in the synchronous mode, it creates one infer request and executes the `Infer` method.
 If you run the application in the asynchronous mode, it creates as many infer requests as specified in the `-nireq`
index 3174582..56b12a5 100644 (file)
@@ -11,6 +11,7 @@
 #include <utility>
 
 #include <inference_engine.hpp>
+#include <ext_list.hpp>
 #include <format_reader_ptr.h>
 
 #include <vpu/vpu_plugin_config.hpp>
@@ -60,18 +61,6 @@ bool ParseAndCheckCommandLine(int argc, char *argv[]) {
         throw std::logic_error("Input is not set. Please use -h.");
     }
 
-    if (FLAGS_niter < 0) {
-        throw std::logic_error("Number of iterations should be positive (invalid -niter option value)");
-    }
-
-    if (FLAGS_nireq < 0) {
-        throw std::logic_error("Number of inference requests should be positive (invalid -nireq option value)");
-    }
-
-    if (FLAGS_b < 0) {
-        throw std::logic_error("Batch size should be positive (invalid -b option value)");
-    }
-
     if (!FLAGS_report_type.empty() &&
          FLAGS_report_type != noCntReport && FLAGS_report_type != medianCntReport && FLAGS_report_type != detailedCntReport) {
         std::string err = "only " + std::string(noCntReport) + "/" + std::string(medianCntReport) + "/" + std::string(detailedCntReport) +
@@ -113,12 +102,19 @@ int main(int argc, char *argv[]) {
 
         InferencePlugin plugin = PluginDispatcher({ FLAGS_pp }).getPluginByDevice(FLAGS_d);
 
-        if (!FLAGS_l.empty()) {
-            // CPU (MKLDNN) extensions is loaded as a shared library and passed as a pointer to base extension
-            const std::shared_ptr<IExtension> extension_ptr = InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(FLAGS_l);
-            plugin.AddExtension(extension_ptr);
-            slog::info << "CPU (MKLDNN) extensions is loaded " << FLAGS_l << slog::endl;
-        } else if (!FLAGS_c.empty()) {
+        if (FLAGS_d.find("CPU") != std::string::npos) {
+            // Loading default CPU extensions
+            plugin.AddExtension(std::make_shared<Extensions::Cpu::CpuExtensions>());
+
+            if (!FLAGS_l.empty()) {
+                // The CPU (MKLDNN) extension is loaded as a shared library and passed as a pointer to the base extension
+                const auto extension_ptr = InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(FLAGS_l);
+                plugin.AddExtension(extension_ptr);
+                slog::info << "CPU (MKLDNN) extension is loaded " << FLAGS_l << slog::endl;
+            }
+        }
+
+        if ((FLAGS_d.find("GPU") != std::string::npos) && !FLAGS_c.empty()) {
             // Load clDNN Extensions
             plugin.SetConfig({ {CONFIG_KEY(CONFIG_FILE), FLAGS_c} });
             slog::info << "GPU extensions is loaded " << FLAGS_c << slog::endl;
index 248d7cd..909fbcb 100644 (file)
@@ -34,7 +34,7 @@ public:
         std::string report_folder;
     };
 
-    explicit StatisticsReport(Config config) : _config(std::move(config)) {
+    explicit StatisticsReport(const Config &config) : _config(config) {
         if (_config.niter > 0) {
             _performanceCounters.reserve(_config.niter);
         }
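
A tiny sketch of the tradeoff behind this constructor change, with a stand-in Config type: by-value plus std::move lets callers move from temporaries, while const& plus a copy is the simpler choice when the struct is small and copied anyway.

    #include <utility>

    struct Config { unsigned niter = 0; };

    struct ByValue {
        explicit ByValue(Config c) : _c(std::move(c)) {}  // moves from rvalue arguments
        Config _c;
    };

    struct ByConstRef {
        explicit ByConstRef(const Config &c) : _c(c) {}   // always copies
        Config _c;
    };
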
index da9d3eb..2883873 100644 (file)
@@ -1,11 +1,13 @@
-# Calibration Tool
+# C++ Calibration Tool [DEPRECATED]
 
-Inference Engine Calibration Tool calibrates a given FP32 model so that it can be run in low-precision 8-bit integer
+> **NOTE**: OpenVINO 2019 R1 release introduced a [Python\* version of the Calibration Tool](./inference-engine/tools/calibration_tool/README.md). This is now a recommended version since it supports a larger set of topologies and datasets. The [C++ version of the Calibration Tool](./inference-engine/samples/calibration_tool/README.md) is still in the package but deprecated and will not be updated for new releases.
+
+The C++ Calibration Tool calibrates a given FP32 model so that it can be run in low-precision 8-bit integer
 mode while keeping the input data of this model in the original precision.
 
 > **NOTE**: INT8 models are currently supported only by the CPU plugin. For the full list of supported configurations, see the [Supported Devices](./docs/IE_DG/supported_plugins/Supported_Devices.md) topic.
 
-> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 
 ## Calibration Tool Options
 
index e6a00b8..5836f16 100644 (file)
@@ -33,6 +33,9 @@ CNNLayerPtr Int8Calibrator::addScaleShiftBeforeLayer(std::string name, CNNLayer:
     params.type = "ScaleShift";
     CNNLayerPtr lptr = std::make_shared<ScaleShiftLayer>(params);
     ScaleShiftLayer *pScaleShift = dynamic_cast<ScaleShiftLayer *>(lptr.get());
+    if (pScaleShift == nullptr) {
+        THROW_IE_EXCEPTION << "Layer " << lptr->name << " is not instance of ScaleShiftLayer class";
+    }
 
     SizeVector wdims({ pData->dims[2] });
 
@@ -94,10 +97,14 @@ CNNLayerPtr Int8Calibrator::addScaleShiftBeforeLayer(std::string name, CNNLayer:
 
 
 float Int8Calibrator::compare_NRMSD(InferenceEngine::Blob::Ptr res, InferenceEngine::Blob::Ptr ref) {
-    float *res_ptr = res->buffer().as<float *>();
+    auto *res_ptr = res->buffer().as<float *>();
+
+    auto *ref_ptr = ref->buffer().as<float *>();
 
-    float *ref_ptr = ref->buffer().as<float *>();
     size_t ref_size = ref->size();
+    if (ref_size == 0) {
+        throw std::logic_error("ref_size can't be equal to zero");
+    }
 
     float sum = 0;
 
@@ -111,9 +118,7 @@ float Int8Calibrator::compare_NRMSD(InferenceEngine::Blob::Ptr res, InferenceEng
         mmin = std::min(mmin, ref_ptr[i]);
         mmax = std::max(mmax, ref_ptr[i]);
     }
-    if (std::fabs(ref_size) < std::numeric_limits<double>::epsilon()) {
-        throw std::logic_error("ref_size can't be equal to zero");
-    }
+
     sum /= ref_size;
 
     sum = pow(sum, 0.5f);
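
Pulled out of the class, the metric computed above is plain NRMSD; a standalone sketch, assuming equal-sized buffers and the usual normalization by the reference range:

    #include <algorithm>
    #include <cmath>
    #include <stdexcept>
    #include <vector>

    float nrmsd(const std::vector<float> &res, const std::vector<float> &ref) {
        if (ref.empty()) throw std::logic_error("ref_size can't be equal to zero");
        float sum = 0, mmin = ref[0], mmax = ref[0];
        for (size_t i = 0; i < ref.size(); i++) {
            const float d = res[i] - ref[i];
            sum += d * d;                      // accumulate squared error
            mmin = std::min(mmin, ref[i]);
            mmax = std::max(mmax, ref[i]);
        }
        return std::sqrt(sum / ref.size()) / (mmax - mmin);
    }
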
@@ -278,6 +283,9 @@ CNNNetwork Int8Calibrator::createICNNNetworkForLayer(CNNLayer::Ptr layerToClone,
     size_t outputWidth = outputData->getTensorDesc().getDims()[3];
 
     ConvolutionLayer *pConvS = dynamic_cast<ConvolutionLayer *>(layerToClone.get());
+    if (pConvS == nullptr) {
+        THROW_IE_EXCEPTION << "Layer " << layerToClone->name << " is not instance of ConvolutionLayer class";
+    }
 
     std::string model = "<net name=\"L\" version=\"2\" batch=\"1\"><layers> "\
         "<layer name=\"" +
@@ -361,6 +369,10 @@ CNNNetwork Int8Calibrator::createICNNNetworkForLayer(CNNLayer::Ptr layerToClone,
     CNNLayerPtr convLayer;
     n.getLayerByName(layerToClone->name.c_str(), convLayer, nullptr);
     ConvolutionLayer *pConvT = dynamic_cast<ConvolutionLayer *>(convLayer.get());
+    if (pConvT == nullptr) {
+        THROW_IE_EXCEPTION << "Layer " << convLayer->name << " is not instance of ConvolutionLayer class";
+    }
+
     pConvT->_weights = pConvS->_weights;
     pConvT->_biases = pConvS->_biases;
     pConvT->blobs = pConvS->blobs;
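
The null checks added in this file all follow one shape; a hypothetical helper (the name and wording are illustrative, not part of the change) that folds them into a single call:

    template <typename T, typename U>
    T *checked_cast(U *ptr, const char *what) {
        T *result = dynamic_cast<T *>(ptr);
        if (result == nullptr) {
            THROW_IE_EXCEPTION << what << " is not an instance of the expected class";
        }
        return result;
    }
    // usage sketch: auto *conv = checked_cast<ConvolutionLayer>(layerToClone.get(), "Cloned layer");
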
index fdcfc12..d3c9737 100644 (file)
@@ -107,7 +107,7 @@ protected:
     InferenceEngine::InferRequest _inferRequestI8C;
     int _cBatch = 0;
 
-    size_t _nPictures;
+    size_t _nPictures = 0;
 
 private:
     /**
index 90ee2b0..2a63d4b 100644 (file)
@@ -425,7 +425,10 @@ int main(int argc, char *argv[]) {
            THROW_USER_EXCEPTION(2) << "Processor pointer is invalid " << FLAGS_ppType;
         }
 
-        Int8Calibrator* calibrator = dynamic_cast<Int8Calibrator*>(processor.get());
+        auto calibrator = dynamic_cast<Int8Calibrator*>(processor.get());
+        if (calibrator == nullptr) {
+            THROW_USER_EXCEPTION(2) << "processor object is not instance of Int8Calibrator class";
+        }
 
         if (netType != RawC && netType != RawOD) {
             slog::info << "Collecting accuracy metric in FP32 mode to get a baseline, collecting activation statistics" << slog::endl;
@@ -434,7 +437,10 @@ int main(int argc, char *argv[]) {
         }
         calibrator->collectFP32Statistic();
         shared_ptr<Processor::InferenceMetrics> pIMFP32 = processor->Process(FLAGS_stream_output);
-        const CalibrationMetrics* mFP32 = dynamic_cast<const CalibrationMetrics*>(pIMFP32.get());
+        const auto mFP32 = dynamic_cast<const CalibrationMetrics*>(pIMFP32.get());
+        if (mFP32 == nullptr) {
+            THROW_USER_EXCEPTION(2) << "FP32 inference metrics object is not instance of CalibrationMetrics class";
+        }
         std::cout << "  FP32 Accuracy: " << OUTPUT_FLOATING(100.0 * mFP32->AccuracyResult) << "% " << std::endl;
 
         InferenceEngine::NetworkStatsMap statMap;
@@ -450,7 +456,10 @@ int main(int argc, char *argv[]) {
                 InferenceEngine::NetworkStatsMap tmpStatMap = calibrator->getStatistic(threshold);
                 calibrator->validateInt8Config(tmpStatMap, {}, FLAGS_convert_fc);
                 shared_ptr<Processor::InferenceMetrics> pIM_I8 = processor->Process(FLAGS_stream_output);
-                const CalibrationMetrics *mI8 = dynamic_cast<const CalibrationMetrics *>(pIM_I8.get());
+                auto *mI8 = dynamic_cast<const CalibrationMetrics *>(pIM_I8.get());
+                if (mI8 == nullptr) {
+                    THROW_USER_EXCEPTION(2) << "INT8 inference metrics object is not instance of CalibrationMetrics class";
+                }
                 if (maximalAccuracy < mI8->AccuracyResult) {
                     maximalAccuracy = mI8->AccuracyResult;
                     bestThreshold = threshold;
@@ -477,7 +486,7 @@ int main(int argc, char *argv[]) {
                     orderedLayersAccuracyDrop[d.second] = d.first;
                     layersToInt8[d.first] = true;
                 }
-                std::map<float, std::string>::const_reverse_iterator it = orderedLayersAccuracyDrop.crbegin();
+                auto it = orderedLayersAccuracyDrop.crbegin();
 
                 shared_ptr<Processor::InferenceMetrics> pIM_I8;
                 const CalibrationMetrics *mI8;
@@ -537,6 +546,12 @@ int main(int argc, char *argv[]) {
             showUsage();
             return ex.list().begin()->exitCode();
         }
+    } catch (const std::exception& ex) {
+        slog::err << ex.what() << slog::endl;
+        return 1;
+    } catch (...) {
+        slog::err << "Unknown/internal exception happened." << slog::endl;
+        return 1;
     }
     return 0;
 }
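
The handlers added above form the standard top-level guard; in isolation, with a hypothetical run() standing in for the tool's body:

    #include <exception>
    #include <iostream>

    int run();  // hypothetical workload that may throw

    int main() {
        try {
            return run();
        } catch (const std::exception &ex) {
            std::cerr << ex.what() << std::endl;
            return 1;
        } catch (...) {
            std::cerr << "Unknown/internal exception happened." << std::endl;
            return 1;
        }
    }
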
index 348e90f..9c22dc8 100644 (file)
@@ -11,7 +11,7 @@ Upon the start-up, the sample application reads command line parameters and load
 Engine plugin. When inference is done, the application creates an
 output image and outputs data to the standard output stream.
 
-> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 
 ## Running
 Running the application with the `-h` option yields the following usage message:
index e5feedf..2afc8fd 100644 (file)
@@ -29,7 +29,7 @@ Then in a loop it starts inference for the current infer request and switches to
 
 When inference is done, the application outputs data to the standard output stream.
 
-> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 
 ## Running
 
index bb479b7..ae841f6 100644 (file)
@@ -11,7 +11,7 @@ Please refer to [Object Detection for SSD Demo](./inference-engine/samples/objec
 [Security Barrier Camera Demo](./inference-engine/samples/security_barrier_camera_demo/README.md), or
 [Crossroad Camera Demo](./inference-engine/samples/crossroad_camera_demo/README.md) with an example of using the new crop ROI API.
 
-> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 
 ## Running
 
index fd8d35b..261b883 100644 (file)
 The sample is a simplified version of [Image Classification Sample](./inference-en
 It demonstrates how to use the new Infer Request API of Inference Engine in applications. Refer to
 [Integrate the Inference Engine New Request API with Your Application](./docs/IE_DG/Integrate_with_customer_application_new_API.md) for details.
 
-> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 
 ## Running
 
index 0f3846e..e45be9d 100644 (file)
@@ -3,7 +3,7 @@
 This topic demonstrates how to run the Hello Shape Infer SSD application, which does inference using object detection
 networks like SSD-VGG. The sample shows how to use [Shape Inference feature](./docs/IE_DG/ShapeInference.md).
 
-> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 
 ## Running
 
index 6ba3d1b..1cb1961 100644 (file)
@@ -13,7 +13,7 @@ When inference is done, the application outputs inference results to the standar
 
 > **NOTE**: This sample is implemented to support models with FP32 weights only.
 
-> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 
 ## Running
 
index a8db1a8..bf8b964 100644 (file)
@@ -9,7 +9,7 @@ Upon the start-up the sample application reads command line parameters and loads
 Engine plugin. When inference is done, the application creates an
 output image and outputs data to the standard output stream.
 
-> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 
 ## Running
 
index 0c062c6..f32ed55 100644 (file)
@@ -100,7 +100,6 @@ static std::map<std::string, std::string> parseConfig(const std::string &configN
 static std::size_t getNumberRequests(const std::string &plugin) {
     static const std::unordered_map<std::string, std::size_t> supported_plugins = {
         { "MYRIAD", 4   },
-        { "HDDL",   100 },
         { "FPGA",   3   },
     };
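
With the HDDL entry gone, the lookup reduces to the usual find-with-fallback pattern; a condensed sketch (the fallback value is an assumption, not taken from the excerpt):

    #include <string>
    #include <unordered_map>

    std::size_t numRequestsFor(const std::string &plugin) {
        static const std::unordered_map<std::string, std::size_t> supported = {
            {"MYRIAD", 4}, {"FPGA", 3},
        };
        const auto it = supported.find(plugin);
        return it == supported.end() ? 1 : it->second;  // assumed default of 1
    }
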
 
diff --git a/inference-engine/samples/sample_data/car.png b/inference-engine/samples/sample_data/car.png
new file mode 100644 (file)
index 0000000..f22d8d6
Binary files /dev/null and b/inference-engine/samples/sample_data/car.png differ
diff --git a/inference-engine/samples/sample_data/squeezenet1.1.labels b/inference-engine/samples/sample_data/squeezenet1.1.labels
new file mode 100644 (file)
index 0000000..a509c00
--- /dev/null
@@ -0,0 +1,1000 @@
+tench, Tinca tinca
+goldfish, Carassius auratus
+great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias
+tiger shark, Galeocerdo cuvieri
+hammerhead, hammerhead shark
+electric ray, crampfish, numbfish, torpedo
+stingray
+cock
+hen
+ostrich, Struthio camelus
+brambling, Fringilla montifringilla
+goldfinch, Carduelis carduelis
+house finch, linnet, Carpodacus mexicanus
+junco, snowbird
+indigo bunting, indigo finch, indigo bird, Passerina cyanea
+robin, American robin, Turdus migratorius
+bulbul
+jay
+magpie
+chickadee
+water ouzel, dipper
+kite
+bald eagle, American eagle, Haliaeetus leucocephalus
+vulture
+great grey owl, great gray owl, Strix nebulosa
+European fire salamander, Salamandra salamandra
+common newt, Triturus vulgaris
+eft
+spotted salamander, Ambystoma maculatum
+axolotl, mud puppy, Ambystoma mexicanum
+bullfrog, Rana catesbeiana
+tree frog, tree-frog
+tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui
+loggerhead, loggerhead turtle, Caretta caretta
+leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea
+mud turtle
+terrapin
+box turtle, box tortoise
+banded gecko
+common iguana, iguana, Iguana iguana
+American chameleon, anole, Anolis carolinensis
+whiptail, whiptail lizard
+agama
+frilled lizard, Chlamydosaurus kingi
+alligator lizard
+Gila monster, Heloderma suspectum
+green lizard, Lacerta viridis
+African chameleon, Chamaeleo chamaeleon
+Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis
+African crocodile, Nile crocodile, Crocodylus niloticus
+American alligator, Alligator mississipiensis
+triceratops
+thunder snake, worm snake, Carphophis amoenus
+ringneck snake, ring-necked snake, ring snake
+hognose snake, puff adder, sand viper
+green snake, grass snake
+king snake, kingsnake
+garter snake, grass snake
+water snake
+vine snake
+night snake, Hypsiglena torquata
+boa constrictor, Constrictor constrictor
+rock python, rock snake, Python sebae
+Indian cobra, Naja naja
+green mamba
+sea snake
+horned viper, cerastes, sand viper, horned asp, Cerastes cornutus
+diamondback, diamondback rattlesnake, Crotalus adamanteus
+sidewinder, horned rattlesnake, Crotalus cerastes
+trilobite
+harvestman, daddy longlegs, Phalangium opilio
+scorpion
+black and gold garden spider, Argiope aurantia
+barn spider, Araneus cavaticus
+garden spider, Aranea diademata
+black widow, Latrodectus mactans
+tarantula
+wolf spider, hunting spider
+tick
+centipede
+black grouse
+ptarmigan
+ruffed grouse, partridge, Bonasa umbellus
+prairie chicken, prairie grouse, prairie fowl
+peacock
+quail
+partridge
+African grey, African gray, Psittacus erithacus
+macaw
+sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita
+lorikeet
+coucal
+bee eater
+hornbill
+hummingbird
+jacamar
+toucan
+drake
+red-breasted merganser, Mergus serrator
+goose
+black swan, Cygnus atratus
+tusker
+echidna, spiny anteater, anteater
+platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus
+wallaby, brush kangaroo
+koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus
+wombat
+jellyfish
+sea anemone, anemone
+brain coral
+flatworm, platyhelminth
+nematode, nematode worm, roundworm
+conch
+snail
+slug
+sea slug, nudibranch
+chiton, coat-of-mail shell, sea cradle, polyplacophore
+chambered nautilus, pearly nautilus, nautilus
+Dungeness crab, Cancer magister
+rock crab, Cancer irroratus
+fiddler crab
+king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica
+American lobster, Northern lobster, Maine lobster, Homarus americanus
+spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish
+crayfish, crawfish, crawdad, crawdaddy
+hermit crab
+isopod
+white stork, Ciconia ciconia
+black stork, Ciconia nigra
+spoonbill
+flamingo
+little blue heron, Egretta caerulea
+American egret, great white heron, Egretta albus
+bittern
+crane
+limpkin, Aramus pictus
+European gallinule, Porphyrio porphyrio
+American coot, marsh hen, mud hen, water hen, Fulica americana
+bustard
+ruddy turnstone, Arenaria interpres
+red-backed sandpiper, dunlin, Erolia alpina
+redshank, Tringa totanus
+dowitcher
+oystercatcher, oyster catcher
+pelican
+king penguin, Aptenodytes patagonica
+albatross, mollymawk
+grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus
+killer whale, killer, orca, grampus, sea wolf, Orcinus orca
+dugong, Dugong dugon
+sea lion
+Chihuahua
+Japanese spaniel
+Maltese dog, Maltese terrier, Maltese
+Pekinese, Pekingese, Peke
+Shih-Tzu
+Blenheim spaniel
+papillon
+toy terrier
+Rhodesian ridgeback
+Afghan hound, Afghan
+basset, basset hound
+beagle
+bloodhound, sleuthhound
+bluetick
+black-and-tan coonhound
+Walker hound, Walker foxhound
+English foxhound
+redbone
+borzoi, Russian wolfhound
+Irish wolfhound
+Italian greyhound
+whippet
+Ibizan hound, Ibizan Podenco
+Norwegian elkhound, elkhound
+otterhound, otter hound
+Saluki, gazelle hound
+Scottish deerhound, deerhound
+Weimaraner
+Staffordshire bullterrier, Staffordshire bull terrier
+American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier
+Bedlington terrier
+Border terrier
+Kerry blue terrier
+Irish terrier
+Norfolk terrier
+Norwich terrier
+Yorkshire terrier
+wire-haired fox terrier
+Lakeland terrier
+Sealyham terrier, Sealyham
+Airedale, Airedale terrier
+cairn, cairn terrier
+Australian terrier
+Dandie Dinmont, Dandie Dinmont terrier
+Boston bull, Boston terrier
+miniature schnauzer
+giant schnauzer
+standard schnauzer
+Scotch terrier, Scottish terrier, Scottie
+Tibetan terrier, chrysanthemum dog
+silky terrier, Sydney silky
+soft-coated wheaten terrier
+West Highland white terrier
+Lhasa, Lhasa apso
+flat-coated retriever
+curly-coated retriever
+golden retriever
+Labrador retriever
+Chesapeake Bay retriever
+German short-haired pointer
+vizsla, Hungarian pointer
+English setter
+Irish setter, red setter
+Gordon setter
+Brittany spaniel
+clumber, clumber spaniel
+English springer, English springer spaniel
+Welsh springer spaniel
+cocker spaniel, English cocker spaniel, cocker
+Sussex spaniel
+Irish water spaniel
+kuvasz
+schipperke
+groenendael
+malinois
+briard
+kelpie
+komondor
+Old English sheepdog, bobtail
+Shetland sheepdog, Shetland sheep dog, Shetland
+collie
+Border collie
+Bouvier des Flandres, Bouviers des Flandres
+Rottweiler
+German shepherd, German shepherd dog, German police dog, alsatian
+Doberman, Doberman pinscher
+miniature pinscher
+Greater Swiss Mountain dog
+Bernese mountain dog
+Appenzeller
+EntleBucher
+boxer
+bull mastiff
+Tibetan mastiff
+French bulldog
+Great Dane
+Saint Bernard, St Bernard
+Eskimo dog, husky
+malamute, malemute, Alaskan malamute
+Siberian husky
+dalmatian, coach dog, carriage dog
+affenpinscher, monkey pinscher, monkey dog
+basenji
+pug, pug-dog
+Leonberg
+Newfoundland, Newfoundland dog
+Great Pyrenees
+Samoyed, Samoyede
+Pomeranian
+chow, chow chow
+keeshond
+Brabancon griffon
+Pembroke, Pembroke Welsh corgi
+Cardigan, Cardigan Welsh corgi
+toy poodle
+miniature poodle
+standard poodle
+Mexican hairless
+timber wolf, grey wolf, gray wolf, Canis lupus
+white wolf, Arctic wolf, Canis lupus tundrarum
+red wolf, maned wolf, Canis rufus, Canis niger
+coyote, prairie wolf, brush wolf, Canis latrans
+dingo, warrigal, warragal, Canis dingo
+dhole, Cuon alpinus
+African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus
+hyena, hyaena
+red fox, Vulpes vulpes
+kit fox, Vulpes macrotis
+Arctic fox, white fox, Alopex lagopus
+grey fox, gray fox, Urocyon cinereoargenteus
+tabby, tabby cat
+tiger cat
+Persian cat
+Siamese cat, Siamese
+Egyptian cat
+cougar, puma, catamount, mountain lion, painter, panther, Felis concolor
+lynx, catamount
+leopard, Panthera pardus
+snow leopard, ounce, Panthera uncia
+jaguar, panther, Panthera onca, Felis onca
+lion, king of beasts, Panthera leo
+tiger, Panthera tigris
+cheetah, chetah, Acinonyx jubatus
+brown bear, bruin, Ursus arctos
+American black bear, black bear, Ursus americanus, Euarctos americanus
+ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus
+sloth bear, Melursus ursinus, Ursus ursinus
+mongoose
+meerkat, mierkat
+tiger beetle
+ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle
+ground beetle, carabid beetle
+long-horned beetle, longicorn, longicorn beetle
+leaf beetle, chrysomelid
+dung beetle
+rhinoceros beetle
+weevil
+fly
+bee
+ant, emmet, pismire
+grasshopper, hopper
+cricket
+walking stick, walkingstick, stick insect
+cockroach, roach
+mantis, mantid
+cicada, cicala
+leafhopper
+lacewing, lacewing fly
+dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk
+damselfly
+admiral
+ringlet, ringlet butterfly
+monarch, monarch butterfly, milkweed butterfly, Danaus plexippus
+cabbage butterfly
+sulphur butterfly, sulfur butterfly
+lycaenid, lycaenid butterfly
+starfish, sea star
+sea urchin
+sea cucumber, holothurian
+wood rabbit, cottontail, cottontail rabbit
+hare
+Angora, Angora rabbit
+hamster
+porcupine, hedgehog
+fox squirrel, eastern fox squirrel, Sciurus niger
+marmot
+beaver
+guinea pig, Cavia cobaya
+sorrel
+zebra
+hog, pig, grunter, squealer, Sus scrofa
+wild boar, boar, Sus scrofa
+warthog
+hippopotamus, hippo, river horse, Hippopotamus amphibius
+ox
+water buffalo, water ox, Asiatic buffalo, Bubalus bubalis
+bison
+ram, tup
+bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis
+ibex, Capra ibex
+hartebeest
+impala, Aepyceros melampus
+gazelle
+Arabian camel, dromedary, Camelus dromedarius
+llama
+weasel
+mink
+polecat, fitch, foulmart, foumart, Mustela putorius
+black-footed ferret, ferret, Mustela nigripes
+otter
+skunk, polecat, wood pussy
+badger
+armadillo
+three-toed sloth, ai, Bradypus tridactylus
+orangutan, orang, orangutang, Pongo pygmaeus
+gorilla, Gorilla gorilla
+chimpanzee, chimp, Pan troglodytes
+gibbon, Hylobates lar
+siamang, Hylobates syndactylus, Symphalangus syndactylus
+guenon, guenon monkey
+patas, hussar monkey, Erythrocebus patas
+baboon
+macaque
+langur
+colobus, colobus monkey
+proboscis monkey, Nasalis larvatus
+marmoset
+capuchin, ringtail, Cebus capucinus
+howler monkey, howler
+titi, titi monkey
+spider monkey, Ateles geoffroyi
+squirrel monkey, Saimiri sciureus
+Madagascar cat, ring-tailed lemur, Lemur catta
+indri, indris, Indri indri, Indri brevicaudatus
+Indian elephant, Elephas maximus
+African elephant, Loxodonta africana
+lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens
+giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca
+barracouta, snoek
+eel
+coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch
+rock beauty, Holocanthus tricolor
+anemone fish
+sturgeon
+gar, garfish, garpike, billfish, Lepisosteus osseus
+lionfish
+puffer, pufferfish, blowfish, globefish
+abacus
+abaya
+academic gown, academic robe, judge's robe
+accordion, piano accordion, squeeze box
+acoustic guitar
+aircraft carrier, carrier, flattop, attack aircraft carrier
+airliner
+airship, dirigible
+altar
+ambulance
+amphibian, amphibious vehicle
+analog clock
+apiary, bee house
+apron
+ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin
+assault rifle, assault gun
+backpack, back pack, knapsack, packsack, rucksack, haversack
+bakery, bakeshop, bakehouse
+balance beam, beam
+balloon
+ballpoint, ballpoint pen, ballpen, Biro
+Band Aid
+banjo
+bannister, banister, balustrade, balusters, handrail
+barbell
+barber chair
+barbershop
+barn
+barometer
+barrel, cask
+barrow, garden cart, lawn cart, wheelbarrow
+baseball
+basketball
+bassinet
+bassoon
+bathing cap, swimming cap
+bath towel
+bathtub, bathing tub, bath, tub
+beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon
+beacon, lighthouse, beacon light, pharos
+beaker
+bearskin, busby, shako
+beer bottle
+beer glass
+bell cote, bell cot
+bib
+bicycle-built-for-two, tandem bicycle, tandem
+bikini, two-piece
+binder, ring-binder
+binoculars, field glasses, opera glasses
+birdhouse
+boathouse
+bobsled, bobsleigh, bob
+bolo tie, bolo, bola tie, bola
+bonnet, poke bonnet
+bookcase
+bookshop, bookstore, bookstall
+bottlecap
+bow
+bow tie, bow-tie, bowtie
+brass, memorial tablet, plaque
+brassiere, bra, bandeau
+breakwater, groin, groyne, mole, bulwark, seawall, jetty
+breastplate, aegis, egis
+broom
+bucket, pail
+buckle
+bulletproof vest
+bullet train, bullet
+butcher shop, meat market
+cab, hack, taxi, taxicab
+caldron, cauldron
+candle, taper, wax light
+cannon
+canoe
+can opener, tin opener
+cardigan
+car mirror
+carousel, carrousel, merry-go-round, roundabout, whirligig
+carpenter's kit, tool kit
+carton
+car wheel
+cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM
+cassette
+cassette player
+castle
+catamaran
+CD player
+cello, violoncello
+cellular telephone, cellular phone, cellphone, cell, mobile phone
+chain
+chainlink fence
+chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour
+chain saw, chainsaw
+chest
+chiffonier, commode
+chime, bell, gong
+china cabinet, china closet
+Christmas stocking
+church, church building
+cinema, movie theater, movie theatre, movie house, picture palace
+cleaver, meat cleaver, chopper
+cliff dwelling
+cloak
+clog, geta, patten, sabot
+cocktail shaker
+coffee mug
+coffeepot
+coil, spiral, volute, whorl, helix
+combination lock
+computer keyboard, keypad
+confectionery, confectionary, candy store
+container ship, containership, container vessel
+convertible
+corkscrew, bottle screw
+cornet, horn, trumpet, trump
+cowboy boot
+cowboy hat, ten-gallon hat
+cradle
+crane
+crash helmet
+crate
+crib, cot
+Crock Pot
+croquet ball
+crutch
+cuirass
+dam, dike, dyke
+desk
+desktop computer
+dial telephone, dial phone
+diaper, nappy, napkin
+digital clock
+digital watch
+dining table, board
+dishrag, dishcloth
+dishwasher, dish washer, dishwashing machine
+disk brake, disc brake
+dock, dockage, docking facility
+dogsled, dog sled, dog sleigh
+dome
+doormat, welcome mat
+drilling platform, offshore rig
+drum, membranophone, tympan
+drumstick
+dumbbell
+Dutch oven
+electric fan, blower
+electric guitar
+electric locomotive
+entertainment center
+envelope
+espresso maker
+face powder
+feather boa, boa
+file, file cabinet, filing cabinet
+fireboat
+fire engine, fire truck
+fire screen, fireguard
+flagpole, flagstaff
+flute, transverse flute
+folding chair
+football helmet
+forklift
+fountain
+fountain pen
+four-poster
+freight car
+French horn, horn
+frying pan, frypan, skillet
+fur coat
+garbage truck, dustcart
+gasmask, respirator, gas helmet
+gas pump, gasoline pump, petrol pump, island dispenser
+goblet
+go-kart
+golf ball
+golfcart, golf cart
+gondola
+gong, tam-tam
+gown
+grand piano, grand
+greenhouse, nursery, glasshouse
+grille, radiator grille
+grocery store, grocery, food market, market
+guillotine
+hair slide
+hair spray
+half track
+hammer
+hamper
+hand blower, blow dryer, blow drier, hair dryer, hair drier
+hand-held computer, hand-held microcomputer
+handkerchief, hankie, hanky, hankey
+hard disc, hard disk, fixed disk
+harmonica, mouth organ, harp, mouth harp
+harp
+harvester, reaper
+hatchet
+holster
+home theater, home theatre
+honeycomb
+hook, claw
+hoopskirt, crinoline
+horizontal bar, high bar
+horse cart, horse-cart
+hourglass
+iPod
+iron, smoothing iron
+jack-o'-lantern
+jean, blue jean, denim
+jeep, landrover
+jersey, T-shirt, tee shirt
+jigsaw puzzle
+jinrikisha, ricksha, rickshaw
+joystick
+kimono
+knee pad
+knot
+lab coat, laboratory coat
+ladle
+lampshade, lamp shade
+laptop, laptop computer
+lawn mower, mower
+lens cap, lens cover
+letter opener, paper knife, paperknife
+library
+lifeboat
+lighter, light, igniter, ignitor
+limousine, limo
+liner, ocean liner
+lipstick, lip rouge
+Loafer
+lotion
+loudspeaker, speaker, speaker unit, loudspeaker system, speaker system
+loupe, jeweler's loupe
+lumbermill, sawmill
+magnetic compass
+mailbag, postbag
+mailbox, letter box
+maillot
+maillot, tank suit
+manhole cover
+maraca
+marimba, xylophone
+mask
+matchstick
+maypole
+maze, labyrinth
+measuring cup
+medicine chest, medicine cabinet
+megalith, megalithic structure
+microphone, mike
+microwave, microwave oven
+military uniform
+milk can
+minibus
+miniskirt, mini
+minivan
+missile
+mitten
+mixing bowl
+mobile home, manufactured home
+Model T
+modem
+monastery
+monitor
+moped
+mortar
+mortarboard
+mosque
+mosquito net
+motor scooter, scooter
+mountain bike, all-terrain bike, off-roader
+mountain tent
+mouse, computer mouse
+mousetrap
+moving van
+muzzle
+nail
+neck brace
+necklace
+nipple
+notebook, notebook computer
+obelisk
+oboe, hautboy, hautbois
+ocarina, sweet potato
+odometer, hodometer, mileometer, milometer
+oil filter
+organ, pipe organ
+oscilloscope, scope, cathode-ray oscilloscope, CRO
+overskirt
+oxcart
+oxygen mask
+packet
+paddle, boat paddle
+paddlewheel, paddle wheel
+padlock
+paintbrush
+pajama, pyjama, pj's, jammies
+palace
+panpipe, pandean pipe, syrinx
+paper towel
+parachute, chute
+parallel bars, bars
+park bench
+parking meter
+passenger car, coach, carriage
+patio, terrace
+pay-phone, pay-station
+pedestal, plinth, footstall
+pencil box, pencil case
+pencil sharpener
+perfume, essence
+Petri dish
+photocopier
+pick, plectrum, plectron
+pickelhaube
+picket fence, paling
+pickup, pickup truck
+pier
+piggy bank, penny bank
+pill bottle
+pillow
+ping-pong ball
+pinwheel
+pirate, pirate ship
+pitcher, ewer
+plane, carpenter's plane, woodworking plane
+planetarium
+plastic bag
+plate rack
+plow, plough
+plunger, plumber's helper
+Polaroid camera, Polaroid Land camera
+pole
+police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria
+poncho
+pool table, billiard table, snooker table
+pop bottle, soda bottle
+pot, flowerpot
+potter's wheel
+power drill
+prayer rug, prayer mat
+printer
+prison, prison house
+projectile, missile
+projector
+puck, hockey puck
+punching bag, punch bag, punching ball, punchball
+purse
+quill, quill pen
+quilt, comforter, comfort, puff
+racer, race car, racing car
+racket, racquet
+radiator
+radio, wireless
+radio telescope, radio reflector
+rain barrel
+recreational vehicle, RV, R.V.
+reel
+reflex camera
+refrigerator, icebox
+remote control, remote
+restaurant, eating house, eating place, eatery
+revolver, six-gun, six-shooter
+rifle
+rocking chair, rocker
+rotisserie
+rubber eraser, rubber, pencil eraser
+rugby ball
+rule, ruler
+running shoe
+safe
+safety pin
+saltshaker, salt shaker
+sandal
+sarong
+sax, saxophone
+scabbard
+scale, weighing machine
+school bus
+schooner
+scoreboard
+screen, CRT screen
+screw
+screwdriver
+seat belt, seatbelt
+sewing machine
+shield, buckler
+shoe shop, shoe-shop, shoe store
+shoji
+shopping basket
+shopping cart
+shovel
+shower cap
+shower curtain
+ski
+ski mask
+sleeping bag
+slide rule, slipstick
+sliding door
+slot, one-armed bandit
+snorkel
+snowmobile
+snowplow, snowplough
+soap dispenser
+soccer ball
+sock
+solar dish, solar collector, solar furnace
+sombrero
+soup bowl
+space bar
+space heater
+space shuttle
+spatula
+speedboat
+spider web, spider's web
+spindle
+sports car, sport car
+spotlight, spot
+stage
+steam locomotive
+steel arch bridge
+steel drum
+stethoscope
+stole
+stone wall
+stopwatch, stop watch
+stove
+strainer
+streetcar, tram, tramcar, trolley, trolley car
+stretcher
+studio couch, day bed
+stupa, tope
+submarine, pigboat, sub, U-boat
+suit, suit of clothes
+sundial
+sunglass
+sunglasses, dark glasses, shades
+sunscreen, sunblock, sun blocker
+suspension bridge
+swab, swob, mop
+sweatshirt
+swimming trunks, bathing trunks
+swing
+switch, electric switch, electrical switch
+syringe
+table lamp
+tank, army tank, armored combat vehicle, armoured combat vehicle
+tape player
+teapot
+teddy, teddy bear
+television, television system
+tennis ball
+thatch, thatched roof
+theater curtain, theatre curtain
+thimble
+thresher, thrasher, threshing machine
+throne
+tile roof
+toaster
+tobacco shop, tobacconist shop, tobacconist
+toilet seat
+torch
+totem pole
+tow truck, tow car, wrecker
+toyshop
+tractor
+trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi
+tray
+trench coat
+tricycle, trike, velocipede
+trimaran
+tripod
+triumphal arch
+trolleybus, trolley coach, trackless trolley
+trombone
+tub, vat
+turnstile
+typewriter keyboard
+umbrella
+unicycle, monocycle
+upright, upright piano
+vacuum, vacuum cleaner
+vase
+vault
+velvet
+vending machine
+vestment
+viaduct
+violin, fiddle
+volleyball
+waffle iron
+wall clock
+wallet, billfold, notecase, pocketbook
+wardrobe, closet, press
+warplane, military plane
+washbasin, handbasin, washbowl, lavabo, wash-hand basin
+washer, automatic washer, washing machine
+water bottle
+water jug
+water tower
+whiskey jug
+whistle
+wig
+window screen
+window shade
+Windsor tie
+wine bottle
+wing
+wok
+wooden spoon
+wool, woolen, woollen
+worm fence, snake fence, snake-rail fence, Virginia fence
+wreck
+yawl
+yurt
+web site, website, internet site, site
+comic book
+crossword puzzle, crossword
+street sign
+traffic light, traffic signal, stoplight
+book jacket, dust cover, dust jacket, dust wrapper
+menu
+plate
+guacamole
+consomme
+hot pot, hotpot
+trifle
+ice cream, icecream
+ice lolly, lolly, lollipop, popsicle
+French loaf
+bagel, beigel
+pretzel
+cheeseburger
+hotdog, hot dog, red hot
+mashed potato
+head cabbage
+broccoli
+cauliflower
+zucchini, courgette
+spaghetti squash
+acorn squash
+butternut squash
+cucumber, cuke
+artichoke, globe artichoke
+bell pepper
+cardoon
+mushroom
+Granny Smith
+strawberry
+orange
+lemon
+fig
+pineapple, ananas
+banana
+jackfruit, jak, jack
+custard apple
+pomegranate
+hay
+carbonara
+chocolate sauce, chocolate syrup
+dough
+meat loaf, meatloaf
+pizza, pizza pie
+potpie
+burrito
+red wine
+espresso
+cup
+eggnog
+alp
+bubble
+cliff, drop, drop-off
+coral reef
+geyser
+lakeside, lakeshore
+promontory, headland, head, foreland
+sandbar, sand bar
+seashore, coast, seacoast, sea-coast
+valley, vale
+volcano
+ballplayer, baseball player
+groom, bridegroom
+scuba diver
+rapeseed
+daisy
+yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum
+corn
+acorn
+hip, rose hip, rosehip
+buckeye, horse chestnut, conker
+coral fungus
+agaric
+gyromitra
+stinkhorn, carrion fungus
+earthstar
+hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa
+bolete
+ear, spike, capitulum
+toilet tissue, toilet paper, bathroom tissue
index 4b7115a..a123f4c 100644 (file)
@@ -810,7 +810,7 @@ int main(int argc, char *argv[]) {
                                 inputFrame,
                                 inputBlob->byteSize());
 
-                    auto index = frameIndex - 2 * FLAGS_cw;
+                    int index = static_cast<int>(frameIndex) - 2 * FLAGS_cw;
                     inferRequest.inferRequest.StartAsync();
                     inferRequest.frameIndex = index < 0 ? -2 : index;
                     inferRequest.numFramesThisBatch = numFramesThisBatch;
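
Why the static_cast above matters, in isolation (assuming frameIndex is unsigned, which the cast suggests): subtraction in size_t arithmetic wraps around instead of going negative.

    #include <cstddef>

    int adjustedIndex(size_t frameIndex, int cw) {
        // size_t bad = frameIndex - 2 * cw;               // unsigned arithmetic: wraps to a huge value
        return static_cast<int>(frameIndex) - 2 * cw;      // e.g. (0, 3) -> -6, as intended
    }
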
index 7a033f8..dc5c125 100644 (file)
@@ -139,7 +139,7 @@ DEFINE_int32(bs, 1, batch_size_message);
 /// @brief Number of threads to use for inference on the CPU (also affects Hetero cases)
 DEFINE_int32(nthreads, 1, infer_num_threads_message);
 
-/// @brief Batch size (default 0)
+/// @brief Context window size (default 0)
 DEFINE_int32(cw, 0, context_window_message);
 
 /**
index a192a3c..f2e1c87 100644 (file)
@@ -5,7 +5,7 @@ inference of style transfer models.
 
 > **NOTE**: The OpenVINO™ toolkit does not include a pre-trained model to run the Neural Style Transfer sample. A public model from the [Zhaw's Neural Style Transfer repository](https://github.com/zhaw/neural_style) can be used. Read the [Converting a Style Transfer Model from MXNet*](./docs/MO_DG/prepare_model/convert_model/mxnet_specific/Convert_Style_Transfer_From_MXNet.md) topic from the [Model Optimizer Developer Guide](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) to learn how to get the trained model and how to convert it to the Inference Engine format (\*.xml + \*.bin).
 
-> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 
 ## Running
 
diff --git a/inference-engine/samples/thirdparty/gflags/doc/.nojekyll b/inference-engine/samples/thirdparty/gflags/doc/.nojekyll
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/inference-engine/samples/thirdparty/gflags/doc/designstyle.css b/inference-engine/samples/thirdparty/gflags/doc/designstyle.css
new file mode 100644 (file)
index 0000000..f5d1ec2
--- /dev/null
@@ -0,0 +1,115 @@
+body {
+  background-color: #ffffff;
+  color: black;
+  margin-right: 1in;
+  margin-left: 1in;
+}
+
+
+h1, h2, h3, h4, h5, h6 {
+  color: #3366ff;
+  font-family: sans-serif;
+}
+@media print {
+  /* Darker version for printing */
+  h1, h2, h3, h4, h5, h6 {
+    color: #000080;
+    font-family: helvetica, sans-serif;
+  }
+}
+
+h1 { 
+  text-align: center;
+  font-size: 18pt;
+}
+h2 {
+  margin-left: -0.5in;
+}
+h3 {
+  margin-left: -0.25in;
+}
+h4 {
+  margin-left: -0.125in;
+}
+hr {
+  margin-left: -1in;
+}
+
+/* Definition lists: definition term bold */
+dt {
+  font-weight: bold;
+}
+
+address {
+  text-align: right;
+}
+/* Use the <code> tag for bits of code and <var> for variables and objects. */
+code,pre,samp,var {
+  color: #006000;
+}
+/* Use the <file> tag for file and directory paths and names. */
+file {
+  color: #905050;
+  font-family: monospace;
+}
+/* Use the <kbd> tag for stuff the user should type. */
+kbd {
+  color: #600000;
+}
+div.note p {
+  float: right;
+  width: 3in;
+  margin-right: 0%;
+  padding: 1px;
+  border: 2px solid #6060a0;
+  background-color: #fffff0;
+}
+
+UL.nobullets {
+  list-style-type: none;
+  list-style-image: none;
+  margin-left: -1em;
+}
+
+/*
+body:after {
+  content: "Google Confidential";
+}
+*/
+
+/* pretty printing styles.  See prettify.js */
+.str { color: #080; }
+.kwd { color: #008; }
+.com { color: #800; }
+.typ { color: #606; }
+.lit { color: #066; }
+.pun { color: #660; }
+.pln { color: #000; }
+.tag { color: #008; }
+.atn { color: #606; }
+.atv { color: #080; }
+pre.prettyprint { padding: 2px; border: 1px solid #888; }
+
+.embsrc { background: #eee; }
+
+@media print {
+  .str { color: #060; }
+  .kwd { color: #006; font-weight: bold; }
+  .com { color: #600; font-style: italic; }
+  .typ { color: #404; font-weight: bold; }
+  .lit { color: #044; }
+  .pun { color: #440; }
+  .pln { color: #000; }
+  .tag { color: #006; font-weight: bold; }
+  .atn { color: #404; }
+  .atv { color: #060; }
+}
+
+/* Table Column Headers */
+.hdr { 
+  color: #006; 
+  font-weight: bold; 
+  background-color: #dddddd; }
+.hdr2 { 
+  color: #006; 
+  background-color: #eeeeee; }
\ No newline at end of file
diff --git a/inference-engine/samples/thirdparty/gflags/doc/index.html b/inference-engine/samples/thirdparty/gflags/doc/index.html
new file mode 100644 (file)
index 0000000..e0afb47
--- /dev/null
@@ -0,0 +1,648 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
+
+<html>
+<head>
+<title>How To Use Gflags (formerly Google Commandline Flags)</title>
+
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<link href="designstyle.css" type="text/css" rel="stylesheet">
+<style type="text/css">
+<!--
+  ol.bluelist li {
+    color: #3366ff;
+    font-family: sans-serif;
+  }
+  ol.bluelist li p {
+    color: #000;
+    font-family: "Times Roman", times, serif;
+  }
+  ul.blacklist li {
+    color: #000;
+    font-family: "Times Roman", times, serif;
+  }
+//-->
+</style>
+</head>
+
+<body>
+
+<h1>How To Use gflags (formerly Google Commandline Flags)</h1>
+<small>(as of
+<script type=text/javascript>
+  var lm = new Date(document.lastModified);
+  document.write(lm.toDateString());
+</script>)
+</small>
+<br>
+
+<blockquote><dl>
+  <dt> Table of contents </dt>
+  <dd> <a href="#intro">Introduction</a> </dd>
+  <dd> <a href="#download">Download and Installation</a> </dd>
+  <dd> <a href="#cmake">Declare dependency on gflags with CMake</a></dd>
+  <dd> <a href="#bazel">Declare dependency on gflags with Bazel</a></dd>
+  <dd> <a href="#define">DEFINE: Defining Flags In Program</A> </dd>
+  <dd> <a href="#using">Accessing the Flag</A> </dd>
+  <dd> <a href="#declare">DECLARE: Using the Flag in a Different File</a> </dd>
+  <dd> <a href="#validate">RegisterFlagValidator: Sanity-checking Flag Values</a> </dd>
+  <dd> <a href="#together">Putting It Together: How to Set Up Flags</a> </dd>
+  <dd> <a href="#commandline">Setting Flags on the Command Line</a> </dd>
+  <dd> <a href="#varz">Setting Flags at Runtime</a> </dd>
+  <dd> <a href="#default">Changing the Default Flag Value</a> </dd>
+  <dd> <a href="#special">Special Flags</a> </dd>
+  <dd> <a href="#api">The API</a> </dd>
+  <dd> <a href="#misc">Miscellaneous Notes</a> </dd>
+  <dd> <a href="#issues">Issues and Feature Requests</a> </dd>
+  <dd> <br/> </dd>
+</dl></blockquote>
+
+<h2> <A NAME=intro>Introduction, and Comparison to Other Commandline
+     Flags Libraries</A> </h2>
+
+<p><b>Commandline flags</b> are flags that users specify on the
+command line when they run an executable.  In the command</p>
+<pre>
+   fgrep -l -f /var/tmp/foo johannes brahms
+</pre>
+<p><code>-l</code> and <code>-f /var/tmp/foo</code> are the two
+commandline flags.  (<code>johannes</code> and <code>brahms</code>,
+which don't start with a dash, are <b>commandline arguments</b>.)</p>
+
+<p>Typically, an application lists what flags the user is allowed to
+pass in, and what arguments they take -- in this example,
+<code>-l</code> takes no argument, and <code>-f</code> takes a
+string (in particular, a filename) as an argument.  Users can use a
+library to help parse the commandline and store the flags in some data
+structure.</p>
+
+<p>Gflags, the commandline flags library used within Google,
+differs from other libraries,
+such as <code>getopt()</code>, in that flag definitions can be
+scattered around the source code, and not just listed in one place
+such as <code>main()</code>.  In practice, this means that a single
+source-code file will define and use flags that are meaningful to that
+file.  Any application that links in that file will get the flags, and
+the gflags library will automatically handle that
+flag appropriately.</p>
+
+<p>There's significant gain in flexibility, and ease of code reuse,
+due to this technique.  However, there is a danger that two files will
+define the same flag, and then give an error when they're linked
+together.</p>
+
+<p>The rest of this document describes how to use the commandlineflag
+library.  It's a C++ library, so examples are in C++.  However, there
+is a Python port with the same functionality, and this discussion
+translates directly to Python.</p>
+
+<h2> <A NAME=download>Download and Installation</A> </h2>
+
+<p>The gflags library can be downloaded from <A href="https://github.com/gflags/gflags">GitHub</A>.
+You can clone the project using the command:</p>
+<pre>
+   git clone https://github.com/gflags/gflags.git
+</pre>
+<p>Build and installation instructions are provided in the
+<A href="https://github.com/gflags/gflags/blob/master/INSTALL.md">INSTALL</A> file.
+The installation of the gflags package includes configuration files for popular build systems
+such as <A href="https://www.freedesktop.org/wiki/Software/pkg-config/">pkg-config</A>,
+<A href="#cmake">CMake</A>, and <A href="#bazel">Bazel</A>.</p>
+
+
+<h2> <A name=cmake>Declare dependency on gflags with CMake</A></h2>
+
+<p>Using gflags within a project which uses <A href="http://www.cmake.org">CMake</A> for its build system is easy.
+You can either require an external installation of the gflags package and find it using CMake's find_package
+command, or include the gflags project as subtree or submodule within your project's source tree and add the directory
+using CMake's add_subdirectory command.
+    
+<p>To use an external gflags installation, add the following CMake code to your <code>CMakeLists.txt</code> file.</p>
+
+<p>Find gflags installation. The <code>gflags_DIR</code> variable must be set to the &lt;prefix&gt;/lib/cmake/gflags directory
+containing the gflags-config.cmake file if &lt;prefix&gt; is a non-standard location. Otherwise, CMake should find
+the gflags installation automatically.</p>
+<pre>
+   find_package(gflags REQUIRED)
+</pre>
+<p>To request a particular imported gflags library target to link against, use the <code>COMPONENTS</code> option of
+the find_package command. For example, to force the use of the single-threaded static library, use the command</p>
+<pre>
+   find_package(gflags COMPONENTS nothreads_static)
+</pre>
+<p>Note that this will raise a fatal error when the installed gflags package does not contain the requested library.
+It is therefore recommended to only specify the particular component to look for if a specific library must be used.
+Otherwise, the gflags-config.cmake module will choose a suitable and available library for you. By default, the
+multi-threaded gflags library with shared linkage is chosen if available.</p>
+
+<p>When the source tree of the gflags project is included as subtree or submodule in the "gflags" directory of your project,
+replace the above find_package command by <code>add_subdirectory(gflags)</code>. See the top of the <code>gflags/CMakeLists.txt</code>
+file for a listing of available CMake variables that can be set before this command to configure the build of the
+gflags library. The default build settings are the build of a single-threaded static library which does not require
+any installation of the gflags subproject products.</p>
+
+<p>Finally, add your executable build target which uses gflags to parse the command arguments with dependency on the
+imported gflags library target:</p>
+<pre>
+   add_executable(foo main.cc)
+   target_link_libraries(foo gflags)
+</pre>
+
+<h2> <A name=bazel>Declare dependency on gflags with Bazel</A></h2>
+
+<p>To use gflags within a project which uses <A href="https://bazel.build/">Bazel</A> as build tool,
+add the following lines to your <code>WORKSPACE</code> file
+(see also Bazel documentation of <A href="https://www.bazel.io/versions/master/docs/be/workspace.html#git_repository">git_repository</A>):
+
+<pre>
+git_repository(
+    name   = "com_github_gflags_gflags",
+    commit = "&lt;INSERT COMMIT SHA HERE&gt;",
+    remote = "https://github.com/gflags/gflags.git",
+)
+
+bind(
+    name = "gflags",
+    actual = "@com_github_gflags_gflags//:gflags",
+)
+
+bind(
+    name = "gflags_nothreads",
+    actual = "@com_github_gflags_gflags//:gflags_nothreads",
+)
+</pre>
+
+<p>You can then add <code>//external:gflags</code> to the <code>deps</code> section of a <code>cc_binary</code>
+or <code>cc_library</code> rule, and <code>#include "gflags/gflags.h"</code> to include it in your source code.
+This uses the shared gflags library with multi-threading enabled. In order to use the single-threaded shared
+gflags library, use the external dependency <code>//external:gflags_nothreads</code> instead.</p>
+
+<p>For example, see the following <code>BUILD</code> rule of the gflags/example project:</p>
+
+<pre>
+cc_binary(
+    name = "foo",
+    srcs = ["main.cc"],
+    deps = ["//external:gflags"],
+)
+</pre>
+
+<h2> <A name=define>DEFINE: Defining Flags In Program</A> </h2>
+
+<p> Defining a flag is easy: just use the appropriate macro for the
+type you want the flag to be, as defined at the bottom of
+<code>gflags/gflags.h</code>.  Here's an example file,
+<code>foo.cc</code>:</p>
+
+<pre>
+   #include &lt;gflags/gflags.h&gt;
+
+   DEFINE_bool(big_menu, true, "Include 'advanced' options in the menu listing");
+   DEFINE_string(languages, "english,french,german",
+                 "comma-separated list of languages to offer in the 'lang' menu");
+</pre>
+
+<p><code>DEFINE_bool</code> defines a boolean flag.  Here are the
+types supported:</p>
+<ul>
+  <li> <code>DEFINE_bool</code>: boolean
+  <li> <code>DEFINE_int32</code>: 32-bit integer
+  <li> <code>DEFINE_int64</code>: 64-bit integer
+  <li> <code>DEFINE_uint64</code>: unsigned 64-bit integer
+  <li> <code>DEFINE_double</code>: double
+  <li> <code>DEFINE_string</code>: C++ string
+</ul>
+
+<p>Note that there are no 'complex' types like lists: the "languages"
+flag in our example is a list of strings, but is defined as type
+"string", not "list_of_string" or similar.  This is by design.  We'd
+rather use only simple types for the flags, and allow for complex,
+arbitrary parsing routines to parse them, than to try to put the logic
+inside the flags library proper.</p>
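+
+<p>For illustration, here is one minimal way such a parsing routine
+might look for the "languages" flag above (a sketch, assuming the
+usual standard headers; this helper is not part of the library):</p>
+<pre>
+   std::vector&lt;std::string&gt; langs;
+   std::stringstream ss(FLAGS_languages);
+   std::string lang;
+   while (std::getline(ss, lang, ','))   // split the flag on commas
+     langs.push_back(lang);
+</pre>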
+
+<p>All DEFINE macros take the same three arguments: the name of the
+flag, its default value, and a 'help' string that describes its use.
+The 'help' string is displayed when the user runs the application with
+the <A HREF="#special"><code>--help</code> flag</A>.</p>
+
+<p>You can define a flag in any source-code file in your executable.
+Only define a flag once!  If you want to access a flag in more than
+one source file, DEFINE it in one file, and <A
+HREF="#declare">DECLARE</A> it in the others.  Even better, DEFINE it
+in <code>foo.cc</code> and DECLARE it in <code>foo.h</code>; then
+everyone who <code>#includes foo.h</code> can use the flag.</p>
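+
+<p>As a sketch of that convention (using the <code>big_menu</code>
+flag from the example above; the file names are just placeholders):</p>
+<pre>
+   // foo.h -- anyone who #includes this header can use the flag
+   DECLARE_bool(big_menu);
+
+   // foo.cc -- the one and only definition
+   DEFINE_bool(big_menu, true, "Include 'advanced' options in the menu listing");
+</pre>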
+
+<p>
+Defining flags in libraries rather than in main() is powerful, but
+does have some costs. One is that a library might not have a good
+default value for its flags, for example if the flag holds a
+filename that might not exist in some environments. To mitigate such problems,
+you can use <a href="#validate">flag validators</a> to ensure prompt
+notification (in the form of a crash) of an invalid flag value.
+</p>
+
+<p>Note that while most functions in this library are defined in the
+<code>google</code> namespace, <code>DEFINE_foo</code> (and
+<code>DECLARE_foo</code>, <A HREF="#declare">below</A>), should always
+be in the global namespace.</p>
+
+
+<h2> <A name=using>Accessing the Flag</A> </h2>
+
+<p>All defined flags are available to the program as just a normal
+variable, with the prefix <code>FLAGS_</code> prepended.  In the above
+example, the macros define two variables, <code>FLAGS_big_menu</code>
+(a bool), and <code>FLAGS_languages</code> (a C++ string).</p>
+
+<p>You can read and write to the flag just like any other
+variable:</p>
+<pre>
+   if (FLAGS_consider_made_up_languages)
+     FLAGS_languages += ",klingon";   // implied by --consider_made_up_languages
+   if (FLAGS_languages.find("finnish") != string::npos)
+     HandleFinnish();
+</pre>
+
+<p>You can also get and set flag values via special functions in
+<code>gflags.h</code>.  That's a rarer use case, though.</p>
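+
+<p>A brief sketch of that rarer style, using
+<code>GetCommandLineOption()</code> and
+<code>SetCommandLineOption()</code> from <code>gflags.h</code>
+(older releases put these in the <code>google</code> namespace
+instead of <code>gflags</code>; <code>HandleLanguages</code> is a
+placeholder):</p>
+<pre>
+   std::string value;
+   if (gflags::GetCommandLineOption("languages", &value))
+     HandleLanguages(value);
+   // Returns a non-empty description string on success:
+   gflags::SetCommandLineOption("languages", "english,klingon");
+</pre>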
+
+
+<h2> <A name=declare>DECLARE: Using the Flag in a Different File</A> </h2>
+
+<p>Accessing a flag in the manner of the previous section only works
+if the flag was <code>DEFINE</code>-ed at the top of the file.  If it
+wasn't, you'll get an 'unknown variable' error.</p>
+
+<p>The <code>DECLARE_type</code> macro is available when you want to
+use a flag that's defined in another file.  For instance, if I were
+writing <code>bar.cc</code> but wanted to access the big_menu flag, I
+would put this near the top of <code>bar.cc</code>:</p>
+<pre>
+   DECLARE_bool(big_menu);
+</pre>
+
+<p>This is functionally equivalent to saying <code>extern
+FLAGS_big_menu</code>.</p>
+
+<p>Note that such an extern declaration introduces a dependency
+between your file and the file that defines the <code>big_menu</code>
+flag: <code>foo.cc</code>, in this case.  Such implicit dependencies
+can be difficult to manage in large projects.  For that reason we
+recommend the following guideline:</p>
+
+<blockquote>
+If you DEFINE a flag in <code>foo.cc</code>, either don't DECLARE it
+at all, only DECLARE it in tightly related tests, or only DECLARE
+it in <code>foo.h</code>.
+</blockquote>
+
+<p>You should go the do-not-DECLARE route when the flag is only needed
+by <code>foo.cc</code>, and not in any other file. If you want to
+modify the value of the flag in the related test file to see if it is
+functioning as expected, DECLARE it in the <code>foo_test.cc</code>
+file.
+
+<p>If the flag does span multiple files, DECLARE it in the associated
+<code>.h</code> file, and make others <code>#include</code> that
+<code>.h</code> file if they want to access the flag.  The
+<code>#include</code> will make explicit the dependency between the
+two files. This causes the flag to be a global variable.</p>
+
+
+<h2> <A name=validate>RegisterFlagValidator: Sanity-checking Flag Values</A> </h2>
+
+<p>After DEFINE-ing a flag, you may optionally register a validator
+function with the flag.  If you do this, after the flag is parsed from
+the commandline, and whenever its value is changed via a call to
+<code>SetCommandLineOption()</code>, the validator function is called
+with the new value as an argument.  The validator function should
+return 'true' if the flag value is valid, and false otherwise.
+If the function returns false for the new setting of the
+flag, the flag will retain its current value. If it returns false for the
+default value, ParseCommandLineFlags will die.
+
+<p>Here is an example use of this functionality:</p>
+<pre>
+static bool ValidatePort(const char* flagname, int32 value) {
+   if (value > 0 && value < 32768)   // value is ok
+     return true;
+   printf("Invalid value for --%s: %d\n", flagname, (int)value);
+   return false;
+}
+DEFINE_int32(port, 0, "What port to listen on");
+DEFINE_validator(port, &ValidatePort);
+</pre>
+
+<p>By doing the registration at global initialization time (right
+after the DEFINE_int32), we ensure that the registration happens before
+the commandline is parsed at the beginning of <code>main()</code>.</p>
+
+<p>The above used <code>DEFINE_validator</code> macro calls the
+<code>RegisterFlagValidator()</code> function which returns true if the
+registration is successful.  It returns false if the registration fails
+because a) the first argument does not refer to a commandline flag, or
+b) a different validator has already been registered for this flag.
+The return value is available as global static boolean variable named
+<code>&lt;flag&gt;_validator_registered</code>.</p>
+
+
+<h2> <A name=together>Putting It Together: How to Set Up Flags</A> </h2>
+
+<p>The final piece is the one that tells the executable to process the
+commandline flags, and set the <code>FLAGS_*</code> variables to the
+appropriate, non-default value based on what is seen on the
+commandline.  This is equivalent to the <code>getopt()</code> call in
+the getopt library, but has much less overhead to use.  In fact, it's
+just a single function call:</p>
+
+<pre>
+   gflags::ParseCommandLineFlags(&argc, &argv, true);
+</pre>
+
+<p>Usually, this code is at the beginning of <code>main()</code>.
+<code>argc</code> and <code>argv</code> are exactly as passed in to
+<code>main()</code>.  This routine might modify them, which is why
+pointers to them are passed in.</p>
+
+<p>The last argument is called "remove_flags".  If true, then
+<code>ParseCommandLineFlags</code> removes the flags and their
+arguments from <code>argv</code>, and modifies <code>argc</code>
+appropriately.  In this case, after the function call,
+<code>argv</code> will hold only commandline arguments, and not
+commandline flags.</p>
+
+<p>If, on the other hand, <code>remove_flags</code> is false, then
+<code>ParseCommandLineFlags</code> will leave argc unchanged, but will
+rearrange the arguments in argv so that the flags are all at the
+beginning.  For example, if the input is <code>"/bin/foo" "arg1" "-q"
+"arg2"</code> (which is legal but weird), the function will rearrange
+<code>argv</code> so it reads <code>"/bin/foo", "-q", "arg1",
+"arg2"</code>.  In this case, <code>ParseCommandLineFlags</code>
+returns the index into argv that holds the first commandline argument:
+that is, the index past the last flag.  (In this example, it would
+return 2, since <code>argv[2]</code> points to <code>arg1</code>.)</p>
+
+<p>In either case, the <code>FLAGS_*</code> variables are modified
+based on what was <A HREF="#commandline">passed in on the
+commandline</A>.</p>
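+
+<p>A minimal sketch putting this together (the echo loop is just for
+illustration):</p>
+<pre>
+   int main(int argc, char** argv) {
+     // remove_flags == false: flags are moved to the front of argv and
+     // the return value is the index of the first commandline argument.
+     int first_arg = gflags::ParseCommandLineFlags(&argc, &argv, false);
+     for (int i = first_arg; i < argc; ++i)
+       printf("argument: %s\n", argv[i]);
+     return 0;
+   }
+</pre>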
+
+
+<h2> <A name=commandline>Setting Flags on the Command Line</A> </h2>
+
+<p>The reason you make something a flag instead of a compile-time
+constant, is so users can specify a non-default value on the
+commandline.  Here's how they might do it for an application that
+links in <code>foo.cc</code>:</p>
+<pre>
+   app_containing_foo --nobig_menu -languages="chinese,japanese,korean" ...
+</pre>
+
+<p>This sets <code>FLAGS_big_menu = false;</code> and
+<code>FLAGS_languages = "chinese,japanese,korean"</code>, when
+<code>ParseCommandLineFlags</code> is run.</p>
+
+<p>Note the atypical syntax for setting a boolean flag to false:
+putting "no" in front of its name.  There's a fair bit of flexibility
+to how flags may be specified.  Here's an example of all the ways to
+specify the "languages" flag:</p>
+<ul>
+  <li> <code>app_containing_foo --languages="chinese,japanese,korean"</code>
+  <li> <code>app_containing_foo -languages="chinese,japanese,korean"</code>
+  <li> <code>app_containing_foo --languages "chinese,japanese,korean"</code>
+  <li> <code>app_containing_foo -languages "chinese,japanese,korean"</code>
+</ul>
+
+<p>For boolean flags, the possibilities are slightly different:</p>
+<ul>
+  <li> <code>app_containing_foo --big_menu</code>
+  <li> <code>app_containing_foo --nobig_menu</code>
+  <li> <code>app_containing_foo --big_menu=true</code>
+  <li> <code>app_containing_foo --big_menu=false</code>
+</ul>
+<p>(as well as the single-dash variant on all of these).</p>
+
+<p>Despite this flexibility, we recommend using only a single form:
+<code>--variable=value</code> for non-boolean flags, and
+<code>--variable/--novariable</code> for boolean flags.  This
+consistency will make your code more readable, and is also the format
+required for certain special-use cases like <A
+HREF="#flagfiles">flagfiles</A>.</p>
+
+<p>It is a fatal error to specify a flag on the commandline that has
+not been DEFINED somewhere in the executable.  If you need that
+functionality for some reason -- say you want to use the same set of
+flags for several executables, but not all of them DEFINE every flag
+in your list -- you can specify <A
+HREF="#special"><code>--undefok</code></A> to suppress the error.</p>
+
+<p>As in getopt(), <code>--</code> by itself will terminate flags
+processing.  So in <code>foo -f1 1 -- -f2 2</code>, <code>f1</code> is
+considered a flag, but <code>-f2</code> is not.</p>
+
+<p>If a flag is specified more than once, only the last specification
+is used; the others are ignored.</p>
+
+<p>Note that flags do not have single-letter synonyms, like they do in
+the getopt library, nor do we allow "combining" flags behind a
+single dash, as in <code>ls -la</code>.</p>
+
+
+
+<h2> <A name=default>Changing the Default Flag Value</A> </h2>
+
+<p>Sometimes a flag is defined in a library, and you want to change
+its default value in one application but not others.  It's simple to
+do this: just assign a new value to the flag in <code>main()</code>,
+before calling <code>ParseCommandLineFlags()</code>:</p>
+<pre>
+   DECLARE_bool(lib_verbose);   // mylib has a lib_verbose flag, default is false
+   int main(int argc, char** argv) {
+     FLAGS_lib_verbose = true;  // in my app, I want a verbose lib by default
+     ParseCommandLineFlags(...);
+   }
+</pre>
+
+<p>For this application, users can still set the flag value on the
+commandline, but if they do not, the flag's value will default to
+true.</p>
+
+
+<h2> <A name="special">Special Flags</a> </h2>
+
+<p>There are a few flags defined by the commandlineflags module
+itself that are available to all applications that use
+commandlineflags.  These fall into
+three categories.  First are the 'reporting' flags that, when found, cause
+the application to print some information about itself and exit.</p>
+
+<table><tr valign=top>
+  <td><code>--help</code></td>
+  <td>shows all flags from all files, sorted by file and then by name;
+      shows the flagname, its default value, and its help string</td>
+</tr><tr valign=top>
+  <td><code>--helpfull</code></td>
+  <td>same as -help, but unambiguously asks for all flags
+     (in case -help changes in the future)</td>
+</tr><tr valign=top>
+  <td><code>--helpshort</code></td>
+  <td>shows only flags for the file with the same name as the executable 
+      (usually the one containing <code>main()</code>)</td>
+</tr><tr valign=top>
+  <td><code>--helpxml</code></td>
+  <td>like --help, but output is in xml for easier parsing</td>
+</tr><tr valign=top>
+  <td><code>--helpon=FILE &nbsp;</code></td>
+  <td>shows only flags defined in FILE.*</td>
+</tr><tr valign=top>
+  <td><code>--helpmatch=S</code></td>
+  <td>shows only flags defined in *S*.*</td>
+</tr><tr valign=top>
+  <td><code>--helppackage</code></td>
+  <td>shows flags defined in files in same directory as <code>main()</code></td>
+</tr><tr valign=top>
+  <td><code>--version</code></td>
+  <td>prints version info for the executable</td>
+</tr></table>
+
+<p>Second are the flags that affect how other flags are parsed.</p>
+
+<table><tr valign=top>
+  <td><code>--undefok=flagname,flagname,...</code></td>
+  <td>for those names listed as the argument to <code>--undefok</code>,
+      suppress the normal error-exit that occurs when
+      <code>--name</code> is seen on the commandline, but
+      <code>name</code> has not been DEFINED anywhere in the
+      application
+</table>
+
+<p>Third are the 'recursive' flags, that cause other flag values to be
+set: <code>--fromenv</code>, <code>--tryfromenv</code>,
+<code>--flagfile</code>.  These are described below in more
+detail.</p>
+
+<h3> <code>--fromenv</code> </h3>
+
+<p><code>--fromenv=foo,bar</code> says to read the values for the
+<code>foo</code> and <code>bar</code> flags from the environment.
+In concert with this flag, you must actually set the values in the
+environment, via a line like one of the two below:</p>
+<pre>
+   export FLAGS_foo=xxx; export FLAGS_bar=yyy   # sh
+   setenv FLAGS_foo xxx; setenv FLAGS_bar yyy   # tcsh
+</pre>
+<p>This is equivalent to specifying <code>--foo=xxx</code>,
+<code>--bar=yyy</code> on the commandline.</p>
+
+<p>Note it is a fatal error to say <code>--fromenv=foo</code> if
+<code>foo</code> is not DEFINED somewhere in the application.  (Though
+you can suppress this error via <code>--undefok=foo</code>, just like
+for any other flag.)</p>
+
+<p>It is also a fatal error to say <code>--fromenv=foo</code> if
+<code>FLAGS_foo</code> is not actually defined in the environment.</p>
+
+<h3> <code>--tryfromenv</code> </h3>
+
+<p><code>--tryfromenv</code> is exactly like <code>--fromenv</code>,
+except it is <b>not</b> a fatal error to say
+<code>--tryfromenv=foo</code> if <code>FLAGS_foo</code> is not
+actually defined in the environment.  Instead, in such cases,
+<code>FLAGS_foo</code> just keeps its default value as specified in
+the application.</p>
+
+<p>Note it is still an error to say <code>--tryfromenv=foo</code> if
+<code>foo</code> is not DEFINED somewhere in the application.</p>
+
+<h3> <code>--flagfile</code> </h3>
+
+<p><code>--flagfile=f</code> tells the commandlineflags module to read
+the file <code>f</code>, and to run all the flag-assignments found in
+that file as if these flags had been specified on the commandline.</p>
+
+<p>In its simplest form, <code>f</code> should just be a list of flag
+assignments, one per line.  Unlike on the commandline, the equals sign
+separating a flagname from its argument is <i>required</i> for
+flagfiles.  An example flagfile, <code>/tmp/myflags</code>:</p>
+<pre>
+--nobig_menus
+--languages=english,french
+</pre>
+
+<p>With this flagfile, the following two lines are equivalent:</p>
+<pre>
+   ./myapp --foo --nobig_menus --languages=english,french --bar
+   ./myapp --foo --flagfile=/tmp/myflags --bar
+</pre>
+
+<p>Note that many errors are silently suppressed in flagfiles.  In
+particular, unrecognized flagnames are silently ignored, as are flags
+that are missing a required value (e.g., a flagfile that just says
+<code>--languages</code>).</p>
+
+<p>The general format of a flagfile is a bit more complicated than the
+simple, common case above.  It is: a sequence of filenames, one per
+line, followed by a sequence of flags, one per line, repeated as many
+times as desired.  Filenames in a flagfile can use wildcards
+(<code>*</code> and <code>?</code>), and the sequence of flags located
+after a sequence of filenames is processed only if the current
+executable's name matches one of the filenames.  It is possible to
+start the flagfile with a sequence of flags instead of a sequence of
+filenames; if such a sequence of flags is present, these flags are
+applied to the current executable no matter what it is.</p>
+
+<p>Lines that start with a <code>#</code> are ignored as comments.
+Leading whitespace is also ignored in flagfiles, as are blank
+lines.</p>
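+
+<p>A hypothetical flagfile using this general format might look like
+the following (all filenames and flag values here are made up):</p>
+<pre>
+# flags before any filename apply to every executable
+--languages=english,french
+
+# these flags apply only to executables matching the pattern
+/usr/local/bin/myapp*
+--big_menu
+--undefok=lib_verbose
+</pre>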
+
+<p>It is possible for a flagfile to use the <code>--flagfile</code>
+flag to include another flagfile.</p>
+
+<p>Flags are always processed in the expected order.  That is,
+processing begins by examining the flags specified directly on the
+command line.  If a flagfile is specified, its contents are processed,
+and then processing continues with remaining flags from the command
+line.</p>
+
+
+<h2> <A name="api">The API</a> </h2>
+
+<p>In addition to accessing <code>FLAGS_foo</code> directly, it is
+possible to access the flags programmatically, through an API.  It is
+also possible to access information about a flag, such as its default
+value and help-string.  A <code>FlagSaver</code> makes it easy to
+modify flags and then automatically undo the modifications later.
+Finally, there are somewhat unrelated, but useful, routines to easily
+access parts of <code>argv</code> outside main, including the program
+name (<code>argv[0]</code>).</p>
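+
+<p>For example, a <code>FlagSaver</code> restores all flag values when
+it goes out of scope (a sketch; <code>RunSomeTest()</code> is a
+placeholder):</p>
+<pre>
+   {
+     gflags::FlagSaver saver;        // snapshot all current flag values
+     FLAGS_languages = "klingon";    // temporary override, e.g. in a test
+     RunSomeTest();
+   }                                 // saver destroyed: flags restored
+</pre>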
+
+<p>For more information about these routines, and other useful helper
+methods such as <code>gflags::SetUsageMessage()</code> and
+<code>gflags::SetVersionString</code>, see <code>gflags.h</code>.</p>
+
+
+<h2> <A name="misc">Miscellaneous Notes</code> </h2>
+
+<p>If your application has code like this:</p>
+<pre>
+   #define STRIP_FLAG_HELP 1    // this must go before the #include!
+   #include &lt;gflags/gflags.h&gt;
+</pre>
+<p>we will remove the help messages from the compiled source. This can
+reduce the size of the resulting binary somewhat, and may also be
+useful for security reasons.</p>
+
+<h2> <A name="issues">Issues and Feature Requests</code> </h2>
+
+<p>Please report any issues or ideas for additional features on <A href="https://github.com/gflags/gflags/issues">GitHub</A>.
+We would also like to encourage <A href="https://github.com/gflags/gflags/pulls">pull requests</A> for bug fixes and implementations of new features.</p>
+
+<hr>
+<address>
+Craig Silverstein, Andreas Schuh<br>
+<script type=text/javascript>
+  var lm = new Date(document.lastModified);
+  document.write(lm.toDateString());
+</script>
+</address>
+
+</body>
+</html>
index dc6e941..042150b 100644 (file)
@@ -15,7 +15,7 @@ Possible use cases of the tool:
 * Use Validation Application as another sample: although the code is much more complex than in classification and object
   detection samples, the source code is open and can be re-used.
 
-> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 
 ## Validation Application Options
 
@@ -59,7 +59,7 @@ The tool options are divided into two categories:
 
 ## General Workflow
 
-> **NOTE**: By default, Inference Engine samples expect input images to have BGR channels order. If you trained you model to work with images in RGB order, you need to manually rearrange the default channels order in the sample application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to [When to Specify Input Shapes](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md#when_to_reverse_input_channels).
+> **NOTE**: By default, Inference Engine samples expect input images to have BGR channels order. If you trained your model to work with images in RGB order, you need to manually rearrange the default channels order in the sample application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to [When to Reverse Input Channels](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md#when_to_reverse_input_channels).
 
 When executed, the Validation Application performs the following steps:
 
index b977b63..0a87426 100644 (file)
@@ -23,7 +23,7 @@ int getLoadModeForChannels(int channels, int base) {
     case 3:
         return base | IMREAD_COLOR;
     }
-    return base | IMREAD_UNCHANGED;
+    return IMREAD_UNCHANGED;
 }
 
 template <class T>
index 23137de..aab215e 100644 (file)
@@ -360,6 +360,12 @@ int main(int argc, char *argv[]) {
             showUsage();
             return ex.list().begin()->exitCode();
         }
+    } catch (const std::exception& ex) {
+        slog::err << ex.what() << slog::endl;
+        return 1;
+    } catch (...) {
+        slog::err << "Unknown/internal exception happened." << slog::endl;
+        return 1;
     }
     return 0;
 }
index aad2b5b..1d68d56 100644 (file)
@@ -18,6 +18,10 @@ if(ENABLE_CLDNN)
     add_subdirectory(cldnn_engine)
 endif()
 
+if(ENABLE_VPU)
+  add_subdirectory(vpu)
+endif()
+
 if (ENABLE_GNA)
     add_subdirectory(gna_plugin)
 endif()
index 9f8f58b..d9164b2 100644 (file)
@@ -222,12 +222,16 @@ void CLDNNGraph::Config::LoadFromMap(const std::map<std::string, std::string>& c
         } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_GRAPH_DUMPS_DIR) == 0) {
             if (!val.empty()) {
                 graph_dumps_dir = val;
-                mkdir(graph_dumps_dir.c_str(), 0755);
+                if (mkdir(graph_dumps_dir.c_str(), 0755) != 0) {
+                    THROW_IE_EXCEPTION << "Couldn't create clDNN graph dump directory!";
+                }
             }
         } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_SOURCES_DUMPS_DIR) == 0) {
             if (!val.empty()) {
                 sources_dumps_dir = val;
-                mkdir(sources_dumps_dir.c_str(), 0755);
+                if (mkdir(sources_dumps_dir.c_str(), 0755) != 0) {
+                    THROW_IE_EXCEPTION << "Couldn't create clDNN source dump directory!";
+                }
             }
         } else if (key.compare(PluginConfigParams::KEY_EXCLUSIVE_ASYNC_REQUESTS) == 0) {
             if (val.compare(PluginConfigParams::YES) == 0) {
@@ -310,7 +314,7 @@ CLDNNGraph::CLDNNGraph(InferenceEngine::ICNNNetwork& network, const Config& conf
     }
 
     bool res = !NetPass::CombineRNNSeq(network) ? NetPass::UnrollTI(network) : true;
-    res &= NetPass::UnrollRNN_if(network, [] (RNNCellBase rnn) -> bool {
+    res &= NetPass::UnrollRNN_if(network, [] (const RNNCellBase& rnn) -> bool {
         if (rnn.clip != 0.0f)
             return true;
         if (rnn.type == "GRUCell" ||
@@ -386,6 +390,15 @@ CLDNNGraph::CLDNNGraph(InferenceEngine::ICNNNetwork& network, const Config& conf
     m_env.debugOptions.ClearTimedEvents();
 }
 
+template<typename LayerTypePtr>
+LayerTypePtr as(const CNNLayerPtr& in_ptr) {
+    auto result_ptr = dynamic_cast<LayerTypePtr> (in_ptr.get());
+    if (nullptr == result_ptr) {
+        THROW_IE_EXCEPTION << "CNNLayerPtr is not suitable for casting to requested layer type";
+    }
+    return result_ptr;
+}
+
 inline std::string layer_type_name_ID(InferenceEngine::CNNLayer* layer) {
     return layer->type + ":" + layer->name;
 }
@@ -683,7 +696,7 @@ cldnn::concatenation::concatenation_axis CLDNNGraph::ConcatAxisFromIEAxis(unsign
 
 void CLDNNGraph::CreatePrimitiveFromBlob(cldnn::primitive_id primID,
                                          const InferenceEngine::Blob::Ptr pBlob,
-                                         cldnn::layout blobLayout,
+                                         const cldnn::layout& blobLayout,
                                          size_t blobByteOffset,
                                          WeightRearrangeType rearrange) {
     auto mem = cldnn::memory::allocate(*(m_env.engine), blobLayout);
@@ -765,7 +778,7 @@ void CLDNNGraph::CreateWeightAndBiasPrimitives(const InferenceEngine::CNNLayerPt
 
     switch (LayerTypeFromStr(layer->type)) {
     case Convolution: {
-        auto convLayer = dynamic_cast<InferenceEngine::ConvolutionLayer *> (layer.get());
+        auto convLayer = as<InferenceEngine::ConvolutionLayer *> (layer);
         if ((inFeatures % groupSize) || (convLayer->_out_depth % groupSize)) {
             THROW_CLDNN_EXCEPTION("Invalid group size in layer " << convLayer->name);
         }
@@ -784,7 +797,7 @@ void CLDNNGraph::CreateWeightAndBiasPrimitives(const InferenceEngine::CNNLayerPt
     }
         break;
     case Deconvolution: {
-        auto deconvLayer = dynamic_cast<InferenceEngine::DeconvolutionLayer *> (layer.get());
+        auto deconvLayer = as<InferenceEngine::DeconvolutionLayer *> (layer);
         if ((inFeatures % groupSize) || (deconvLayer->_out_depth % groupSize)) {
             THROW_CLDNN_EXCEPTION("Invalid group size in layer " << deconvLayer->name);
         }
@@ -1044,7 +1057,7 @@ void CLDNNGraph::CreateSingleLayerPrimitive(InferenceEngine::CNNLayerPtr &layer)
 void CLDNNGraph::CreateScaleShiftPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto scaleShiftLayer = dynamic_cast<InferenceEngine::ScaleShiftLayer*> (layer.get());
+    auto scaleShiftLayer = as<InferenceEngine::ScaleShiftLayer*> (layer);
 
     // create scales and biases
     cldnn::primitive_id scalePrimID = scaleShiftLayer->name + m_scalesTag;
@@ -1085,7 +1098,7 @@ void CLDNNGraph::CreateScaleShiftPrimitive(InferenceEngine::CNNLayerPtr &layer)
 
 void CLDNNGraph::CreateProposalPrimitive(InferenceEngine::CNNLayerPtr & layer) {
     ValidateLayer(layer, 3);
-    auto proposalLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto proposalLayer = as<InferenceEngine::GenericLayer*> (layer);
 
     float nms_thresh = proposalLayer->GetParamAsFloat("nms_thresh", 0.7f);
     int min_size = proposalLayer->GetParamAsInt("min_size", 16);
@@ -1157,7 +1170,7 @@ void CLDNNGraph::CreateProposalPrimitive(InferenceEngine::CNNLayerPtr & layer) {
 void CLDNNGraph::CreatePReLUPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto preluLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto preluLayer = as<InferenceEngine::GenericLayer*> (layer);
 
     std::string preluLayerName = layer_type_name_ID(layer);
     auto inDataPtr = preluLayer->insData[0].lock();
@@ -1207,7 +1220,7 @@ void CLDNNGraph::CreateBatchNormalizationPrimitive(InferenceEngine::CNNLayerPtr
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
     std::string bnLayerName = layer_type_name_ID(layer);
 
-    auto bnLayer = dynamic_cast<InferenceEngine::BatchNormalizationLayer *> (layer.get());
+    auto bnLayer = as<InferenceEngine::BatchNormalizationLayer *> (layer);
     cldnn::primitive_id weightID = bnLayerName + "_" + m_scalesTag;
     cldnn::primitive_id biasID = bnLayerName + "_" + m_biasesTag;
 
@@ -1222,8 +1235,7 @@ void CLDNNGraph::CreateBatchNormalizationPrimitive(InferenceEngine::CNNLayerPtr
     m_topology->add(scalePrim);
     m_env.profilingIDs.push_back(bnLayerName);
     return;
-#endif  // _SCALE_BN_OPT
-
+#else
     cldnn::tensor blobTensor(0);
     switch (bnLayer->outData[0]->dims.size()) {
     case 2:
@@ -1258,12 +1270,13 @@ void CLDNNGraph::CreateBatchNormalizationPrimitive(InferenceEngine::CNNLayerPtr
     m_env.primitiveIDs[bnLayerName] = bnLayerName;
     m_topology->add(bnPrim);
     m_env.profilingIDs.push_back(bnLayerName);
+#endif  // _SCALE_BN_OPT
 }
 
 void CLDNNGraph::CreateFlattenPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto flattenLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto flattenLayer = as<InferenceEngine::GenericLayer*> (layer);
     std::string flattenLayerName = layer_type_name_ID(layer);
 
     auto flattenPrim = cldnn::reshape(
@@ -1279,7 +1292,7 @@ void CLDNNGraph::CreateFlattenPrimitive(InferenceEngine::CNNLayerPtr &layer) {
 void CLDNNGraph::CreatePermutePrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto permuteLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto permuteLayer = as<InferenceEngine::GenericLayer*> (layer);
     std::vector<uint16_t> ie_order;
     for (auto& a : permuteLayer->GetParamAsInts("order"))
         ie_order.push_back(static_cast<uint16_t>(a));
@@ -1320,7 +1333,7 @@ void CLDNNGraph::CreatePermutePrimitive(InferenceEngine::CNNLayerPtr &layer) {
 void CLDNNGraph::CreateReshapePrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto reshapeLayer = dynamic_cast<InferenceEngine::ReshapeLayer*> (layer.get());
+    auto reshapeLayer = as<InferenceEngine::ReshapeLayer*> (layer);
     IE_ASSERT(reshapeLayer->outData.size());
     std::string reshapeLayerName = layer_type_name_ID(layer);
 
@@ -1337,7 +1350,7 @@ void CLDNNGraph::CreateReshapePrimitive(InferenceEngine::CNNLayerPtr &layer) {
 void CLDNNGraph::CreateNormalizePrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto normLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto normLayer = as<InferenceEngine::GenericLayer*> (layer);
     ValidateGenericLayerBlobs(normLayer, { "weights" });
     CreateGenericLayerBlobPrimitives(normLayer);
 
@@ -1365,7 +1378,7 @@ void CLDNNGraph::CreateNormalizePrimitive(InferenceEngine::CNNLayerPtr &layer) {
 
 void CLDNNGraph::CreateDetectionOutputPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 3);
-    auto detectionLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto detectionLayer = as<InferenceEngine::GenericLayer*> (layer);
 
     uint32_t num_classes            = detectionLayer->GetParamAsUInt("num_classes", 1);
     bool share_location             = detectionLayer->GetParamsAsBool("share_location", true);
@@ -1421,7 +1434,7 @@ void CLDNNGraph::CreateDetectionOutputPrimitive(InferenceEngine::CNNLayerPtr &la
 
 void CLDNNGraph::CreatePriorBoxPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 2);
-    auto priorBoxLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto priorBoxLayer = as<InferenceEngine::GenericLayer*> (layer);
 
     // params
     std::vector<float> min_size = priorBoxLayer->GetParamAsFloats("min_size");
@@ -1491,7 +1504,7 @@ void CLDNNGraph::CreatePriorBoxPrimitive(InferenceEngine::CNNLayerPtr &layer) {
 void CLDNNGraph::CreateDeconvolutionPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto deconvLayer = dynamic_cast<InferenceEngine::DeconvolutionLayer *> (layer.get());
+    auto deconvLayer = as<InferenceEngine::DeconvolutionLayer *> (layer);
 
     if (deconvLayer->_dilation[X_AXIS] != 1 || deconvLayer->_dilation[Y_AXIS] != 1) {
         THROW_CLDNN_EXCEPTION("Unsupported dilation in deconvolution " << layer->name);
@@ -1544,7 +1557,7 @@ void CLDNNGraph::CreateCropPrimitive(InferenceEngine::CNNLayerPtr &layer) {
         THROW_CLDNN_EXCEPTION("Unsupported fuse in layer: " << layer->name << " with: " << layer->_fusedWith->name);
     }
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto cropLayer = dynamic_cast<InferenceEngine::CropLayer*> (layer.get());
+    auto cropLayer = as<InferenceEngine::CropLayer*> (layer);
     IE_ASSERT(cropLayer->axis.size() == cropLayer->offset.size());
     // IE_ASSERT(cropLayer->outData[0] && cropLayer->outData[0]->dims.size() == 4);
 
@@ -1582,7 +1595,7 @@ void CLDNNGraph::CreateCropPrimitive(InferenceEngine::CNNLayerPtr &layer) {
 
 void CLDNNGraph::CreateROIPoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 2);
-    auto roiPoolingLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto roiPoolingLayer = as<InferenceEngine::GenericLayer*> (layer);
 
     // params
     int pooled_width = roiPoolingLayer->GetParamAsInt("pooled_w", 0);
@@ -1613,7 +1626,7 @@ void CLDNNGraph::CreateROIPoolingPrimitive(InferenceEngine::CNNLayerPtr &layer)
 
 void CLDNNGraph::CreatePSROIPoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 2);
-    auto psROIPoolingLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto psROIPoolingLayer = as<InferenceEngine::GenericLayer*> (layer);
 
     // params
     int group_size = psROIPoolingLayer->GetParamAsInt("group_size");
@@ -1650,7 +1663,7 @@ void CLDNNGraph::CreatePSROIPoolingPrimitive(InferenceEngine::CNNLayerPtr &layer
 void CLDNNGraph::CreateCustomLayerPrimitive(InferenceEngine::CNNLayerPtr & layer, CLDNNCustomLayerPtr customLayer) {
     ValidateLayer(layer, 0);
     // todo: handling fusing
-    auto genericLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto genericLayer = as<InferenceEngine::GenericLayer*> (layer);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
 
     // Handle defines
@@ -1678,10 +1691,10 @@ void CLDNNGraph::CreateCustomLayerPrimitive(InferenceEngine::CNNLayerPtr & layer
         if (blob.second->dims().size() != 1) {
             THROW_CLDNN_EXCEPTION("Invalid dimensions for blob " << blob.first << " in layer " << genericLayer->name);
         }
-        CreatePrimitiveFromBlob(blobId, blob.second, cldnn::layout(
-            DataTypeFromPrecision(blob.second->precision()),
-            m_defaultFormat,
-            cldnn::tensor(1, 1, TensorValue(blob.second->dims()[0]), 1)));
+        cldnn::layout genericBlobLayout(DataTypeFromPrecision(blob.second->precision()),
+                                        m_defaultFormat,
+                                        cldnn::tensor(1, 1, TensorValue(blob.second->dims()[0]), 1));
+        CreatePrimitiveFromBlob(blobId, blob.second, genericBlobLayout);
         // save index in blobIndex
         blobIndex[blob.first] = reorderedInputs.size();
         // add to reorderedInputs
@@ -1838,7 +1851,7 @@ void CLDNNGraph::CreateSimplerNMSPrimitive(InferenceEngine::CNNLayerPtr &layer)
     ValidateLayer(layer, 3);
     IE_ASSERT(layer->insData[0].lock()->dims[3] == 1);  // only handling input batch size 1
     IE_ASSERT(layer->insData[1].lock()->dims[3] == 1);  // only handling input batch size 1
-    auto simpleNMSLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto simpleNMSLayer = as<InferenceEngine::GenericLayer*> (layer);
 
     int max_num_proposals = simpleNMSLayer->GetParamAsInt("max_num_proposals");
     float iou_threshold = simpleNMSLayer->GetParamAsFloat("iou_threshold", 0.7f);
@@ -1872,7 +1885,7 @@ void CLDNNGraph::CreateSimplerNMSPrimitive(InferenceEngine::CNNLayerPtr &layer)
 void CLDNNGraph::CreateEltwisePrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateEltwiseLayer(layer);
 
-    auto eltwiseLayer = dynamic_cast<InferenceEngine::EltwiseLayer *> (layer.get());
+    auto eltwiseLayer = as<InferenceEngine::EltwiseLayer *> (layer);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
 
     std::vector<float> coefficients = eltwiseLayer->coeff;
@@ -1897,7 +1910,7 @@ void CLDNNGraph::CreateEltwisePrimitive(InferenceEngine::CNNLayerPtr &layer) {
 
 void CLDNNGraph::CreateConcatenatePrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 0);
-    auto concatLayer = dynamic_cast<InferenceEngine::ConcatLayer *> (layer.get());
+    auto concatLayer = as<InferenceEngine::ConcatLayer *> (layer);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
     std::string concatLayerName = layer_type_name_ID(layer);
     auto concatPrim = cldnn::concatenation(
@@ -1911,7 +1924,7 @@ void CLDNNGraph::CreateConcatenatePrimitive(InferenceEngine::CNNLayerPtr &layer)
 
 void CLDNNGraph::CreateSplitPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
-    auto splitLayer = dynamic_cast<InferenceEngine::SplitLayer *> (layer.get());
+    auto splitLayer = as<InferenceEngine::SplitLayer *> (layer);
     if (IsValidSplitConvMerge(splitLayer)) {
         // AlextNet style split->conv*2->merge
         CreateFusedSplitConvMergePrimitive(layer);
@@ -2014,16 +2027,15 @@ std::cout << "Splitting layer: " << layer->name << "\n\tSize:" << CldnnTensorFro
 void CLDNNGraph::CreateFusedSplitConvMergePrimitive(InferenceEngine::CNNLayerPtr &layer) {
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
     // only handle the split->conv->merge topology for now
-    auto splitLayer = dynamic_cast<InferenceEngine::SplitLayer *> (layer.get());
+    auto splitLayer = as<InferenceEngine::SplitLayer *> (layer);
     IE_ASSERT(IsValidSplitConvMerge(splitLayer));
 
     auto convLayer1 =
-        dynamic_cast<InferenceEngine::ConvolutionLayer *> (GetNextSingleLayer(splitLayer->outData[0]).get());
+        as<InferenceEngine::ConvolutionLayer *> (GetNextSingleLayer(splitLayer->outData[0]));
     auto convLayer2 =
-        dynamic_cast<InferenceEngine::ConvolutionLayer *> (GetNextSingleLayer(splitLayer->outData[1]).get());
+        as<InferenceEngine::ConvolutionLayer *> (GetNextSingleLayer(splitLayer->outData[1]));
     auto concatLayer =
-        dynamic_cast<InferenceEngine::ConcatLayer *> (GetNextSingleLayer(
-            GetNextSingleLayer(splitLayer->outData[0])).get());
+        as<InferenceEngine::ConcatLayer *> (GetNextSingleLayer(GetNextSingleLayer(splitLayer->outData[0])));
 
     if (convLayer1 == nullptr ||
         convLayer2 == nullptr ||
@@ -2078,7 +2090,7 @@ void CLDNNGraph::CreateFusedSplitConvMergePrimitive(InferenceEngine::CNNLayerPtr
 void CLDNNGraph::CreatePowerPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto powerLayer = dynamic_cast<InferenceEngine::PowerLayer *> (layer.get());
+    auto powerLayer = as<InferenceEngine::PowerLayer *> (layer);
     if (powerLayer->power != 1.0f && powerLayer->power != 0.5f) {
         THROW_CLDNN_EXCEPTION("Power Layer " << layer->name << "uses unsupported power value");
     }
@@ -2130,7 +2142,7 @@ void CLDNNGraph::CreatePowerPrimitive(InferenceEngine::CNNLayerPtr &layer) {
 void CLDNNGraph::CreateSoftMaxPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto softmaxLayer = dynamic_cast<InferenceEngine::SoftMaxLayer *> (layer.get());
+    auto softmaxLayer = as<InferenceEngine::SoftMaxLayer *> (layer);
 
     // additional WA for clDNN FullyConnected output in BX instead of BF
     int inputOrder = 0;
@@ -2157,17 +2169,16 @@ void CLDNNGraph::CreateSoftMaxPrimitive(InferenceEngine::CNNLayerPtr &layer) {
 void CLDNNGraph::CreateFullyConnectedPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto fcLayer = dynamic_cast<InferenceEngine::FullyConnectedLayer *> (layer.get());
+    auto fcLayer = as<InferenceEngine::FullyConnectedLayer *> (layer);
 
     std::string fcLayerName = layer_type_name_ID(layer);
     // create bias primitive
     cldnn::primitive_id biasesPrimID = "";
     if (fcLayer->_biases != nullptr) {
         biasesPrimID = fcLayerName + m_biasesTag;
-        CreatePrimitiveFromBlob(biasesPrimID,
-            fcLayer->_biases,
-            cldnn::layout(DataTypeFromPrecision(fcLayer->precision), m_defaultFormat,
-                cldnn::spatial(TensorValue(fcLayer->_out_num))));
+        cldnn::layout fcbLayout(DataTypeFromPrecision(fcLayer->precision), m_defaultFormat,
+            cldnn::spatial(TensorValue(fcLayer->_out_num)));
+        CreatePrimitiveFromBlob(biasesPrimID, fcLayer->_biases, fcbLayout);
     }
 
     // create weights primitive
@@ -2188,9 +2199,8 @@ void CLDNNGraph::CreateFullyConnectedPrimitive(InferenceEngine::CNNLayerPtr &lay
         break;
     default: THROW_CLDNN_EXCEPTION("Invalid data dimensions");
     }
-    CreatePrimitiveFromBlob(weightsPrimID,
-                            fcLayer->_weights,
-                            cldnn::layout(DataTypeFromPrecision(fcLayer->precision), m_defaultFormat, weightsDims));
+    cldnn::layout fcwLayout(DataTypeFromPrecision(fcLayer->precision), m_defaultFormat, weightsDims);
+    CreatePrimitiveFromBlob(weightsPrimID, fcLayer->_weights, fcwLayout);
 
     auto fcPrim = cldnn::fully_connected(fcLayerName,
                                          inputPrimitives[0],
@@ -2207,7 +2217,7 @@ void CLDNNGraph::CreateFullyConnectedPrimitive(InferenceEngine::CNNLayerPtr &lay
 void CLDNNGraph::CreatePoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto poolLayer = dynamic_cast<InferenceEngine::PoolingLayer *> (layer.get());
+    auto poolLayer = as<InferenceEngine::PoolingLayer *> (layer);
 
     std::string poolLayerName = layer_type_name_ID(layer);
     auto allPads = getPaddings(*poolLayer);
@@ -2293,7 +2303,7 @@ void CLDNNGraph::CreatePoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) {
 void CLDNNGraph::CreateLRNPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto lrnLayer = dynamic_cast<InferenceEngine::NormLayer *> (layer.get());
+    auto lrnLayer = as<InferenceEngine::NormLayer *> (layer);
     std::string lrnLayerName = layer_type_name_ID(layer);
     auto lrnPrim = cldnn::lrn(
         lrnLayerName,
@@ -2403,7 +2413,7 @@ void CLDNNGraph::CreateActivationPrimitive(InferenceEngine::CNNLayerPtr &layer,
 void CLDNNGraph::CreateCopyPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto copyLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto copyLayer = as<InferenceEngine::GenericLayer*> (layer);
 
     // Optimize out and just update references
     std::string layerName = layer_type_name_ID(layer);
@@ -2415,7 +2425,7 @@ void CLDNNGraph::CreateUpsamplingPrimitive(InferenceEngine::CNNLayerPtr &layer)
     // Assuming multi-input will be handled by prev concat/eltwise layers
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto upsamplingLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto upsamplingLayer = as<InferenceEngine::GenericLayer*> (layer);
     uint32_t scale = upsamplingLayer->GetParamAsUInt("scale");
     uint32_t numFilter = upsamplingLayer->GetParamAsUInt("num_filter");
     std::string sampleType = upsamplingLayer->GetParamAsString("sample_type");
@@ -2436,7 +2446,7 @@ void CLDNNGraph::CreateUpsamplingPrimitive(InferenceEngine::CNNLayerPtr &layer)
 void CLDNNGraph::CreateResamplePrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto resampleLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto resampleLayer = as<InferenceEngine::GenericLayer*> (layer);
 
     auto outDims = layer->outData[0]->dims;
     size_t inFeatures = 1;
@@ -2472,7 +2482,7 @@ void CLDNNGraph::CreateResamplePrimitive(InferenceEngine::CNNLayerPtr &layer) {
 void CLDNNGraph::CreateYOLO2RegionPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto YOLOregionLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto YOLOregionLayer = as<InferenceEngine::GenericLayer*> (layer);
 
     uint32_t coords = YOLOregionLayer->GetParamAsUInt("coords", 4);
     uint32_t classes = YOLOregionLayer->GetParamAsUInt("classes", 20);
@@ -2503,7 +2513,7 @@ void CLDNNGraph::CreateYOLO2RegionPrimitive(InferenceEngine::CNNLayerPtr &layer)
 void CLDNNGraph::CreateYOLO2ReorgPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto YOLOreorgLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto YOLOreorgLayer = as<InferenceEngine::GenericLayer*> (layer);
     uint32_t stride = YOLOreorgLayer->GetParamAsUInt("stride");
 
     std::string YOLOreorgLayerName = layer_type_name_ID(layer);
@@ -2520,7 +2530,7 @@ void CLDNNGraph::CreateYOLO2ReorgPrimitive(InferenceEngine::CNNLayerPtr &layer)
 void CLDNNGraph::CreateArgMaxPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto ArgMaxLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto ArgMaxLayer = as<InferenceEngine::GenericLayer*> (layer);
     const cldnn::arg_max_min::out_type otype = cldnn::arg_max_min::out_type::max;
 
     if (HasParam(ArgMaxLayer->params, "out_max_val")) {
@@ -2565,7 +2575,7 @@ void CLDNNGraph::CreateArgMaxPrimitive(InferenceEngine::CNNLayerPtr &layer) {
 void CLDNNGraph::CreateMaxUnpoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 2);
 
-    auto UnpoolingLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto UnpoolingLayer = as<InferenceEngine::GenericLayer*> (layer);
 
     cldnn::primitive_id real_input, argmax_mutable;
 
@@ -2610,7 +2620,7 @@ void CLDNNGraph::CreateMaxUnpoolingPrimitive(InferenceEngine::CNNLayerPtr &layer
 void CLDNNGraph::CreateMVNPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto MvnLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto MvnLayer = as<InferenceEngine::GenericLayer*> (layer);
 
     bool across_channels = MvnLayer->GetParamsAsBool("across_channels", false);
     bool normalize_variance = MvnLayer->GetParamsAsBool("normalize_variance", true);
@@ -2632,7 +2642,7 @@ void CLDNNGraph::CreateMVNPrimitive(InferenceEngine::CNNLayerPtr &layer) {
 void CLDNNGraph::CreateTilePrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto tileLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto tileLayer = as<InferenceEngine::GenericLayer*> (layer);
 
     int axis = tileLayer->GetParamAsInt("axis", 1);
     int tiles = tileLayer->GetParamAsInt("tiles");
@@ -2661,7 +2671,7 @@ void CLDNNGraph::CreateTilePrimitive(InferenceEngine::CNNLayerPtr &layer) {
 void CLDNNGraph::CreatePadPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto padLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto padLayer = as<InferenceEngine::GenericLayer*> (layer);
 
     auto PadTensorFromArgs = [](const std::string &s) -> cldnn::tensor {
         std::stringstream ss(s);
@@ -2731,7 +2741,7 @@ void CLDNNGraph::CreateLSTMCellPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     cldnn::primitive_id weightID = layerName + m_weightsTag;
     cldnn::primitive_id recurrentID = layerName + "_recurrent" + m_weightsTag;
     cldnn::primitive_id biasID = layerName + m_biasesTag;
-    auto cellLayer = dynamic_cast<InferenceEngine::LSTMCell*> (layer.get());
+    auto cellLayer = as<InferenceEngine::LSTMCell*> (layer);
 
     /* check incoming CNN layer and setup required variables */
     {
@@ -2779,7 +2789,7 @@ void CLDNNGraph::CreateLSTMCellPrimitive(InferenceEngine::CNNLayerPtr &layer) {
         auto rmem = cldnn::memory::allocate(*(m_env.engine), RLayout);
         auto rtmpPointer = rmem.pointer<char>();
 
-        auto wLayer = dynamic_cast<InferenceEngine::WeightableLayer *> (layer.get());
+        auto wLayer = as<InferenceEngine::WeightableLayer *> (layer);
         auto pWeightsBlob = wLayer->_weights;
         auto blobBytes = static_cast<const char *>(pWeightsBlob->buffer());
         const size_t WchunkSz = lstm_input_size * elementSize;
@@ -2875,7 +2885,7 @@ void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     cldnn::primitive_id weightID = layerName + m_weightsTag;
     cldnn::primitive_id recurrentID = layerName + "_recurrent" + m_weightsTag;
     cldnn::primitive_id biasID = layerName + m_biasesTag;
-    auto rnnLayer = dynamic_cast<InferenceEngine::RNNSequenceLayer*> (layer.get());
+    auto rnnLayer = as<InferenceEngine::RNNSequenceLayer*> (layer);
     bool permute_input = (1 != rnnLayer->axis);
 
     /* check incoming CNN layer and setup required variables */
@@ -2938,7 +2948,7 @@ void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) {
         auto rmem = cldnn::memory::allocate(*(m_env.engine), RLayout);
         auto rtmpPointer = rmem.pointer<char>();
 
-        auto wLayer = dynamic_cast<InferenceEngine::WeightableLayer *> (layer.get());
+        auto wLayer = as<InferenceEngine::WeightableLayer *> (layer);
         auto pWeightsBlob = wLayer->_weights;
         auto blobBytes = static_cast<const char *>(pWeightsBlob->buffer());
         const size_t WchunkSz = lstm_input_size * elementSize;
@@ -3107,7 +3117,7 @@ void CLDNNGraph::AddConstantBlobInput(InferenceEngine::CNNLayerPtr &layer) {
 void CLDNNGraph::CreateConvolutionPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 1);
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto convLayer = dynamic_cast<InferenceEngine::ConvolutionLayer *> (layer.get());
+    auto convLayer = as<InferenceEngine::ConvolutionLayer *> (layer);
 
     std::vector<cldnn::primitive_id> weightPrimID;
     std::vector<cldnn::primitive_id> biasPrimID;
@@ -3156,7 +3166,7 @@ void CLDNNGraph::CreateGatherPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     ValidateLayer(layer, 2);
 
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto gatherLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto gatherLayer = as<InferenceEngine::GenericLayer*> (layer);
 
     int axis = gatherLayer->GetParamAsInt("axis", 0);
 
@@ -3191,7 +3201,7 @@ void CLDNNGraph::CreateDepthToSpacePrimitive(InferenceEngine::CNNLayerPtr &layer
     ValidateLayer(layer, 1);
 
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto depthToSpace = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto depthToSpace = as<InferenceEngine::GenericLayer*> (layer);
 
     size_t blockSize = depthToSpace->GetParamAsInt("block_size", 2);
 
@@ -3218,7 +3228,7 @@ void CLDNNGraph::CreateShuffleChannelsPrimitive(InferenceEngine::CNNLayerPtr &la
     ValidateLayer(layer, 1);
 
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto shuffleChannels = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto shuffleChannels = as<InferenceEngine::GenericLayer*> (layer);
     const int32_t numberOfDims = shuffleChannels->input()->getDims().size();
 
     int32_t group = shuffleChannels->GetParamAsInt("group", 1);
@@ -3252,7 +3262,7 @@ void CLDNNGraph::CreateShuffleChannelsPrimitive(InferenceEngine::CNNLayerPtr &la
 
 void CLDNNGraph::CreateStridedSlicePrimitive(InferenceEngine::CNNLayerPtr &layer) {
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto stridedSliceLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto stridedSliceLayer = as<InferenceEngine::GenericLayer*> (layer);
 
     auto tmp = stridedSliceLayer->GetParamAsUInts("end_mask");
     std::vector<uint8_t> end_mask(tmp.begin(), tmp.end());
@@ -3278,7 +3288,7 @@ void CLDNNGraph::CreateReverseSequencePrimitive(InferenceEngine::CNNLayerPtr &la
     ValidateLayer(layer, 2);
 
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
-    auto reverseSequence = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+    auto reverseSequence = as<InferenceEngine::GenericLayer*> (layer);
     const int32_t numberOfDims = reverseSequence->input()->getDims().size();
 
     const auto input = reverseSequence->insData[0].lock()->getDims();
@@ -3329,9 +3339,9 @@ bool CLDNNGraph::IsValidSplitConvMerge(const InferenceEngine::SplitLayer *splitL
     }
 
     auto convLayer1 =
-        dynamic_cast<InferenceEngine::ConvolutionLayer *> (GetNextSingleLayer(splitLayer->outData[0]).get());
+        as<InferenceEngine::ConvolutionLayer *> (GetNextSingleLayer(splitLayer->outData[0]));
     auto convLayer2 =
-        dynamic_cast<InferenceEngine::ConvolutionLayer *> (GetNextSingleLayer(splitLayer->outData[1]).get());
+        as<InferenceEngine::ConvolutionLayer *> (GetNextSingleLayer(splitLayer->outData[1]));
     if (!convLayer1 || !convLayer2) {   // outputs aren't convolutions
         return false;
     }
@@ -3353,8 +3363,8 @@ bool CLDNNGraph::IsValidSplitConvMerge(const InferenceEngine::SplitLayer *splitL
         return false;
     }
     auto concatLayer =
-        dynamic_cast<InferenceEngine::ConcatLayer *> (
-                GetNextSingleLayer(GetNextSingleLayer(splitLayer->outData[0])).get());
+        as<InferenceEngine::ConcatLayer *> (
+                GetNextSingleLayer(GetNextSingleLayer(splitLayer->outData[0])));
     if (!concatLayer ||                         // not a merge layer
         concatLayer->_axis != 1 ||              // merge on unsupported axis
         concatLayer->outData.size() != 1) {     // too many outputs
@@ -3696,12 +3706,13 @@ void CLDNNGraph::CreateGenericLayerBlobPrimitives(const InferenceEngine::Generic
         if (blob.second->dims().size() != 1) {
             THROW_CLDNN_EXCEPTION("Unhandled blob dim in layer " + layer->name);
         }
-        CreatePrimitiveFromBlob(
-            layer->type + ":" + layer->name + "_" + blob.first + m_weightsTag,
-            blob.second,
-            cldnn::layout(
-                DataTypeFromPrecision(blob.second->precision()),
-                m_defaultFormat, cldnn::spatial(TensorValue(blob.second->dims()[0]))));
+
+        cldnn::layout genericLayout(DataTypeFromPrecision(blob.second->precision()),
+                                    m_defaultFormat,
+                                    cldnn::spatial(TensorValue(blob.second->dims()[0])));
+
+        CreatePrimitiveFromBlob(layer->type + ":" + layer->name + "_" + blob.first + m_weightsTag,
+                                blob.second, genericLayout);
     }
 }
 
index 0ea0649..f6391d5 100644 (file)
@@ -195,7 +195,7 @@ protected:
     static cldnn::softmax::dimension_t SoftmaxDimensionFromIEAxis(const InferenceEngine::SoftMaxLayer* softmaxLayer, bool isPrevFC = false);
     void CreatePrimitiveFromBlob(cldnn::primitive_id primID,
                                  const InferenceEngine::Blob::Ptr pBlob,
-                                 cldnn::layout blobLayout,
+                                 const cldnn::layout& blobLayout,
                                  size_t blobByteOffset = 0,
                                  WeightRearrangeType rearrange = NO_REARRANGE);
     void CreateWeightAndBiasPrimitives(const InferenceEngine::CNNLayerPtr& layer,
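
This signature change completes the refactor seen at the call sites earlier in the diff: once `CreatePrimitiveFromBlob` is handed a named `cldnn::layout` local (`fcbLayout`, `fcwLayout`, `genericLayout`) instead of a nested temporary, taking the parameter by const reference avoids copying it on every call. A minimal illustration with a stand-in type (`FakeLayout` is not the real `cldnn::layout`):

    #include <cstdio>
    #include <utility>
    #include <vector>

    // Stand-in for a non-trivial value type such as cldnn::layout.
    struct FakeLayout {
        std::vector<int> dims;
        explicit FakeLayout(std::vector<int> d) : dims(std::move(d)) {}
        FakeLayout(const FakeLayout& other) : dims(other.dims) {
            std::puts("copied");  // makes the by-value cost visible
        }
    };

    void createByValue(FakeLayout l) { (void)l; }        // old signature style
    void createByRef(const FakeLayout& l) { (void)l; }   // new signature style

    int main() {
        FakeLayout fcwLayout({1, 3, 224, 224});  // named local, easy to inspect
        createByValue(fcwLayout);  // prints "copied"
        createByRef(fcwLayout);    // prints nothing
    }
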
index c903a4f..6b0ea70 100644 (file)
@@ -359,7 +359,7 @@ void CLDNNInferRequest::SetBatch(int new_batch) {
     m_curBatch = new_batch;
 }
 
-CLDNNInferRequest::CLDNNInferRequest(InferenceEnv env, bool useProfiling,
+CLDNNInferRequest::CLDNNInferRequest(const InferenceEnv& env, bool useProfiling,
                                      InputsDataMap networkInputs, OutputsDataMap networkOutputs)
         : InferRequestInternal(networkInputs, networkOutputs),
           m_env(env),
index 375d707..4040c08 100644 (file)
@@ -27,7 +27,7 @@ public:
     void
     GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) const override;
 
-    CLDNNInferRequest(InferenceEnv env, bool useProfiling,
+    CLDNNInferRequest(const InferenceEnv& env, bool useProfiling,
                       InferenceEngine::InputsDataMap networkInputs, InferenceEngine::OutputsDataMap networkOutputs);
 
     CLDNNInferRequest(const CLDNNInferRequest &) = delete;
index 0e20681..6048262 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 39412b3..6c1f243 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 297f586..ec71223 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index aea45e9..45dd434 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 255e1ad..622ca49 100644 (file)
@@ -106,7 +106,7 @@ private:
     void pad_symmetric(const float *src_data, float* dst_data);
 
     PadMode padMode = CONSTANT;
-    float pad_value;
+    float pad_value = 0.f;
     SizeVector src_dims;
     SizeVector dst_dims;
     std::vector<unsigned int> pads_begin;
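
The in-class initializer here is one instance of a pattern applied throughout this patch (`MemRequest::_offset`, the `GNAModelSerial::RuntimeEndPoint` fields, `GNAPlugin::num_memory_bytes` and `num_bytes_weight` below): give the member a safe default at its declaration so no constructor path can leave it indeterminate. Reduced to a self-contained sketch:

    #include <iostream>

    struct PadImplBefore {
        float pad_value;        // indeterminate unless every ctor sets it
    };

    struct PadImplAfter {
        float pad_value = 0.f;  // default member initializer: always defined
    };

    int main() {
        PadImplAfter p;                    // implicit ctor is now enough
        std::cout << p.pad_value << "\n";  // prints 0
    }
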
index a8e668b..b3c856e 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index e431d49..4b264ab 100644 (file)
@@ -385,93 +385,100 @@ public:
             roi_indices_.resize(post_nms_topn_);
             addConfig(layer, {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)},
                       {DataConfigurator(ConfLayout::PLN)});
-        } catch (InferenceEngine::details::InferenceEngineException &ex) {
+        } catch (const InferenceEngine::details::InferenceEngineException &ex) {
             errorMsg = ex.what();
         }
     }
 
     StatusCode execute(std::vector<Blob::Ptr> &inputs, std::vector<Blob::Ptr> &outputs,
                        ResponseDesc *resp) noexcept override {
-        if (inputs.size() != 3 || outputs.empty()) {
+        try {
+            if (inputs.size() != 3 || outputs.empty()) {
+                THROW_IE_EXCEPTION << "Incorrect number of input or output edges!";
+            }
+
+            // Prepare memory
+            const float *p_bottom_item = inputs[0]->buffer();
+            const float *p_d_anchor_item = inputs[1]->buffer();
+            const float *p_img_info_cpu = inputs[2]->buffer();
+            float *p_roi_item = outputs[0]->buffer();
+
+            size_t img_info_size = inputs[2]->getTensorDesc().getDims()[1];
+
+            // No second output so ignoring this
+            // Dtype* p_score_item = (top.size() > 1) ? top[1]->mutable_cpu_data() : NULL;
+
+            // bottom shape: (2 x num_anchors) x H x W
+            const int bottom_H = inputs[0]->getTensorDesc().getDims()[2];
+            const int bottom_W = inputs[0]->getTensorDesc().getDims()[3];
+
+            // input image height & width
+            const float img_H = p_img_info_cpu[swap_xy ? 1 : 0];
+            const float img_W = p_img_info_cpu[swap_xy ? 0 : 1];
+
+            // scale factor for height & width
+            const float scale_H = p_img_info_cpu[2];
+            const float scale_W = img_info_size > 3 ? p_img_info_cpu[3] : scale_H;
+
+            // minimum box width & height
+            const float min_box_H = min_size_ * scale_H;
+            const float min_box_W = min_size_ * scale_W;
+
+            // number of all proposals = num_anchors * H * W
+            const int num_proposals = anchors_shape_0 * bottom_H * bottom_W;
+
+            // number of top-n proposals before NMS
+            const int pre_nms_topn = std::min<int>(num_proposals, pre_nms_topn_);
+
+            // number of final RoIs
+            int num_rois = 0;
+
+            // enumerate all proposals
+            //   num_proposals = num_anchors * H * W
+            //   (x1, y1, x2, y2, score) for each proposal
+            // NOTE: for bottom, only foreground scores are passed
+            struct ProposalBox {
+                float x0;
+                float y0;
+                float x1;
+                float y1;
+                float score;
+            };
+            std::vector<ProposalBox> proposals_(num_proposals);
+            std::vector<float> unpacked_boxes(4 * pre_nms_topn);
+            std::vector<int> is_dead(pre_nms_topn);
+
+            // Execute
+            int nn = inputs[0]->getTensorDesc().getDims()[0];
+            for (int n = 0; n < nn; ++n) {
+                enumerate_proposals_cpu(p_bottom_item + num_proposals + n * num_proposals * 2,
+                                        p_d_anchor_item + n * num_proposals * 4,
+                                        &anchors_[0], reinterpret_cast<float *>(&proposals_[0]),
+                                        anchors_shape_0, bottom_H, bottom_W, img_H, img_W,
+                                        min_box_H, min_box_W, feat_stride_,
+                                        box_coordinate_scale_, box_size_scale_,
+                                        coordinates_offset, initial_clip, swap_xy, clip_before_nms);
+                std::partial_sort(proposals_.begin(), proposals_.begin() + pre_nms_topn, proposals_.end(),
+                                  [](const ProposalBox &struct1, const ProposalBox &struct2) {
+                                      return (struct1.score > struct2.score);
+                                  });
+
+                unpack_boxes(reinterpret_cast<float *>(&proposals_[0]), &unpacked_boxes[0], pre_nms_topn);
+                nms_cpu(pre_nms_topn, &is_dead[0], &unpacked_boxes[0], &roi_indices_[0], &num_rois, 0, nms_thresh_,
+                        post_nms_topn_, coordinates_offset);
+                retrieve_rois_cpu(num_rois, n, pre_nms_topn, &unpacked_boxes[0], &roi_indices_[0],
+                                  p_roi_item + n * post_nms_topn_ * 5,
+                                  post_nms_topn_, normalize_, img_H, img_W, clip_after_nms);
+            }
+
+            return OK;
+        } catch (const InferenceEngine::details::InferenceEngineException& e) {
             if (resp) {
-                std::string errorMsg = "Incorrect number of input or output edges!";
+                std::string errorMsg = e.what();
                 errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
             }
             return GENERAL_ERROR;
         }
-
-        // Prepare memory
-        const float* p_bottom_item = inputs[0]->buffer();
-        const float* p_d_anchor_item = inputs[1]->buffer();
-        const float* p_img_info_cpu = inputs[2]->buffer();
-        float* p_roi_item = outputs[0]->buffer();
-
-        size_t img_info_size = inputs[2]->getTensorDesc().getDims()[1];
-
-        // No second output so ignoring this
-        // Dtype* p_score_item = (top.size() > 1) ? top[1]->mutable_cpu_data() : NULL;
-
-        // bottom shape: (2 x num_anchors) x H x W
-        const int bottom_H = inputs[0]->getTensorDesc().getDims()[2];
-        const int bottom_W = inputs[0]->getTensorDesc().getDims()[3];
-
-        // input image height & width
-        const float img_H = p_img_info_cpu[swap_xy ? 1 : 0];
-        const float img_W = p_img_info_cpu[swap_xy ? 0 : 1];
-
-        // scale factor for height & width
-        const float scale_H = p_img_info_cpu[2];
-        const float scale_W = img_info_size > 3 ? p_img_info_cpu[3] : scale_H;
-
-        // minimum box width & height
-        const float min_box_H = min_size_ * scale_H;
-        const float min_box_W = min_size_ * scale_W;
-
-        // number of all proposals = num_anchors * H * W
-        const int num_proposals = anchors_shape_0 * bottom_H * bottom_W;
-
-        // number of top-n proposals before NMS
-        const int pre_nms_topn = std::min<int>(num_proposals, pre_nms_topn_);
-
-        // number of final RoIs
-        int num_rois = 0;
-
-        // enumerate all proposals
-        //   num_proposals = num_anchors * H * W
-        //   (x1, y1, x2, y2, score) for each proposal
-        // NOTE: for bottom, only foreground scores are passed
-        struct ProposalBox {
-            float x0;
-            float y0;
-            float x1;
-            float y1;
-            float score;
-        };
-        std::vector<ProposalBox> proposals_(num_proposals);
-        std::vector<float> unpacked_boxes(4 * pre_nms_topn);
-        std::vector<int> is_dead(pre_nms_topn);
-
-        // Execute
-        int nn = inputs[0]->getTensorDesc().getDims()[0];
-        for (int n = 0; n < nn; ++n) {
-            enumerate_proposals_cpu(p_bottom_item + num_proposals + n*num_proposals*2, p_d_anchor_item + n*num_proposals*4,
-                                    &anchors_[0], reinterpret_cast<float *>(&proposals_[0]),
-                                    anchors_shape_0, bottom_H, bottom_W, img_H, img_W,
-                                    min_box_H, min_box_W, feat_stride_,
-                                    box_coordinate_scale_, box_size_scale_,
-                                    coordinates_offset, initial_clip, swap_xy, clip_before_nms);
-            std::partial_sort(proposals_.begin(), proposals_.begin() + pre_nms_topn, proposals_.end(),
-                              [](const ProposalBox& struct1, const ProposalBox& struct2) {
-                                  return (struct1.score > struct2.score);
-                              });
-
-            unpack_boxes(reinterpret_cast<float *>(&proposals_[0]), &unpacked_boxes[0], pre_nms_topn);
-            nms_cpu(pre_nms_topn, &is_dead[0], &unpacked_boxes[0], &roi_indices_[0], &num_rois, 0, nms_thresh_, post_nms_topn_, coordinates_offset);
-            retrieve_rois_cpu(num_rois, n, pre_nms_topn, &unpacked_boxes[0], &roi_indices_[0], p_roi_item + n*post_nms_topn_*5,
-                              post_nms_topn_, normalize_, img_H, img_W, clip_after_nms);
-        }
-
-        return OK;
     }
 
 private:
@@ -507,16 +514,20 @@ public:
     // set output shapes by input shapes.
     StatusCode getShapes(const std::vector<TensorDesc>& inShapes, std::vector<TensorDesc>& outShapes,
                          ResponseDesc *resp) noexcept override {
-        if (inShapes.size() != 1) {
+        try {
+            if (inShapes.size() != 1) {
+                THROW_IE_EXCEPTION << "Incorrect input shapes!";
+            }
+            outShapes.clear();
+            outShapes.emplace_back(cnnLayer.precision, inShapes[0].getDims(), inShapes[0].getLayout());
+            return OK;
+        } catch (const InferenceEngine::details::InferenceEngineException& e) {
             if (resp) {
-                std::string errorMsg = "Incorrect input shapes!";
+                std::string errorMsg = e.what();
                 errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
             }
             return GENERAL_ERROR;
         }
-        outShapes.clear();
-        outShapes.emplace_back(cnnLayer.precision, inShapes[0].getDims(), inShapes[0].getLayout());
-        return OK;
     }
 };
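
Both `execute` and `getShapes` in this extension are rewritten to the same error-handling shape (the identical `getShapes` change repeats in the next file): do the work inside a `try`, signal failures by throwing, and translate any exception into `GENERAL_ERROR` with the message copied into the caller-owned `ResponseDesc`. A condensed sketch with stand-in types (the real `StatusCode`, `ResponseDesc`, and exception class come from the Inference Engine headers):

    #include <stdexcept>
    #include <string>

    enum StatusCode { OK = 0, GENERAL_ERROR = -1 };
    struct ResponseDesc { char msg[256] = {}; };

    StatusCode doWork(int numInputs, ResponseDesc* resp) noexcept {
        try {
            if (numInputs != 3) {
                throw std::runtime_error("Incorrect number of input or output edges!");
            }
            // ... actual computation lives here, free to throw ...
            return OK;
        } catch (const std::exception& e) {
            if (resp) {
                std::string errorMsg = e.what();
                // std::string::copy does not null-terminate; msg is zeroed above.
                errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
            }
            return GENERAL_ERROR;
        }
    }

One benefit of this shape over the old early-return version: validation failures and failures from deep inside the computation now take the same exit path, so the `noexcept` promise of the interface holds for both.
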
 
index 43ce9a0..18cc8b3 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -422,16 +422,20 @@ public:
     // set output shapes by input shapes.
     StatusCode getShapes(const std::vector<TensorDesc>& inShapes, std::vector<TensorDesc>& outShapes,
                          ResponseDesc *resp) noexcept override {
-        if (inShapes.size() != 1) {
+        try {
+            if (inShapes.size() != 1) {
+                THROW_IE_EXCEPTION << "Incorrect input shapes!";
+            }
+            outShapes.clear();
+            outShapes.emplace_back(cnnLayer.precision, inShapes[0].getDims(), inShapes[0].getLayout());
+            return OK;
+        } catch (const InferenceEngine::details::InferenceEngineException& e) {
             if (resp) {
-                std::string errorMsg = "Incorrect input shapes!";
+                std::string errorMsg = e.what();
                 errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
             }
             return GENERAL_ERROR;
         }
-        outShapes.clear();
-        outShapes.emplace_back(cnnLayer.precision, inShapes[0].getDims(), inShapes[0].getLayout());
-        return OK;
     }
 };
 
index d438df8..995924f 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 5780ef2..b536923 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 8c7a096..122a313 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 79b23da..9988eeb 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index e00bc0a..23cdb3d 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index a745031..88d6553 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 4a94059..4773770 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 0584bd5..959988e 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 0fda31c..1b986ab 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 76f94cb..f7e9ce7 100644 (file)
@@ -1,8 +1,7 @@
 // Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-// dnn.cpp : component based neural network class for ease of use
-//
+
 extern bool global_debug;
 
 #include <cstdlib>
index 0d89a2d..38bfcb0 100644 (file)
@@ -252,7 +252,9 @@ class AmIntelDnn {
           ptr_sumgroup_sizes(NULL),
           num_sumgroup_sizes(0),
           ptr_priors(NULL),
-          ptr_dnn_memory_(NULL) {
+          ptr_dnn_memory_(NULL),
+          num_bytes_dnn_memory_(0),
+          number_type_(kDnnNumNumberType) {
     }
 
     ~AmIntelDnn() {
index dec7907..7caecf2 100644 (file)
@@ -1,8 +1,6 @@
 // Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-// dnn_memory.cpp : memory manipulation routines
-//
 
 #include <cstdio>
 #include <cstdlib>
index 43720f7..4c8551d 100644 (file)
@@ -1,7 +1,6 @@
 // Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-// dnn_memory.hpp : memory manipulation routines
 
 #pragma once
 
index 98238df..fe39a42 100644 (file)
@@ -1,8 +1,6 @@
 // Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-// dnn_traits.hpp : c++ trait approach to  define dnn objects
-//
 
 #pragma once
 
index 72f3b3e..f662b80 100644 (file)
@@ -1,8 +1,6 @@
 // Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-// floatmath.cpp : unoptimized floating point math routines (for reference)
-//
 
 #include "floatmath.h"
 #include "pwl.h"
index 1328ef5..8b5c7e6 100644 (file)
@@ -43,6 +43,7 @@ class CPPWrapper<intel_nnet_type_t> {
         for (int i = 0; i < obj.nLayers; i++) {
             obj.pLayers[i].pLayerStruct = nullptr;
         }
+        obj.nGroup = 0;
     }
     ~CPPWrapper() {
         for (int i = 0; i < obj.nLayers; i++) {
index 344d44e..626383b 100644 (file)
@@ -115,6 +115,8 @@ void GNADeviceHelper::updateGnaPerfCounters() {
 void GNADeviceHelper::getGnaPerfCounters(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo>& retPerfCounters) {
     InferenceEngine::InferenceEngineProfileInfo info;
     info.status = InferenceEngine::InferenceEngineProfileInfo::EXECUTED;
+    info.cpu_uSec = 0;
+    info.execution_index = 0;
 
     // Hardware
     info.realTime_uSec = nGNAPerfResultsTotal.hw.total;
index 7d26aaf..20b749f 100644 (file)
@@ -1,8 +1,6 @@
 // Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-//  gna_helper.cpp : various GNA-related utility functions
-//
 
 #include "lstm.hpp"
 
index 5851a86..770a82e 100644 (file)
@@ -100,12 +100,16 @@ class LayerInfo {
     bool isEltwiseSum() const noexcept {
         IS_VALID();
         if (!isEltwise()) return false;
-        return dynamic_cast<const InferenceEngine::EltwiseLayer*>(layer)->_operation ==
-            InferenceEngine::EltwiseLayer::Sum;
+        // dynamic_cast<const InferenceEngine::EltwiseLayer *>(layer) is validated in isEltwise function
+        // coverity[var_deref_op]
+        return dynamic_cast<const InferenceEngine::EltwiseLayer *>(layer)->_operation ==
+               InferenceEngine::EltwiseLayer::Sum;
     }
     bool isEltwiseMul() const noexcept {
         IS_VALID();
         if (!isEltwise()) return false;
+        // dynamic_cast<const InferenceEngine::EltwiseLayer *>(layer) is validated in isEltwise function
+        // coverity[var_deref_op]
         return dynamic_cast<const InferenceEngine::EltwiseLayer*>(layer)->_operation ==
             InferenceEngine::EltwiseLayer::Prod;
     }
@@ -156,8 +160,13 @@ class LayerInfo {
     }
     bool isCropAffined() const noexcept {
         auto cropLayer = dynamic_cast<InferenceEngine::CropLayer *> (layer);
-        size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size();
-        return (ALIGN64(cropOffset) != cropOffset);
+        if (cropLayer != nullptr && !cropLayer->offset.empty()) {
+            try {
+                size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size();
+                return (ALIGN64(cropOffset) != cropOffset);
+            } catch (InferenceEngine::details::InferenceEngineException& e) {}
+        }
+        return false;
     }
     bool isCopy() const noexcept {
         IS_VALID();
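
The old `isCropAffined` dereferenced the `dynamic_cast` result unconditionally and called `.back()` on a possibly empty vector. The fixed shape, reduced to a standalone sketch (types and the `ALIGN64` macro are simplified stand-ins):

    #include <vector>

    struct Layer { virtual ~Layer() = default; };
    struct CropLayer : Layer { std::vector<int> offset; int precisionSize = 4; };

    constexpr int ALIGN64(int n) { return (n + 63) & ~63; }

    bool isCropAffined(const Layer* layer) noexcept {
        // dynamic_cast yields nullptr when layer is not a CropLayer; checking
        // it and the vector avoids the old undefined behavior on mismatch.
        auto cropLayer = dynamic_cast<const CropLayer*>(layer);
        if (cropLayer != nullptr && !cropLayer->offset.empty()) {
            int cropOffset = cropLayer->offset.back() * cropLayer->precisionSize;
            return ALIGN64(cropOffset) != cropOffset;
        }
        return false;
    }

    int main() {
        CropLayer crop;
        crop.offset = {7};                    // 7 * 4 = 28, not 64-aligned
        return isCropAffined(&crop) ? 0 : 1;  // returns 0 (true path)
    }
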
index 99d0731..e15e340 100644 (file)
@@ -36,7 +36,7 @@ struct MemRequest {
     uint8_t _element_size;
     size_t _num_elements;
     size_t _alignment;
-    size_t _offset;
+    size_t _offset = 0;
     // expansion in bytes due to large dependent layers
     size_t _padding = 0;
     MemRequest(rRegion region,
index 30be460..ef4a4f3 100644 (file)
@@ -106,19 +106,19 @@ class GNAModelSerial {
         /**
         * if scale factor is different than passed into infer, network might need to be requantized
          */
-        float scaleFactor;
+        float scaleFactor = 0;
         /**
          * Pointer descriptor
          */
-        void* descriptor_ptr;
+        void* descriptor_ptr = nullptr;
         /**
          * Endpoint resolution in bytes.
          */
-        uint32_t element_size;
+        uint32_t element_size = 0;
         /**
          * Number of elements
          */
-        uint32_t elements_count;
+        uint32_t elements_count = 0;
 
         RuntimeEndPoint() = default;
         RuntimeEndPoint(double scaleFactor,
index fc57d52..603b1f3 100644 (file)
@@ -1275,7 +1275,12 @@ void GNAPlugin::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
         THROW_GNA_EXCEPTION << "Activation function type not yet supported: " << type;
     }
     auto activation_type = DnnActivation::fromType(it->second);
-    activation_type.negative_slope = (it->second == kActRelu) ? dynamic_cast<ReLULayer*>(layer.get())->negative_slope : 0.0f;
+    if (it->second == kActRelu) {
+        auto reluLayer = dynamic_cast<ReLULayer *>(layer.get());
+        activation_type.negative_slope = reluLayer != nullptr ? reluLayer->negative_slope : 0.0f;
+    } else {
+        activation_type.negative_slope = 0.0f;
+    }
 
     // TODO: need to take graph dependency instead of linear
     auto &prevComponent = dnnComponentsForLayer.back().second;
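
Same defensive idea as the `isCropAffined` fix above: instead of dereferencing the cast result inline, the ReLU slope read now falls back to a safe default when the cast fails. In miniature (stand-in types, not the Inference Engine classes):

    #include <memory>

    struct CNNLayer { virtual ~CNNLayer() = default; };
    struct ReLULayer : CNNLayer { float negative_slope = 0.01f; };

    float negativeSlopeOf(const std::shared_ptr<CNNLayer>& layer) {
        auto relu = dynamic_cast<ReLULayer*>(layer.get());
        return relu != nullptr ? relu->negative_slope : 0.0f;  // safe fallback
    }

    int main() {
        std::shared_ptr<CNNLayer> layer = std::make_shared<ReLULayer>();
        return negativeSlopeOf(layer) > 0.0f ? 0 : 1;
    }
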
@@ -1649,20 +1654,23 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
     for (auto layer = sortedNoMem.begin(); layer != sortedNoMem.end(); ++layer) {
         CreateLayerPrimitive(*layer);
     }
+    if (dnnComponentsForLayer.empty()) {
+        THROW_GNA_EXCEPTION << "No outputs found in dnn components structure";
+    }
+
     DnnComponentsForLayer::iterator output_component = std::find_if(dnnComponentsForLayer.begin(),
                                                         dnnComponentsForLayer.end(),
                                                         [&](const std::pair<std::string, intel_dnn_component_t>& v)
                                                         { return outputsDataMap.begin()->first == v.first; });
 
     if (output_component == dnnComponentsForLayer.end()) {
-        if (dnnComponentsForLayer.empty()) {
-            THROW_GNA_EXCEPTION << "No outputs found in internal structures";
-        }
         // likely layer is fused. Take last one
-        output_component = std::prev(dnnComponentsForLayer.end());
+        auto it = dnnComponentsForLayer.begin();
+        std::advance(it, dnnComponentsForLayer.size() - 1);
+        output_component = it;
         gnalog() << "Output layer "<< outputsDataMap.begin()->first
-                    << " has not been found in component list. Took  "
-                    << output_component->first << " instead \n" << std::flush;
+            << " has not been found in component list. Took  "
+            << output_component->first << " instead \n" << std::flush;
     }
     gnamem->bind_ptr(&ptr_outputs_global.front(), &output_component->second.ptr_outputs);
 
@@ -1775,6 +1783,10 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
     orientation_out = output_component->second.orientation_out;
     num_bytes_per_output = output_component->second.num_bytes_per_output;
 
+    if (sortedNet.empty()) {
+        THROW_GNA_EXCEPTION << "Sorted network is empty";
+    }
+
     // find output layer
     auto output = std::find_if(sortedNet.begin(),
                                 sortedNet.end(),
@@ -1782,7 +1794,9 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
                                 { return outputsDataMap.begin()->first == v.get()->name; });
     if (output == sortedNet.end()) {
         // likely layer is fused. Take last one
-        output = std::prev(sortedNet.end());
+        auto it = sortedNet.begin();
+        std::advance(it, sortedNet.size() - 1);
+        output = it;
     }
     auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(*output);
     output_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
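
Two places in `LoadNetwork` replace `std::prev(container.end())` with a begin-plus-`std::advance` walk, each now protected by an emptiness check hoisted earlier (`dnnComponentsForLayer.empty()` and `sortedNet.empty()`). On a non-empty bidirectional container the two spellings land on the same element; the guard is what actually matters, since stepping back from `end()` of an empty container is undefined. A tiny sketch:

    #include <iostream>
    #include <iterator>
    #include <list>

    int main() {
        std::list<int> components = {1, 2, 3};

        if (components.empty()) return 1;  // guard first, as the patch does

        auto it = components.begin();
        std::advance(it, components.size() - 1);  // lands on the last element
        std::cout << *it << "\n";                 // prints 3

        // Equivalent for a non-empty list: auto last = std::prev(components.end());
    }
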
@@ -2461,36 +2475,38 @@ void GNAPlugin::connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr, voi
                                             [&name](GNAPlugin::GNAConcatLayer::ConcatConnectedLayerInfo &item) {
                                                 return item.name == name;
                                             });
-                    // reserve full size for concat
-                    if (!concatLayerInfoItem.output_allocation_flag) {
-                        // check if this concat is being included by other one
-                        // by going thru each concat and checking inputs
-                        auto included =
-                            std::find_if(concat_connection.begin(),
-                                           concat_connection.end(),
-                               [&concatLayerInfo]
-                                    (const std::pair<std::string, GNAPlugin::GNAConcatLayer> &concatItem) -> bool {
-                                        auto it = std::find_if(concatItem.second.concatInputLayers.begin(),
-                                                        concatItem.second.concatInputLayers.end(),
-                                                        [&concatLayerInfo]
-                                                            (const GNAPlugin::GNAConcatLayer::ConcatConnectedLayerInfo &item) -> bool {
-                                                                            return item.name == concatLayerInfo->first;
-                                                            });
-                                        return it != concatItem.second.concatInputLayers.end();
-                                    });
-                        if (included == concat_connection.end()) {
-                            gnamem->reserve_ptr(&concatLayerInfoItem.gna_ptr, ALIGN64(concatLayerInfoItem.reserved_size));
-
-                            for (auto && inputLayer : concatLayerInfoItem.concatInputLayers) {
-                                if ( InferenceEngine::details::CaselessEq<std::string>()
-                                                                    (inputLayer.name, "input") ) {
-                                    bytes_alllocated_for_input[inputLayer.name] = ALIGN64(concatLayerInfoItem.reserved_size) - inputLayer.offset;
+                    if (it != concatLayerInfoItem.concatInputLayers.end()) {
+                        // reserve full size for concat
+                        if (!concatLayerInfoItem.output_allocation_flag) {
+                            // check if this concat is being included by other one
+                            // by going thru each concat and checking inputs
+                            auto included =
+                                    std::find_if(concat_connection.begin(),
+                                                 concat_connection.end(),
+                                                 [&concatLayerInfo]
+                                                         (const std::pair<std::string, GNAPlugin::GNAConcatLayer> &concatItem) -> bool {
+                                                     auto it = std::find_if(concatItem.second.concatInputLayers.begin(),
+                                                                            concatItem.second.concatInputLayers.end(),
+                                                                            [&concatLayerInfo]
+                                                                                    (const GNAPlugin::GNAConcatLayer::ConcatConnectedLayerInfo &item) -> bool {
+                                                                                return item.name == concatLayerInfo->first;
+                                                                            });
+                                                     return it != concatItem.second.concatInputLayers.end();
+                                                 });
+                            if (included == concat_connection.end()) {
+                                gnamem->reserve_ptr(&concatLayerInfoItem.gna_ptr, ALIGN64(concatLayerInfoItem.reserved_size));
+
+                                for (auto &&inputLayer : concatLayerInfoItem.concatInputLayers) {
+                                    if (InferenceEngine::details::CaselessEq<std::string>()
+                                            (inputLayer.name, "input")) {
+                                        bytes_alllocated_for_input[inputLayer.name] = ALIGN64(concatLayerInfoItem.reserved_size) - inputLayer.offset;
+                                    }
                                 }
                             }
+                            concatLayerInfo->second.output_allocation_flag = true;
                         }
-                        concatLayerInfo->second.output_allocation_flag = true;
+                        gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, it->offset);
                     }
-                    gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, it->offset);
                 } else {
                     // error
                 }
index 34bc866..a2617c2 100644 (file)
@@ -67,7 +67,7 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std::
 
 
     uint32_t num_feature_maps = 1;
-    uint32_t num_memory_bytes;
+    uint32_t num_memory_bytes = 0;
 
     std::unordered_map<std::string, std::list<std::vector<void *>>::iterator> ptr_inputs_global_id;
     std::list<std::vector<void *>> ptr_inputs_global_storage;
@@ -79,7 +79,7 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std::
     uint32_t *ptr_active_indices = NULL;
     uint32_t num_active_indices = 0;
     uint32_t num_group_in = 0;
-    uint32_t num_bytes_weight;
+    uint32_t num_bytes_weight = 0;
     uint32_t num_bytes_per_output = 0;
 
     bool use_dynamic_quantization = false;
index 22cf3c0..ddb2244 100644 (file)
@@ -411,7 +411,7 @@ void GNAPlugin::insertCopyLayer(std::vector<InferenceEngine::CNNLayerPtr> & laye
         if ((LayerInfo(l).isMemory() && LayerInfo(prevLayer).isConcat()) ||
             (LayerInfo(l).isConcat() && LayerInfo(prevLayer).isCrop())) {
             if (LayerInfo(prevLayer).isCrop()) {
-                auto cropLayer = dynamic_cast<InferenceEngine::CropLayer *> (prevLayer.get());
+                auto cropLayer =  LayerInfo(prevLayer).as<CropLayer*>();
                 size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size();
                 if (ALIGN(cropOffset, 8) != cropOffset) {
                     // The crop will be replaced by affine.
index e1c0f7e..62eb6d1 100644 (file)
@@ -1,8 +1,6 @@
 // Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-// lstm.cpp : GNA LSTM macro layer definition
-//
 
 #include "lstm.hpp"
 
index 2d150df..b80ae5f 100644 (file)
@@ -1,8 +1,6 @@
 // Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-//  pwl_design.cpp : simple activation function designer
-//
 
 #include "pwl.h"
 #include "gna_plugin_log.hpp"
index 442be42..3f3fcf3 100644 (file)
@@ -301,6 +301,9 @@ inline void quantizeWeightsBiasesConv(const QuantDesc & quantDesc,
     auto inputData = conv->insData[0].lock();
 
     uint32_t num_rows = getBiasSizeForLayer(conv);
+    if (num_rows == 0) {
+        THROW_GNA_EXCEPTION << "Invalid num rows";
+    }
     uint32_t num_columns = conv->_weights->size() / num_rows;
 
     uint32_t num_rows_padded = num_rows;
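
The guard added here protects the integer division on the next line: with `num_rows == 0`, `conv->_weights->size() / num_rows` is undefined behavior rather than a catchable error. The pattern in isolation:

    #include <cstdint>
    #include <stdexcept>

    uint32_t columnsPerRow(uint32_t totalWeights, uint32_t num_rows) {
        if (num_rows == 0) {
            // Fail loudly; unsigned integer division by zero is UB.
            throw std::invalid_argument("Invalid num rows");
        }
        return totalWeights / num_rows;
    }

    int main() { return columnsPerRow(12, 4) == 3 ? 0 : 1; }
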
index c0f1852..d02803f 100644 (file)
@@ -34,7 +34,9 @@ class ModelQuantizer {
         // one possible solution is to create new copyNet overloads that accept 2 functors, one for layer copy
         // and another one for net copy
         auto rawNet = dynamic_cast<InferenceEngine::details::CNNNetworkImpl *>(copiedNet.get());
-        rawNet->setPrecision(T::mandatory().getNetPrecision());
+        if (rawNet != nullptr) {
+            rawNet->setPrecision(T::mandatory().getNetPrecision());
+        }
 
         // allow client code to access copied topology, to avoid copies if user would like to chain quantisation with
         // another preprocessing
index 1609d5d..fa7e263 100644 (file)
@@ -4,6 +4,7 @@
 
 #include <cstring>
 #include <iostream>
+#include <details/ie_exception.hpp>
 #include "quantization.h"
 
 void QuantizeAffine16(float *ptr_float_weights,
@@ -496,6 +497,9 @@ void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases,
                      float input_scale_factor, float *ptr_weight_scale_factor,
                      float *ptr_output_scale_factor, uint32_t num_rows, uint32_t num_columns,
                      uint32_t num_rows_padded, uint32_t num_columns_padded) {
+    if (ptr_int_biases == nullptr) {
+        THROW_IE_EXCEPTION << "Int biases are empty";
+    }
     uint32_t num_saturate = 0;
 
     if (*ptr_weight_scale_factor == 1.0) {
@@ -547,11 +551,11 @@ void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases,
         value = scaled_row_max / static_cast<float>(MAX_VAL_1B_WEIGHT);
         ptr_int_biases[row].multiplier = (uint8_t) (value + 0.5);
         for (uint32_t col = 0; col < num_columns; col++) {
-            int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
+            int8_t *ptr_weight_8 = ptr_int_weights + (row * num_columns_padded + col);
             rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
 
 
-            value = ptr_float_weights[row*num_columns + col] * (*ptr_weight_scale_factor / ptr_int_biases[row].multiplier) + rounding_value;
+            value = ptr_float_weights[row * num_columns + col] * (*ptr_weight_scale_factor / ptr_int_biases[row].multiplier) + rounding_value;
             if (value > 127.0) {
                 *ptr_weight_8 = 127;
                 num_saturate++;
@@ -559,11 +563,11 @@ void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases,
                 *ptr_weight_8 = -128;
                 num_saturate++;
             } else {
-                *ptr_weight_8 = (int8_t)value;
+                *ptr_weight_8 = (int8_t) value;
             }
         }
         for (uint32_t col = num_columns; col < num_columns_padded; col++) {
-            int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
+            int8_t *ptr_weight_8 = ptr_int_weights + (row * num_columns_padded + col);
             *ptr_weight_8 = 0;
         }
     }
index 1585463..23f63af 100644 (file)
@@ -191,7 +191,7 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
                                 continue;
                             } else if (info.has16BOutput() && info.isActivation()) {
                                 auto newOutputScale = quantParams->_dst_quant.scale / maxValue;
-                                if (newOutputScale > std::numeric_limits<int16_t>::max() / 2) {
+                                if (newOutputScale > static_cast<float>(std::numeric_limits<int16_t>::max()) / 2) {
                                     break;
                                 }
                                 auto quantDataForActivation = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
@@ -413,7 +413,9 @@ class ScaleFactorCalculator {
             }
             return ptr == cnnLayer.get();
         });
-        idx++;
+        if (idx != net.end()) {
+            idx++;
+        }
         needRestart = true;
         return true;
     }
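
The new `idx != net.end()` check matters because `std::find_if` returns `end()` on a miss, and incrementing the end iterator is undefined behavior. Reduced to a sketch:

    #include <algorithm>
    #include <vector>

    int main() {
        std::vector<int> net = {1, 2, 3};
        auto idx = std::find_if(net.begin(), net.end(),
                                [](int v) { return v == 42; });  // miss -> end()
        if (idx != net.end()) {
            ++idx;  // only safe when the search actually hit something
        }
        return 0;
    }
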
index e6f5776..1ff5a82 100644 (file)
@@ -1,8 +1,6 @@
 // Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-// util.cpp : various utility functions for debugging, file i/o, etc.
-//
 
 #include <cinttypes>
 #ifndef _WIN32
index 9288db7..d6e53a0 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright (C) 2018-2019 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include "fallback_policy.h"
@@ -66,7 +54,7 @@ void FallbackPolicy::init(const std::string &config, const std::map<std::string,
         if (_deviceLoaders.find(d) == _deviceLoaders.end()) {
             IHeteroDeviceLoader::Ptr loader;
             loader = std::make_shared<HeteroDeviceLoader>(d);
-            HeteroDeviceLoader *pdl = dynamic_cast<HeteroDeviceLoader *>(loader.get());
+            HeteroDeviceLoader *pdl = static_cast<HeteroDeviceLoader *>(loader.get());
             pdl->initConfigs(allConfigs, extensions);
             _deviceLoaders[d] = loader;
         }
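
Here the cast is downgraded from `dynamic_cast` to `static_cast`: the pointer was produced two lines earlier by `std::make_shared<HeteroDeviceLoader>`, so its dynamic type is known and the RTTI check is pure overhead. The same substitution appears again in `HeteroExecutableNetwork::load` below. In miniature:

    #include <memory>

    struct IDeviceLoader { virtual ~IDeviceLoader() = default; };
    struct HeteroDeviceLoader : IDeviceLoader { void initConfigs() {} };

    int main() {
        std::shared_ptr<IDeviceLoader> loader = std::make_shared<HeteroDeviceLoader>();
        // The concrete type is known statically here; no runtime check needed.
        auto pdl = static_cast<HeteroDeviceLoader*>(loader.get());
        pdl->initConfigs();
    }
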
index 5547ee8..abd245a 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright (C) 2018-2019 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #pragma once
index 5aa360b..9e58cda 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright 2017-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include "hetero_async_infer_request.h"
index d09ada9..004cc5d 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright 2017-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 /**
index 589388e..b1d68a2 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright (C) 2018-2019 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include "hetero_device_loader.h"
index f9b9e4c..3cef157 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright (C) 2018-2019 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #pragma once
index b6f4286..8d2de53 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright (C) 2018-2019 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include "hetero_executable_network.h"
@@ -212,7 +200,7 @@ void HeteroExecutableNetwork::load(InferenceEngine::ICNNNetwork &network_,
             // TODO: here is a duplication of the code with FallbackPolicy::init
             IHeteroDeviceLoader::Ptr loader;
             loader = std::make_shared<HeteroDeviceLoader>(affinity);
-            HeteroDeviceLoader *pdl = dynamic_cast<HeteroDeviceLoader *>(loader.get());
+            HeteroDeviceLoader *pdl = static_cast<HeteroDeviceLoader *>(loader.get());
             pdl->initConfigs(config, extensions);
             _deviceLoaders[affinity] = loader;
         }
index 08e4bd7..075da90 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright (C) 2018-2019 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 /**
index 81349f9..9c02ae3 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright 2017-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include "hetero_infer_request.h"
index 7633022..8e171af 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright 2017-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 /**
index 987e703..523aedf 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright (C) 2018-2019 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include "hetero_plugin.h"
index 671463d..564c731 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright (C) 2018-2019 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #pragma once
index e2e166b..09ab54f 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright (C) 2018-2019 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 /**
index 42e1a14..af7d283 100644 (file)
@@ -143,13 +143,13 @@ REG_VALIDATOR_FOR(DetectionOutput, [](const InferenceEngine::Builder::Layer::CPt
         THROW_IE_EXCEPTION << "BackgroundLabelId parameter is wrong in layer " << layer.getName() <<
                            ". It should be >= 0 if this one is an Id of existing label else it should be equal to -1";
     }
-    if (layer.getNMSThreshold() <= 0) {
+    if (layer.getNMSThreshold() < 0) {
         THROW_IE_EXCEPTION << "NMSThreshold parameter is wrong in layer " << layer.getName() <<
-                           ". It should be > 0.";
+                           ". It should be >= 0.";
     }
-    if (layer.getConfidenceThreshold() <= 0) {
+    if (layer.getConfidenceThreshold() < 0) {
         THROW_IE_EXCEPTION << "ConfidenceThreshold parameter is wrong in layer " << layer.getName() <<
-                           ". It should be > 0.";
+                           ". It should be >= 0.";
     }
 });
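
Note: the DetectionOutput validator above is relaxed from strictly positive to non-negative, so a zero NMS or confidence threshold (keep every box) no longer throws. A hedged stand-in for the same guard, written as a free function rather than the REG_VALIDATOR_FOR macro:

#include <stdexcept>
#include <string>

// Illustrative stand-in for the validator body above: zero now passes.
void checkThreshold(float value, const std::string &name) {
    if (value < 0) {
        throw std::runtime_error(name + " parameter is wrong. It should be >= 0.");
    }
}
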
 
index df51f5e..516e645 100644 (file)
@@ -130,9 +130,9 @@ Builder::EltwiseLayer& Builder::EltwiseLayer::setEltwiseType(Builder::EltwiseLay
 REG_VALIDATOR_FOR(Eltwise, [](const InferenceEngine::Builder::Layer::CPtr& input_layer, bool partial) {
     Builder::EltwiseLayer layer(input_layer);
 
-    if (layer.getInputPorts().size() != 2) {
+    if (layer.getInputPorts().size() < 2) {
         THROW_IE_EXCEPTION << "Input ports are incorrect in the layer " << layer.getName()
-                           << ". Number of input ports should be equal to 2.";
+                           << ". Number of input ports should be >= 2.";
     }
     if (partial && (layer.getInputPorts()[0].shape().empty() || layer.getInputPorts()[1].shape().empty() ||
             layer.getOutputPort().shape().empty()))
@@ -153,5 +153,3 @@ REG_CONVERTER_FOR(Eltwise, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer
     layer.getParameters()["scales"] = cnnLayer->GetParamAsFloats("scales", {});
     layer.getParameters()["operation"] = cnnLayer->GetParamAsString("operation");
 });
-
-
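
Note: Eltwise previously required exactly two inputs; the relaxed validator accepts any arity of at least two, which matches n-ary sum/mul/max semantics. A simplified guard under that assumption:

#include <cstddef>
#include <stdexcept>

// Hypothetical guard mirroring the relaxed arity check above.
void validateEltwiseArity(std::size_t numInputPorts) {
    if (numInputPorts < 2) {
        throw std::runtime_error("Number of input ports should be >= 2.");
    }
}
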
index 8bd20a7..f42f08d 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 2899cfd..5e155d0 100644 (file)
@@ -713,15 +713,30 @@ Builder::Layer::Ptr Builder::Network::getLayer(idx_t layerId) {
 }
 
 const std::string& Builder::Network::getName() const noexcept {
-    return parameters.at("name");
+    static std::string errName;
+    try {
+        return parameters.at("name");
+    } catch (...) {
+        return errName;
+    }
 }
 
 const Context& Builder::Network::getContext() const noexcept {
-    return parameters.at("context");
+    static Context errCtx;
+    try {
+        return parameters.at("context");
+    } catch (...) {
+        return errCtx;
+    }
 }
 
 Context& Builder::Network::getContext() noexcept {
-    return parameters.at("context");
+    static Context errCtx;
+    try {
+        return parameters.at("context");
+    } catch (...) {
+        return errCtx;
+    }
 }
 
 Builder::Network::const_iterator Builder::Network::begin() const noexcept {
@@ -751,18 +766,20 @@ Builder::Network::iterator Builder::Network::end() {
 
 const std::vector<ILayer::CPtr> Builder::Network::getInputs() const noexcept {
     std::vector<ILayer::CPtr> inputs;
-    for (const auto& layer : parameters.at("layers").as<std::vector<Layer::Ptr>>()) {
-        bool isInputLayer = true;
-        for (const auto& connection : getLayerConnections(layer->getId())) {
-            if (connection.to().layerId() == layer->getId()) {
-                isInputLayer = false;
-                break;
+    try {
+        for (const auto& layer : parameters.at("layers").as<std::vector<Layer::Ptr>>()) {
+            bool isInputLayer = true;
+            for (const auto& connection : getLayerConnections(layer->getId())) {
+                if (connection.to().layerId() == layer->getId()) {
+                    isInputLayer = false;
+                    break;
+                }
+            }
+            if (isInputLayer) {
+                inputs.push_back(layer->build());
             }
         }
-        if (isInputLayer) {
-            inputs.push_back(layer->build());
-        }
-    }
+    } catch (...) {}
     return inputs;
 }
 
@@ -785,18 +802,20 @@ std::vector<Builder::Layer::Ptr> Builder::Network::getInputs() {
 
 const std::vector<ILayer::CPtr> Builder::Network::getOutputs() const noexcept {
     std::vector<ILayer::CPtr> outputs;
-    for (const auto& layer : parameters.at("layers").as<std::vector<Layer::Ptr>>()) {
-        bool isOutputLayer = true;
-        for (const auto& connection : getLayerConnections(layer->getId())) {
-            if (connection.from().layerId() == layer->getId()) {
-                isOutputLayer = false;
-                break;
+    try {
+        for (const auto& layer : parameters.at("layers").as<std::vector<Layer::Ptr>>()) {
+            bool isOutputLayer = true;
+            for (const auto& connection : getLayerConnections(layer->getId())) {
+                if (connection.from().layerId() == layer->getId()) {
+                    isOutputLayer = false;
+                    break;
+                }
+            }
+            if (isOutputLayer) {
+                outputs.push_back(layer->build());
             }
         }
-        if (isOutputLayer) {
-            outputs.push_back(layer->build());
-        }
-    }
+    } catch (...) {}
     return outputs;
 }
 
@@ -823,9 +842,11 @@ const std::vector<Connection>& Builder::Network::getConnections() const {
 
 const std::vector<Connection> Builder::Network::getLayerConnections(idx_t layerId) const noexcept {
     std::vector<Connection> layerConnections;
-    for (const auto connection : parameters.at("connections").as<std::vector<Connection>>()) {
-        if (connection.from().layerId() == layerId || connection.to().layerId() == layerId)
-            layerConnections.push_back(connection);
-    }
+    try {
+        for (const auto connection : parameters.at("connections").as<std::vector<Connection>>()) {
+            if (connection.from().layerId() == layerId || connection.to().layerId() == layerId)
+                layerConnections.push_back(connection);
+        }
+    } catch (...) {}
     return layerConnections;
 }
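
Note: every Builder::Network hunk above applies one pattern: the method is noexcept, but parameters.at(...) and the Parameter conversions can throw, so the body catches everything and falls back to a function-local static or an empty result. A condensed sketch, using std::string where the real code stores a Parameter:

#include <map>
#include <string>

class NetworkSketch {
public:
    // noexcept: an exception from at() must not escape, so on failure
    // return a function-local static that outlives every caller.
    const std::string &getName() const noexcept {
        static std::string errName;
        try {
            return parameters.at("name");
        } catch (...) {
            return errName;
        }
    }

private:
    std::map<std::string, std::string> parameters;
};
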
index 2918da1..5019fa3 100644 (file)
@@ -239,38 +239,54 @@ StatusCode CNNNetworkImpl::serialize(const std::string &xmlPath, const std::stri
 }
 
 StatusCode CNNNetworkImpl::setBatchSize(size_t size, ResponseDesc* responseDesc) noexcept {
-    auto originalBatchSize = getBatchSize();
-    if (originalBatchSize == size)
-        return OK;
-    SizeVector dims = _inputData.cbegin()->second->getDims();
-
-    // 3D input layout doesn't have batch notation
-    if (dims.size() == 3 || dims.size() == 1) {
-        return DescriptionBuffer(PARAMETER_MISMATCH, responseDesc) << "Cannot set batch for 1D/3D input";
-    }
+    try {
+        auto originalBatchSize = getBatchSize();
+        if (originalBatchSize == size)
+            return OK;
+        SizeVector inputDims = _inputData.cbegin()->second->getDims();
+
+        // 3D input layout doesn't have batch notation
+        if (inputDims.size() == 3 || inputDims.size() == 1) {
+            return DescriptionBuffer(PARAMETER_MISMATCH, responseDesc) << "Cannot set batch for 1D/3D input";
+        }
 
-    for (auto layer : _data) {
-        SizeVector dims = layer.second->getDims();
-        // Calculates original size for batch = 1
-        size_t diff = dims.at(0) / originalBatchSize;
-        dims.at(0) = size * diff;
-        layer.second->setDims(dims);
+        for (const auto &layer : _data) {
+            SizeVector dims = layer.second->getDims();
+            // Calculates original size for batch = 1
+            size_t diff = dims.at(0) / originalBatchSize;
+            dims.at(0) = size * diff;
+            layer.second->setDims(dims);
+        }
+        return OK;
+    } catch (const InferenceEngineException& e) {
+        return DescriptionBuffer(GENERAL_ERROR, responseDesc) << e.what();
+    } catch (const std::exception& e) {
+        return DescriptionBuffer(UNEXPECTED, responseDesc) << e.what();
+    } catch (...) {
+        return DescriptionBuffer(UNEXPECTED, responseDesc);
     }
-    return OK;
 }
 
 StatusCode CNNNetworkImpl::setBatchSizeReshape(size_t size, ResponseDesc* responseDesc) noexcept {
     InputShapes inputShapes;
-    for (const auto& pair : _inputData) {
-        auto info = pair.second;
-        if (info) {
-            auto data = info->getInputData();
-            if (data) {
-                auto dims = data->getTensorDesc().getDims();
-                dims[0] = size;
-                inputShapes[data->name] = dims;
+    try {
+        for (const auto& pair : _inputData) {
+            auto info = pair.second;
+            if (info) {
+                auto data = info->getInputData();
+                if (data) {
+                    auto dims = data->getTensorDesc().getDims();
+                    dims[0] = size;
+                    inputShapes[data->name] = dims;
+                }
             }
         }
+        return reshape(inputShapes, responseDesc);
+    } catch (const InferenceEngineException& e) {
+        return DescriptionBuffer(GENERAL_ERROR, responseDesc) << e.what();
+    } catch (const std::exception& e) {
+        return DescriptionBuffer(UNEXPECTED, responseDesc) << e.what();
+    } catch (...) {
+        return DescriptionBuffer(UNEXPECTED, responseDesc);
     }
-    return reshape(inputShapes, responseDesc);
 }
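
Note: setBatchSize and setBatchSizeReshape are noexcept entry points of a C-style plugin ABI, so the patch converts exceptions into StatusCode values plus a ResponseDesc message instead of letting them cross the boundary, distinguishing engine errors (GENERAL_ERROR) from everything else (UNEXPECTED). A simplified model of that boundary, with stand-in types:

#include <cstdio>
#include <exception>
#include <stdexcept>

enum StatusCode { OK = 0, GENERAL_ERROR = -1, UNEXPECTED = -2 };
struct ResponseDesc { char msg[256]; };

void doWork() { throw std::runtime_error("batch mismatch"); }  // may throw

StatusCode doWorkNoexcept(ResponseDesc *resp) noexcept {
    try {
        doWork();
        return OK;
    } catch (const std::exception &e) {
        // report through the description buffer instead of unwinding
        if (resp) std::snprintf(resp->msg, sizeof(resp->msg), "%s", e.what());
        return GENERAL_ERROR;
    } catch (...) {
        return UNEXPECTED;
    }
}
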
index 435c24d..b34906f 100644 (file)
@@ -203,7 +203,7 @@ CNNLayer::Ptr CNNStatisticHelper::getLatestInFuse(CNNLayer::Ptr layer) const {
 void CNNStatisticHelper::NormalizeStatistic() {
     StatsMap newMap;
 
-    float dummy;
+    float dummy = 0.0f;
 
     std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network_);
     for (auto l : sortedLayers) {
@@ -319,6 +319,9 @@ void CNNStatisticHelper::NormalizeStatistic() {
                 } else if (CaselessEq<std::string>()(tl->type, "convolution")) {
                     // verify number of groups
                     ConvolutionLayer *pConv = dynamic_cast<ConvolutionLayer *>(tl.get());
+                    if (pConv == nullptr) {
+                        THROW_IE_EXCEPTION << "Layer " << tl->name << " is not instance of ConvolutionLayer class";
+                    }
                     if (pConv->_group != pConv->_out_depth) {
                         perChannelScale = false;
                     }
@@ -539,6 +542,9 @@ void CNNNetworkInt8Normalizer::AddScaleShiftBetween(CNNNetwork& net, const CNNLa
 
         {
             ScaleShiftLayer* scshLayer = dynamic_cast<ScaleShiftLayer*>(ssCnnLayer.get());
+            if (scshLayer == nullptr) {
+                THROW_IE_EXCEPTION << "Layer " << ssCnnLayer->name << " is not instance of ScaleShiftLayer class";
+            }
             fillInScaleShift(scshLayer, c, oScaleBuffer, iScaleBuffer);
         }
 
@@ -673,7 +679,10 @@ CNNLayer::Ptr CNNNetworkInt8Normalizer::createDWConvolutionForScale(const std::s
     params.type = "Convolution";
 
     CNNLayerPtr lptr = std::make_shared<ConvolutionLayer>(params);
-    ConvolutionLayer *pConv = dynamic_cast<ConvolutionLayer *>(lptr.get());
+    auto *pConv = dynamic_cast<ConvolutionLayer *>(lptr.get());
+    if (pConv == nullptr) {
+        THROW_IE_EXCEPTION << "Layer " << lptr->name << " is not instance of ConvolutionLayer class";
+    }
 
     pConv->_kernel.insert(X_AXIS, 1);
     pConv->_kernel.insert(Y_AXIS, 1);
@@ -969,6 +978,10 @@ bool CNNNetworkInt8Normalizer::isNextFusionAllowed(const CNNLayer::Ptr& layer) {
         std::string aType = layer->outData[0]->inputTo.begin()->second->type;
         if (CaselessEq<std::string>()(aType, "relu")) {
             ReLULayer *rL = dynamic_cast<ReLULayer *>(layer->outData[0]->inputTo.begin()->second.get());
+            if (rL == nullptr) {
+                THROW_IE_EXCEPTION << "Layer " << layer->outData[0]->inputTo.begin()->second->name
+                                   << " is not instance of ReLULayer class";
+            }
             if (rL->negative_slope != 0.f) {
                 return false;
             }
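
Note: the int8 normalizer edits are one recurring hardening: initialize dummy, and null-check every dynamic_cast result before dereferencing it. The pattern could be factored into a helper; castOrThrow below is my name for such a sketch, not something the patch adds:

#include <stdexcept>
#include <string>

struct CNNLayerLike { virtual ~CNNLayerLike() = default; std::string name; };

// Hypothetical helper: a dynamic_cast that throws instead of handing back
// a null pointer to be dereferenced a line later.
template <typename T>
T *castOrThrow(CNNLayerLike *layer, const char *expected) {
    auto *typed = dynamic_cast<T *>(layer);
    if (typed == nullptr) {
        throw std::runtime_error("Layer " + layer->name +
                                 " is not instance of " + expected);
    }
    return typed;
}
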
index 0df1242..65b9b59 100644 (file)
@@ -24,7 +24,7 @@ Task::Task() : _status(TS_INITIAL) {
     };
 }
 
-Task::Task(std::function<void()> function) : _status(TS_INITIAL), _function(function) {
+Task::Task(const std::function<void()> &function) : _status(TS_INITIAL), _function(function) {
     if (!function) THROW_IE_EXCEPTION << "Failed to create Task object with null function";
 }
 
index c299be4..aba3c13 100644 (file)
@@ -41,7 +41,7 @@ public:
 
     Task();
 
-    Task(std::function<void()> function);
+    explicit Task(const std::function<void()> &function);
 
     /**
      * @brief Executes the task with catching all exceptions. It doesn't check that task is running
index 1e12aca..a2b44fa 100644 (file)
@@ -12,9 +12,9 @@
 
 namespace InferenceEngine {
 
-StagedTask::StagedTask() : Task(), _stages(0) {}
+StagedTask::StagedTask() : Task(), _stages(0), _stage(0) {}
 
-StagedTask::StagedTask(std::function<void()> function, size_t stages) : Task(function), _stages(stages), _stage(0) {
+StagedTask::StagedTask(const std::function<void()> &function, size_t stages) : Task(function), _stages(stages), _stage(0) {
     if (!function) THROW_IE_EXCEPTION << "Failed to create StagedTask object with null function";
     resetStages();
 }
index fff5e51..8c067b0 100644 (file)
@@ -28,7 +28,7 @@ class INFERENCE_ENGINE_API_CLASS(StagedTask) : public Task {
 public:
     typedef std::shared_ptr<StagedTask> Ptr;
 
-    StagedTask(std::function<void()> function, size_t stages);
+    StagedTask(const std::function<void()> &function, size_t stages);
 
     StagedTask();
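
Note: Task and StagedTask now take the callable by const reference, saving a std::function copy at the call boundary, and the single-argument Task constructor becomes explicit so an arbitrary lambda no longer converts to a Task implicitly. A minimal sketch (TaskSketch is an illustrative name):

#include <functional>
#include <stdexcept>

class TaskSketch {
public:
    TaskSketch() = default;
    // explicit: a stray callable can no longer convert into a task silently;
    // const&: the std::function is copied once, into the member, not twice.
    explicit TaskSketch(const std::function<void()> &function) : _function(function) {
        if (!function) throw std::invalid_argument("null function");
    }

private:
    std::function<void()> _function;
};
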
 
index 2a8ffe0..7badc89 100644 (file)
@@ -20,8 +20,8 @@ class AsyncInferRequestInternal : public IAsyncInferRequestInternal, public Infe
 public:
     typedef std::shared_ptr<AsyncInferRequestInternal> Ptr;
 
-    explicit AsyncInferRequestInternal(InputsDataMap networkInputs, OutputsDataMap networkOutputs)
-            : InferRequestInternal(networkInputs, networkOutputs), _callback(nullptr) {}
+    explicit AsyncInferRequestInternal(const InputsDataMap &networkInputs, const OutputsDataMap &networkOutputs)
+            : InferRequestInternal(networkInputs, networkOutputs), _callback(nullptr), _userData(nullptr) {}
 
     void SetCompletionCallback(InferenceEngine::IInferRequest::CompletionCallback callback) override {
         _callback = callback;
index 3384164..77da6a1 100644 (file)
@@ -91,6 +91,7 @@ public:
             : _syncRequest(request),
               _requestExecutor(taskExecutor),
               _requestSynchronizer(taskSynchronizer),
+              _userData(nullptr),
               _callbackManager(callbackExecutor) {
         _syncTask = std::make_shared<Task>([this]() { _syncRequest->Infer(); });
         _currentTask = _syncTask;
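
Note: the async-request constructors gain _userData(nullptr), and StagedTask() gains _stage(0), so every member is set in the initializer list rather than left indeterminate until a setter runs. The shape of the fix, reduced to a struct of my own naming:

struct RequestSketch {
    // every member gets a value in the initializer list; otherwise the raw
    // pointers stay indeterminate until a setter happens to run first
    RequestSketch() : _callback(nullptr), _userData(nullptr) {}

    void (*_callback)(void *);
    void *_userData;
};
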
index c04a5d9..0fbe077 100644 (file)
@@ -30,7 +30,7 @@ class InferRequestInternal : virtual public IInferRequestInternal {
 public:
     typedef std::shared_ptr<InferRequestInternal> Ptr;
 
-    InferRequestInternal(InputsDataMap networkInputs, OutputsDataMap networkOutputs)
+    InferRequestInternal(const InputsDataMap &networkInputs, const OutputsDataMap &networkOutputs)
             : m_curBatch(-1) {
         // We should copy maps in order to avoid modifications in the future.
         for (const auto &it : networkInputs) {
index fa0eefd..4b924cb 100644 (file)
@@ -1,7 +1,7 @@
 // Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-// dllmain.cpp : Defines the entry point for the DLL application.
+
 #pragma once
 
 #ifdef _WIN32
index 633d27f..f064119 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 12349aa..8e6d3a9 100644 (file)
@@ -82,18 +82,16 @@ StatusCode CNNNetReaderImpl::ReadWeights(const char* filepath, ResponseDesc* res
         return DescriptionBuffer(resp) << "network is empty";
     }
 
-    size_t ulFileSize = static_cast<size_t>(fileSize);
+    auto ulFileSize = static_cast<size_t>(fileSize);
 
-    TBlob<uint8_t>::Ptr weightsPtr(new TBlob<uint8_t>(Precision::U8, C, {ulFileSize}));
-    weightsPtr->allocate();
     try {
+        TBlob<uint8_t>::Ptr weightsPtr(new TBlob<uint8_t>(Precision::U8, C, {ulFileSize}));
+        weightsPtr->allocate();
         FileUtils::readAllFile(filepath, weightsPtr->buffer(), ulFileSize);
+        return SetWeights(weightsPtr, resp);
+    } catch (const InferenceEngineException& ex) {
+        return DescriptionBuffer(resp) << ex.what();
     }
-    catch (const InferenceEngineException& iee) {
-        return DescriptionBuffer(resp) << iee.what();
-    }
-
-    return SetWeights(weightsPtr, resp);
 }
 
 StatusCode CNNNetReaderImpl::ReadNetwork(const char* filepath, ResponseDesc* resp) noexcept {
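
Note: in ReadWeights the blob construction and allocate() call move inside the try block, since allocation itself can throw, and the success path now returns straight from the try. A reduced sketch of that control flow, with plain standard types standing in for TBlob:

#include <cstddef>
#include <cstdio>
#include <exception>
#include <memory>
#include <vector>

int readWeightsSketch(std::size_t fileSize, char *errBuf, std::size_t errLen) {
    try {
        // construction and allocation can throw, so they live inside the try
        auto weights = std::make_unique<std::vector<unsigned char>>(fileSize);
        // ... read the file into weights->data() ...
        return 0;   // OK: the success path returns straight from the try
    } catch (const std::exception &e) {
        std::snprintf(errBuf, errLen, "%s", e.what());
        return -1;  // failure is reported through the description buffer
    }
}
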
index 58d727d..6eafc5f 100644 (file)
@@ -13,7 +13,9 @@ using namespace InferenceEngine;
 
 Context::Context() {
     auto builtIn = std::make_shared<ShapeInfer::BuiltInShapeInferHolder>();
-    addExtension(builtIn);
+    try {
+        addExtension(builtIn);
+    } catch (...) {}
 }
 
 void Context::addExtension(const IShapeInferExtensionPtr &ext) {
index 2090e7f..6b835c7 100644 (file)
@@ -44,11 +44,6 @@ FindPluginResponse InferenceEngine::findPlugin(const FindPluginRequest& req) {
             pluginVec.push_back("myriadPlugin");
 #endif
             break;
-        case TargetDevice::eHDDL:
-#ifdef ENABLE_HDDL
-            pluginVec.push_back("HDDLPlugin");
-#endif
-            break;
         case TargetDevice::eGNA:
 #ifdef ENABLE_GNA
             pluginVec.push_back("GNAPlugin");
@@ -57,12 +52,6 @@ FindPluginResponse InferenceEngine::findPlugin(const FindPluginRequest& req) {
         case TargetDevice::eHETERO:
             pluginVec.push_back("HeteroPlugin");
             break;
-        case TargetDevice::eKMB:
-#ifdef ENABLE_KMB
-            pluginVec.push_back("kmbPlugin");
-#endif
-            break;
-
         default:
             THROW_IE_EXCEPTION << "Cannot find plugin for device: " << getDeviceName(req.device);
     }
index 86248f1..3b1ba3a 100644 (file)
@@ -39,7 +39,7 @@ void CNNLayer::validateLayer() {
         InOutDims shapes;
         getInOutShapes(this, shapes);
         validator->checkShapes(this, shapes.inDims);
-    } catch(InferenceEngineException ie_e) {
+    } catch(const InferenceEngineException &ie_e) {
         THROW_IE_EXCEPTION << "Error of validate layer: " << this->name
                            << " with type: " << this->type << ". "
                            << ie_e.what();
@@ -723,7 +723,7 @@ void ReshapeValidator::parseParams(CNNLayer *layer) {
     if (!casted->params.empty()) {
         if (casted->type == "Flatten") {
             casted->num_axes = casted->GetParamAsInt("end_axis", -1);
-            casted->axis = casted->axis = casted->GetParamAsInt("axis", 0);
+            casted->axis = casted->GetParamAsInt("axis", 0);
         } else {
             casted->shape = casted->GetParamAsInts("dim", {});
         }
@@ -853,9 +853,6 @@ void ReLUValidator::checkParams(const CNNLayer* layer) {
     }
     if (!casted->params.empty()) {
         float negative_slope = casted->GetParamAsFloat("negative_slope");
-        if (negative_slope < 0) {
-            THROW_IE_EXCEPTION << "The value of ReLU layer negative_slope parameter is invalid";
-        }
     }
 }
 
@@ -2420,7 +2417,7 @@ void BinaryConvolutionValidator::parseParams(CNNLayer* layer) {
         THROW_IE_EXCEPTION << "Layer is not instance of BinaryConvolutionLayer class";
     }
 
-    binConvLayer->_pad_value = binConvLayer->GetParamAsFloat("pad_value", -1.f);
+    binConvLayer->_pad_value = binConvLayer->GetParamAsFloat("pad_value", 0.f);
     binConvLayer->_in_depth = binConvLayer->GetParamAsUInt("input");
     binConvLayer->_mode = BinaryConvolutionLayer::xnor_popcount;
     std::string mode = binConvLayer->GetParamAsString("mode", "xnor-popcount");
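
Note: the validator file collects small correctness fixes: the exception is caught by const reference (no copy, no slicing of derived exception types), the duplicated casted->axis assignment is collapsed, a dead negative_slope check is dropped, and pad_value's default becomes 0. The catch-site idiom in isolation:

#include <cstdio>
#include <stdexcept>

void validate() { throw std::runtime_error("bad shape"); }

int main() {
    try {
        validate();
    } catch (const std::runtime_error &e) {  // const&: no copy, no slicing
        std::printf("Error of validate layer: %s\n", e.what());
    }
    return 0;
}
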
index 5b282d9..d9c6018 100644 (file)
@@ -989,8 +989,10 @@ static void calcAreaRow(const cv::gapi::fluid::View& in, cv::gapi::fluid::Buffer
     for (int l = 0; l < lpi; l++) {
         Unit ymap = ymapper.map(y + l);
 
-        const T *src[32];
         GAPI_Assert(ymap.index1 - ymap.index0 <= 32);
+        GAPI_Assert(ymap.index1 - ymap.index0 > 0);
+        const T *src[32] = {};
+
         for (int yin = ymap.index0; yin < ymap.index1; yin++) {
             src[yin - ymap.index0] = in.InLine<const T>(yin - iny);
         }
@@ -1323,7 +1325,7 @@ static void calcAreaRow_CVKL_U8(const cv::gapi::fluid::View   & in,
         int yin1 = yin0 + y_max_count;
 
         GAPI_Assert(yin1 - yin0 <= 32);
-        const uint8_t *src[32];
+        const uint8_t *src[32] = {};
 
         for (int yin = yin0; yin < yin1 && yin < inSz.height; yin++) {
             if (yalpha[(y+l)*y_max_count + yin - yin0] == 0) {
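
Note: both resize kernels declared const T *src[32] without an initializer, so if the fill loop wrote fewer than 32 slots, any later read of an unwritten slot saw an indeterminate pointer. The = {} initializer value-initializes all 32 elements to nullptr, and the added assert documents the non-empty precondition. A stand-alone illustration:

#include <cassert>

void fillRow(const float *line, int count) {
    assert(count > 0 && count <= 32);
    const float *src[32] = {};  // every slot starts as nullptr
    for (int i = 0; i < count; ++i) {
        src[i] = line;          // only the first `count` slots become valid
    }
    // slots [count, 32) still read as nullptr rather than garbage
    (void)src;
}
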
index fd0f772..18de5c6 100644 (file)
@@ -457,6 +457,9 @@ struct NodePrinter {
 
         if (type == "Convolution") {
             auto* conv = dynamic_cast<ConvolutionLayer*>(layer.get());
+            if (conv == nullptr) {
+                THROW_IE_EXCEPTION << "Layer " << layer->name << " is not instance of ConvolutionLayer class";
+            }
 
             unsigned int
                 depth = conv->_out_depth,
@@ -471,6 +474,9 @@ struct NodePrinter {
             printed_properties.emplace_back("dilations", formatSize_({&(conv->_dilation[0]), &(conv->_dilation[conv->_dilation.size() - 1])}));
         } else if (type == "Pooling") {
             auto* pool = dynamic_cast<PoolingLayer*>(layer.get());
+            if (pool == nullptr) {
+                THROW_IE_EXCEPTION << "Layer " << layer->name << " is not instance of PoolingLayer class";
+            }
 
             printed_properties.emplace_back("window size", formatSize_({&(pool->_kernel[0]), &(pool->_kernel[pool->_kernel.size() - 1])}));
             printed_properties.emplace_back("padding begin", formatSize_({&(pool->_padding[0]), &(pool->_padding[pool->_padding.size() - 1])}));
@@ -478,6 +484,9 @@ struct NodePrinter {
             printed_properties.emplace_back("strides", formatSize_({&(pool->_stride[0]), &(pool->_stride[pool->_stride.size() - 1])}));
         } else if (type == "ReLU") {
             auto* relu = dynamic_cast<ReLULayer*>(layer.get());
+            if (relu == nullptr) {
+                THROW_IE_EXCEPTION << "Layer " << layer->name << " is not instance of ReLULayer class";
+            }
 
             float negative_slope = relu->negative_slope;
 
@@ -485,6 +494,9 @@ struct NodePrinter {
                 printed_properties.emplace_back("negative_slope", std::to_string(negative_slope));
         } else if (type == "Eltwise") {
             auto* eltwise = dynamic_cast<EltwiseLayer*>(layer.get());
+            if (eltwise == nullptr) {
+                THROW_IE_EXCEPTION << "Layer " << layer->name << " is not instance of EltwiseLayer class";
+            }
 
             std::string operation;
 
index fd81632..dd46d7e 100644 (file)
@@ -51,6 +51,9 @@ InferenceEngine::LayerComplexity getComplexity(const InferenceEngine::CNNLayerPt
                                  std::function<void(CNNLayer &)>> layerComplexityLookup = {
         {"Convolution", [&](CNNLayer &l) {
             auto* conv = dynamic_cast<ConvolutionLayer*>(&l);
+            if (conv == nullptr) {
+                THROW_IE_EXCEPTION << "Layer " << l.name << " is not instance of ConvolutionLayer class";
+            }
             unsigned long filter_m = conv->_kernel[X_AXIS] * conv->_kernel[Y_AXIS] * (inDims[1] / conv->_group);
             flops = 2 * out_size * filter_m;
             params = filter_m * conv->_out_depth + conv->_out_depth;
@@ -58,6 +61,9 @@ InferenceEngine::LayerComplexity getComplexity(const InferenceEngine::CNNLayerPt
 
         {"Deconvolution", [&](CNNLayer &l) {
             auto* deconv = dynamic_cast<DeconvolutionLayer*>(&l);
+            if (deconv == nullptr) {
+                THROW_IE_EXCEPTION << "Layer " << l.name << " is not instance of DeconvolutionLayer class";
+            }
             unsigned long filter_m = deconv->_kernel[X_AXIS] * deconv->_kernel[Y_AXIS] * (inDims[1] / deconv->_group);
             flops = 2 * out_size * filter_m;
             params = filter_m * deconv->_out_depth + deconv->_out_depth;
@@ -65,12 +71,18 @@ InferenceEngine::LayerComplexity getComplexity(const InferenceEngine::CNNLayerPt
 
         {"FullyConnected", [&](CNNLayer &l) {
             auto* fc = dynamic_cast<FullyConnectedLayer*>(&l);
+            if (fc == nullptr) {
+                THROW_IE_EXCEPTION << "Layer " << l.name << " is not instance of FullyConnectedLayer class";
+            }
             flops = 2 * in_size * fc->_out_num;
             params = (in_size + 1) * fc->_out_num;
         }},
 
         {"Norm", [&](CNNLayer &l) {
             auto* lrn = dynamic_cast<NormLayer*>(&l);
+            if (lrn == nullptr) {
+                THROW_IE_EXCEPTION << "Layer " << l.name << " is not instance of NormLayer class";
+            }
             int size = lrn->_size;
             int flopsPerElement = lrn->_isAcrossMaps ? 2 * size * size : 2 * size;
 
@@ -79,6 +91,9 @@ InferenceEngine::LayerComplexity getComplexity(const InferenceEngine::CNNLayerPt
 
         {"Pooling", [&](CNNLayer &l) {
             auto* pool = dynamic_cast<PoolingLayer*>(&l);
+            if (pool == nullptr) {
+                THROW_IE_EXCEPTION << "Layer " << l.name << " is not instance of PoolingLayer class";
+            }
             if (pool->_type == PoolingLayer::PoolType::ROI) {
                 // real kernel sizes are read from weights, so approximation is used.
                 unsigned long kernel_w = inDims[2] / outDims[2];
@@ -92,6 +107,9 @@ InferenceEngine::LayerComplexity getComplexity(const InferenceEngine::CNNLayerPt
 
         {"Eltwise", [&](CNNLayer &l) {
             auto* eltwise = dynamic_cast<EltwiseLayer*>(&l);
+            if (eltwise == nullptr) {
+                THROW_IE_EXCEPTION << "Layer " << l.name << " is not instance of EltwiseLayer class";
+            }
             flops = in_size * (2 * eltwise->insData.size() - 1);
         }},
 
index 4e7fad2..a91ebf9 100644 (file)
@@ -277,29 +277,20 @@ void CombineData(DataPtr &master, DataPtr &slave) {
 /****  Converter Passes  ************************************/
 /************************************************************/
 
-static RNNSequenceLayer::CellType cell_type_from_name(std::string &layer_type) {
-    RNNSequenceLayer::CellType res;
-    if (layer_type == "LSTMCell")
-        res = RNNSequenceLayer::LSTM;
-    else if (layer_type == "GRUCell")
-        res = RNNSequenceLayer::GRU;
-    else if (layer_type == "RNNCell")
-        res = RNNSequenceLayer::GRU;
-    else
-        THROW_IE_EXCEPTION << "Unknown Cell type (" << layer_type << "). Expected LSTMCell|GRUCell|RNNCell";
-    return res;
-}
-
-static std::string cell_name(RNNSequenceLayer::CellType type) {
+static std::string cell_type_name(RNNSequenceLayer::CellType type) {
     std::string res;
-    if (type == RNNSequenceLayer::LSTM)
-        res = "LSTM";
-    else if (type == RNNSequenceLayer::GRU)
-        res = "GRU";
-    else if (type == RNNSequenceLayer::GRU)
-        res = "GRU";
-    else
-        THROW_IE_EXCEPTION << "Unknown Cell type (enum index: " << type << "). Expected LSTM|GRU|RNN";
+    switch (type) {
+        case RNNSequenceLayer::LSTM:
+            res = "LSTM";
+            break;
+        case RNNSequenceLayer::GRU:
+        case RNNSequenceLayer::GRU_LBR:
+            res = "GRU";
+            break;
+        case RNNSequenceLayer::RNN:
+            res = "RNN";
+            break;
+    }
     return res;
 }
 
@@ -323,9 +314,11 @@ bool convertToRNNSeq(CNNLayerPtr cur, ICNNNetwork &net) {
     auto cell = std::dynamic_pointer_cast<RNNCellBase>(all_body_layers[1]);
     auto rsp2 = std::dynamic_pointer_cast<ReshapeLayer>(all_body_layers[2]);
 
-    auto cell_type = cell_type_from_name(all_body_layers[1]->type);
+    IE_ASSERT(rsp1);
+    IE_ASSERT(cell);
+    IE_ASSERT(rsp2);
 
-    int NS = cell_type == RNNSequenceLayer::LSTM ? 2 : 1;  // number of states
+    int NS = (cell->cellType == RNNSequenceLayer::LSTM) ? 2 : 1;  // number of states
 
     IE_ASSERT(cell->insData.size() == NS + 1);  // {data, state1, [state2]}
     IE_ASSERT(cell->outData.size() == NS);  // {state1, [state2]}
@@ -360,7 +353,7 @@ bool convertToRNNSeq(CNNLayerPtr cur, ICNNNetwork &net) {
 
     if (!one_of(i2map.size(), NS + 1, 1) ||
         !one_of(o2map.size(), NS + 1, 1) ||
-        !one_of(be2map.size(), 2))
+        !one_of(be2map.size(), NS))
         return false;
 
     auto in_iter_rule = i2map[in_dt_idx];
@@ -405,14 +398,16 @@ bool convertToRNNSeq(CNNLayerPtr cur, ICNNNetwork &net) {
 
     // need swap an i/o ports if it is not in natural order
     std::string name = cell->name + "_sequence";
-    auto rnn  = std::make_shared<RNNSequenceLayer>(LayerParams{ name, cell_name(cell_type) + "Sequence", cell->precision});
-    rnn->cellType = cell_type;
+    std::string type = cell_type_name(cell->cellType) + "Sequence";
+
+    auto rnn  = std::make_shared<RNNSequenceLayer>(LayerParams{ name, type, cell->precision});
     rnn->axis = in_iter_rule.axis;
     rnn->direction = in_iter_rule.stride == 1
             ? RNNSequenceLayer::FWD
             : RNNSequenceLayer::BWD;
 
     // copy base RNN cell fields
+    rnn->cellType = cell->cellType;
     rnn->_weights = cell->_weights;
     rnn->_biases = cell->_biases;
     rnn->blobs = cell->blobs;
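
Note: the two string/enum helpers are merged into a single cell_type_name switch; the removed if-chains compared against GRU twice, so RNNCell was silently mapped to GRU and the reverse mapping could never produce "RNN". With a switch over the enum, an unhandled enumerator is a compiler warning rather than a silent bug. Condensed form with an assumed enum definition:

#include <string>

enum class CellType { LSTM, GRU, GRU_LBR, RNN };  // assumed definition

std::string cellTypeName(CellType type) {
    switch (type) {
        case CellType::LSTM:    return "LSTM";
        case CellType::GRU:
        case CellType::GRU_LBR: return "GRU";  // both share one IR name
        case CellType::RNN:     return "RNN";
    }
    return {};  // unreachable while the switch covers the enum
}
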
index 4ccf4a5..800cbfe 100644 (file)
@@ -215,14 +215,18 @@ void NetworkSerializer::updateStdLayerParams(const CNNLayer::Ptr &layer) {
 
     if (CaselessEq<std::string>()(layer->type, "power")) {
         auto *lr = dynamic_cast<PowerLayer *>(layerPtr);
-
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of PowerLayer class";
+        }
         params["scale"] = std::to_string(lr->scale);
         params["shift"] = std::to_string(lr->offset);
         params["power"] = std::to_string(lr->power);
     } else if (CaselessEq<std::string>()(layer->type, "convolution") ||
                CaselessEq<std::string>()(layer->type, "deconvolution")) {
         auto *lr = dynamic_cast<ConvolutionLayer *>(layerPtr);
-
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of ConvolutionLayer class";
+        }
         params["kernel"] = arrayRevertToIRProperty(lr->_kernel);
         params["pads_begin"] = arrayRevertToIRProperty(lr->_padding);
         params["pads_end"] = arrayRevertToIRProperty(lr->_pads_end);
@@ -232,20 +236,27 @@ void NetworkSerializer::updateStdLayerParams(const CNNLayer::Ptr &layer) {
         params["group"] = std::to_string(lr->_group);
     } else if (CaselessEq<std::string>()(layer->type, "relu")) {
         auto *lr = dynamic_cast<ReLULayer *>(layerPtr);
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of ReLULayer class";
+        }
         if (lr->negative_slope != 0.0f) {
             params["negative_slope"] = std::to_string(lr->negative_slope);
         }
     } else if (CaselessEq<std::string>()(layer->type, "norm") ||
                CaselessEq<std::string>()(layer->type, "lrn")) {
         auto *lr = dynamic_cast<NormLayer *>(layerPtr);
-
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of NormLayer class";
+        }
         params["alpha"] = std::to_string(lr->_alpha);
         params["beta"] = std::to_string(lr->_beta);
         params["local-size"] = std::to_string(lr->_size);
         params["region"] = lr->_isAcrossMaps ? "across" : "same";
     } else if (CaselessEq<std::string>()(layer->type, "pooling")) {
         auto *lr = dynamic_cast<PoolingLayer *>(layerPtr);
-
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of PoolingLayer class";
+        }
         params["kernel"] = arrayRevertToIRProperty(lr->_kernel);
         params["pads_begin"] = arrayRevertToIRProperty(lr->_padding);
         params["pads_end"] = arrayRevertToIRProperty(lr->_pads_end);
@@ -264,23 +275,41 @@ void NetworkSerializer::updateStdLayerParams(const CNNLayer::Ptr &layer) {
         }
     } else if (CaselessEq<std::string>()(layer->type, "split")) {
         auto *lr = dynamic_cast<SplitLayer *>(layerPtr);
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of SplitLayer class";
+        }
         params["axis"] = std::to_string(lr->_axis);
     } else if (CaselessEq<std::string>()(layer->type, "concat")) {
         auto *lr = dynamic_cast<ConcatLayer *>(layerPtr);
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of ConcatLayer class";
+        }
         params["axis"] = std::to_string(lr->_axis);
     } else if (CaselessEq<std::string>()(layer->type, "FullyConnected") ||
                CaselessEq<std::string>()(layer->type, "InnerProduct")) {
         auto *lr = dynamic_cast<FullyConnectedLayer *>(layerPtr);
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of FullyConnectedLayer class";
+        }
         params["out-size"] = std::to_string(lr->_out_num);
     } else if (CaselessEq<std::string>()(layer->type, "softmax")) {
         auto *lr = dynamic_cast<SoftMaxLayer *>(layerPtr);
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of SoftMaxLayer class";
+        }
         params["axis"] = std::to_string(lr->axis);
     } else if (CaselessEq<std::string>()(layer->type, "reshape")) {
         // need to add here support of flatten layer if it is created from API
         auto *lr = dynamic_cast<ReshapeLayer *>(layerPtr);
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of ReshapeLayer class";
+        }
         params["dim"] = arrayToIRProperty(lr->shape);
     } else if (CaselessEq<std::string>()(layer->type, "Eltwise")) {
         auto *lr = dynamic_cast<EltwiseLayer *>(layerPtr);
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of EltwiseLayer class";
+        }
 
         std::string op;
 
@@ -301,31 +330,55 @@ void NetworkSerializer::updateStdLayerParams(const CNNLayer::Ptr &layer) {
         params["operation"] = op;
     } else if (CaselessEq<std::string>()(layer->type, "scaleshift")) {
         auto *lr = dynamic_cast<ScaleShiftLayer *>(layerPtr);
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of ScaleShiftLayer class";
+        }
         params["broadcast"] = std::to_string(lr->_broadcast);
     } else if (CaselessEq<std::string>()(layer->type, "crop")) {
         auto *lr = dynamic_cast<CropLayer *>(layerPtr);
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of CropLayer class";
+        }
         params["axis"] = arrayToIRProperty(lr->axis);
         params["offset"] = arrayToIRProperty(lr->offset);
         params["dim"] = arrayToIRProperty(lr->dim);
     } else if (CaselessEq<std::string>()(layer->type, "tile")) {
         auto *lr = dynamic_cast<TileLayer *>(layerPtr);
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of TileLayer class";
+        }
         params["axis"] = std::to_string(lr->axis);
         params["tiles"] = std::to_string(lr->tiles);
     } else if (CaselessEq<std::string>()(layer->type, "prelu")) {
         auto *lr = dynamic_cast<PReLULayer *>(layerPtr);
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of PReLULayer class";
+        }
         params["channel_shared"] = std::to_string(lr->_channel_shared);
     } else if (CaselessEq<std::string>()(layer->type, "clamp")) {
         auto *lr = dynamic_cast<ClampLayer *>(layerPtr);
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of ClampLayer class";
+        }
         params["min"] = std::to_string(lr->min_value);
         params["max"] = std::to_string(lr->max_value);
     } else if (CaselessEq<std::string>()(layer->type, "BatchNormalization")) {
         auto *lr = dynamic_cast<BatchNormalizationLayer *>(layerPtr);
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of BatchNormalizationLayer class";
+        }
         params["epsilon"] = std::to_string(lr->epsilon);
     } else if (CaselessEq<std::string>()(layer->type, "grn")) {
         auto *lr = dynamic_cast<GRNLayer *>(layerPtr);
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of GRNLayer class";
+        }
         params["bias"] = std::to_string(lr->bias);
     } else if (CaselessEq<std::string>()(layer->type, "mvn")) {
         auto *lr = dynamic_cast<MVNLayer *>(layerPtr);
+        if (lr == nullptr) {
+            THROW_IE_EXCEPTION << "Layer " << layerPtr->name << " is not instance of MVNLayer class";
+        }
         params["across_channels"] = std::to_string(lr->across_channels);
         params["normalize_variance"] = std::to_string(lr->normalize);
     } else if (CaselessEq<std::string>()(layer->type, "rnn") ||
@@ -385,7 +438,11 @@ void NetworkSerializer::updateStatisticsInfo(const InferenceEngine::ICNNNetwork&
     // If statistics exists, add it to the file
     ICNNNetworkStats *netNodesStats = nullptr;
     auto stats = netXml.append_child("statistics");
-    network.getStats(&netNodesStats, nullptr);
+    auto resultCode = network.getStats(&netNodesStats, nullptr);
+    if (resultCode != StatusCode::OK) {
+        THROW_IE_EXCEPTION << InferenceEngine::details::as_status << resultCode
+                           << "Can't get statistics info for serialization of the model";
+    }
     const NetworkStatsMap statsmap = netNodesStats->getNodesStats();
 
     auto joinCommas = [&](const std::vector<float> &v) -> std::string {
@@ -409,4 +466,4 @@ void NetworkSerializer::updateStatisticsInfo(const InferenceEngine::ICNNNetwork&
         layer.append_child("min").text().set(joinCommas(itStats.second->_minOutputs).c_str());
         layer.append_child("max").text().set(joinCommas(itStats.second->_maxOutputs).c_str());
     }
-}
\ No newline at end of file
+}
index 36dc367..3e79434 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -90,6 +90,7 @@ public:
                     }
                 }
             }
+                break;
             default:
                 THROW_IE_EXCEPTION << "Incorrect 'indices_to_set' input precision. Only FP32 and I32 are supported!";
         }
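
Note: the added break above closes a case that previously fell through into default and hit the THROW even after handling the input correctly. A minimal reproduction of the bug shape:

#include <stdexcept>

void dispatch(int precision) {
    switch (precision) {
        case 32: {
            // ... handle FP32 ...
        }
            break;  // without this, control falls through into default
        default:
            throw std::runtime_error("Incorrect input precision");
    }
}
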
index 5f39833..923fa3f 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index fc97c28..a712203 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 6f82e98..7852139 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 790ad48..6a9a13d 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 863b34a..634a705 100644 (file)
@@ -1,6 +1,7 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
+
 #pragma once
 
 #include <transform/transformation.hpp>
index 710a71e..ab630a2 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 040180a..d17fede 100644 (file)
@@ -1,6 +1,7 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
+
 #pragma once
 
 #include <transform/transformation.hpp>
index 337bb77..5a3eeb8 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index c67649d..bcefc62 100644 (file)
@@ -1,6 +1,7 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
+
 #pragma once
 
 #include <transform/transformation.hpp>
index 7e8c5a6..a8d955c 100644 (file)
@@ -31,7 +31,7 @@ uint64_t XMLParseUtils::GetUInt64Attr(const pugi::xml_node &node, const char *st
     std::string str_value = std::string(attr.value());
     std::size_t idx = 0;
     long long int_value = std::stoll(str_value, &idx, 10);
-    if (idx != str_value.length() || int_value < 0 || int_value > (std::numeric_limits<uint64_t>::max)())
+    if (idx != str_value.length() || int_value < 0)
         THROW_IE_EXCEPTION << "node <" << node.name() << "> has attribute \"" << str << "\" = \"" << str_value
                            << "\" which is not an unsigned 64 bit integer" << " at offset "
                            << node.offset_debug();
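
Note: the removed upper-bound test was dead code: int_value is a long long, and under the usual arithmetic conversions the comparison against numeric_limits<uint64_t>::max() converts the left operand to uint64_t, so the condition is false for every possible value. The remaining idx and int_value < 0 checks carry all the validation. A small demonstration:

#include <cstdint>
#include <iostream>
#include <limits>

int main() {
    long long v = std::numeric_limits<long long>::max();
    // v converts to uint64_t for the comparison, and no uint64_t value
    // can exceed uint64_t's own maximum, so this is always false:
    std::cout << (v > std::numeric_limits<std::uint64_t>::max()) << "\n";  // prints 0
    return 0;
}
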
index 7b45731..e687e43 100644 (file)
@@ -87,7 +87,7 @@ InferenceEngine::TensorDesc MKLDNNExtensionUtils::getUninitTensorDesc(const Infe
                                         std::numeric_limits<size_t>::max(), zeroArr, notInitArr});
 }
 
-bool MKLDNNExtensionUtils::initTensorsAreEqual(InferenceEngine::TensorDesc desc1, InferenceEngine::TensorDesc desc2) {
+bool MKLDNNExtensionUtils::initTensorsAreEqual(const InferenceEngine::TensorDesc &desc1, const InferenceEngine::TensorDesc &desc2) {
     if (desc1.getDims() != desc2.getDims() || desc1.getPrecision() != desc2.getPrecision())
         return false;
     if (desc1.getLayout() == InferenceEngine::Layout::ANY || desc2.getLayout() == InferenceEngine::Layout::ANY)
index 358a1e7..d4907d4 100644 (file)
@@ -22,7 +22,7 @@ public:
     static mkldnn::memory::data_type IEPrecisionToDataType(InferenceEngine::Precision prec);
     static InferenceEngine::Precision DataTypeToIEPrecision(mkldnn::memory::data_type dataType);
     static InferenceEngine::TensorDesc getUninitTensorDesc(const InferenceEngine::TensorDesc& desc);
-    static bool initTensorsAreEqual(InferenceEngine::TensorDesc desc1, InferenceEngine::TensorDesc desc2);
+    static bool initTensorsAreEqual(const InferenceEngine::TensorDesc &desc1, const InferenceEngine::TensorDesc &desc2);
 };
 
 }  // namespace MKLDNNPlugin
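
Note: initTensorsAreEqual previously took both TensorDesc arguments by value, copying their dims and layout containers on every call; the const-reference signature keeps behavior identical without the copies (the same change is applied to the lambdas in the MKLDNN graph code below). Before/after shape with a stand-in type:

#include <cstddef>
#include <vector>

struct DescSketch {                      // stand-in with a similar cost profile
    std::vector<std::size_t> dims;
};

// before: both argument vectors are copied on every call
bool equalByValue(DescSketch a, DescSketch b) { return a.dims == b.dims; }

// after: read-only access, zero copies, identical result
bool equalByRef(const DescSketch &a, const DescSketch &b) { return a.dims == b.dims; }
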
index 13b8e5f..78fca71 100644 (file)
@@ -262,7 +262,7 @@ void MKLDNNGraph::InitNodes() {
 }
 
 void MKLDNNGraph::InitEdges() {
-    auto reorderArgs = [](InferenceEngine::TensorDesc parentDesc, InferenceEngine::TensorDesc childDesc) {
+    auto reorderArgs = [](const InferenceEngine::TensorDesc &parentDesc, const InferenceEngine::TensorDesc &childDesc) {
         std::string inArgs, outArgs;
         if (parentDesc.getPrecision() != childDesc.getPrecision()) {
             inArgs += (inArgs.empty() ? "" : "_") + std::string(parentDesc.getPrecision().name());
@@ -949,7 +949,7 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network
     }
 
     bool ti_proc_ok = !NetPass::CombineRNNSeq(*clonedNetwork) ? NetPass::UnrollTI(*clonedNetwork) : true;
-    ti_proc_ok &= NetPass::UnrollRNN_if(*clonedNetwork, [] (RNNCellBase rnn) -> bool {
+    ti_proc_ok &= NetPass::UnrollRNN_if(*clonedNetwork, [] (const RNNCellBase &rnn) -> bool {
         if (rnn.clip != 0.0f)
             return true;
         if ((rnn.cellType == RNNCellBase::GRU || rnn.cellType == RNNCellBase::GRU_LBR) &&
@@ -1042,4 +1042,4 @@ void MKLDNNExecNetwork::CreateInferRequest(InferenceEngine::IInferRequest::Ptr &
 
 void MKLDNNExecNetwork::GetExecGraphInfo(InferenceEngine::ICNNNetwork::Ptr &graphPtr) {
     graphPtr = graphs[0]->dump();
-}
\ No newline at end of file
+}
index 8b9bcc8..244a986 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include "mkldnn_graph_dumper.h"
@@ -201,11 +189,10 @@ void drawer_callback(const InferenceEngine::CNNLayerPtr layer,
     auto prec = params.find(ExecGraphInfoSerialization::PRECISION);
     if (prec != params.end()) {
         printed_properties.push_back({"precision", prec->second});
+        // Set color
+        node_properties.push_back({"fillcolor", prec->second == "FP32" ? GREEN : BLUE});
     }
 
-    // Set color
-    node_properties.push_back({"fillcolor", prec->second == "FP32" ? GREEN : BLUE});
-
     // Set xlabel containing PM data if calculated
     auto perf = layer->params.find(ExecGraphInfoSerialization::PERF_COUNTER);
     node_properties.push_back({"xlabel", (perf != layer->params.end()) ? perf->second : ""});
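
Note: drawer_callback read prec->second for the fill color even when find() had returned end(); hoisting that push_back inside the if (prec != params.end()) branch removes the invalid dereference. The general shape of the guard:

#include <iostream>
#include <map>
#include <string>

void pickColor(const std::map<std::string, std::string> &params) {
    auto prec = params.find("precision");
    if (prec != params.end()) {
        // prec is dereferenced only on the found branch now
        std::cout << (prec->second == "FP32" ? "GREEN" : "BLUE") << "\n";
    }
}
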
index b419109..9351fc5 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #pragma once
index 4723403..e6d2639 100644 (file)
@@ -206,9 +206,11 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) {
                 } else {
                     if (ch1->type == Pooling) {
                         auto pool = ch1;
-                        bool is_max_pool =
-                                dynamic_cast<PoolingLayer *>(pool->getCnnLayer().get())->_type ==
-                                PoolingLayer::PoolType::MAX;
+
+                        auto* pLayer = dynamic_cast<PoolingLayer *>(pool->getCnnLayer().get());
+                        if (pLayer == nullptr)
+                            THROW_IE_EXCEPTION << "Cannot get pooling layer " << pool->getName();
+                        bool is_max_pool = pLayer->_type == PoolingLayer::PoolType::MAX;
 
                         if (is_max_pool && pool->getChildEdges().size() == 1) {
                             auto ch2 = pool->getChildEdgeAt(0)->getChild();
@@ -242,6 +244,8 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) {
             return false;
 
         auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
+        if (depthwiseNode == nullptr)
+            THROW_IE_EXCEPTION << "Cannot get depthwise node " << node->getName();
         return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift && depthwiseNode->isWithBiases()) ||
                 (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu));
     };
@@ -288,11 +292,15 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
     auto isSutableParentConvolution = [&](MKLDNNNodePtr node) {
         if (isBinaryConvolutionNode(node)) {
             auto *layer = dynamic_cast<BinaryConvolutionLayer *>(node->getCnnLayer().get());
+            if (layer == nullptr)
+                THROW_IE_EXCEPTION << "Cannot get convolution layer " << node->getName();
 
             bool isSupportedParams = layer->_group == 1;
             if (!isSupportedParams) return false;
         } else {
             auto *layer = dynamic_cast<ConvolutionLayer *>(node->getCnnLayer().get());
+            if (layer == nullptr)
+                THROW_IE_EXCEPTION << "Cannot get convolution layer " << node->getName();
 
             bool isSupportedParams = layer->_group == 1 &&
                                      ((is1x1Convolution(layer) && layer->_stride[X_AXIS] == 1 &&
@@ -306,9 +314,14 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
 
     auto isSutableChildConvolution = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
         auto* childLayer = dynamic_cast<ConvolutionLayer*>(childNode->getCnnLayer().get());
+        if (childLayer == nullptr)
+            THROW_IE_EXCEPTION << "Cannot get convolution layer " << childNode->getName();
 
         if (!isBinaryConvolutionNode(parentNode)) {
             auto* parentLayer = dynamic_cast<ConvolutionLayer*>(parentNode->getCnnLayer().get());
+
+            if (parentLayer == nullptr)
+                THROW_IE_EXCEPTION << "Cannot get convolution layer " << parentNode->getName();
             if (parentLayer->precision != childLayer->precision)
                 return false;
         }
@@ -331,6 +344,8 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
         }
 
         auto* layer = dynamic_cast<ConvolutionLayer*>(childNode->getCnnLayer().get());
+        if (layer == nullptr)
+            THROW_IE_EXCEPTION << "Cannot get convolution layer " << childNode->getName();
 
         auto inDims = childNode->inDims[0];
         auto outDims = childNode->outDims[0];
@@ -384,10 +399,14 @@ void MKLDNNGraphOptimizer::FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph)
         if (!node->getCnnLayer())
             return false;
 
-        auto* quantizeLayer = dynamic_cast<QuantizeLayer*>(node->getCnnLayer().get());
-        bool isSutableQuantize = node->getType() == Quantize && quantizeLayer->levels == 2;
+        if (node->getType() != Quantize)
+            return false;
+
+        auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(node.get());
+        if (quantizeNode == nullptr)
+            THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
 
-        return isSutableQuantize;
+        return quantizeNode->isPackedStore();
     };
 
     for (int i = 0; i < graphNodes.size(); i++) {
@@ -399,31 +418,9 @@ void MKLDNNGraphOptimizer::FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph)
 
         parent->fuseWith(child);
 
-        auto* binConvNode = dynamic_cast<MKLDNNBinaryConvolutionNode*>(parent.get());
-
         auto parents = child->parentEdges;
         for (size_t i = 0; i < parents.size(); i++) {
             auto p_edge = parents[i].lock();
-            if (p_edge->getParent()->getType() == Input) {
-                InferenceEngine::SizeVector dims;
-                dims.push_back(binConvNode->getChildEdgeAt(0)->getDims()[1]);
-
-                auto InputLowBlob = dynamic_cast<TBlob<float>*>(p_edge->getParent()->getCnnLayer()->blobs["custom"].get());
-
-                auto inputLowData = InputLowBlob->buffer().as<float*>();
-                int inputLowAxis = p_edge->getDims().ndims() == 1 ? 0 : 1;
-                bool isInputLowBroadcasted = p_edge->getDims()[inputLowAxis] != dims[0];
-
-                for (int i = 0; i < dims[0]; i++) {
-                    binConvNode->pushBinarizationThreshold(inputLowData[isInputLowBroadcasted ? 0 : i]);
-                }
-
-                break;
-            }
-        }
-
-        for (size_t i = 0; i < parents.size(); i++) {
-            auto p_edge = parents[i].lock();
             if (p_edge->getParent()->getType() == BinaryConvolution)
                 continue;
 
@@ -667,12 +664,16 @@ void MKLDNNGraphOptimizer::RemoveIdentityOperator(MKLDNNGraph &graph) {
 
         if (node->getType() == Power) {
             PowerLayer* l = dynamic_cast<PowerLayer*>(node->getCnnLayer().get());
+            if (l == nullptr)
+                THROW_IE_EXCEPTION << "Cannot get power layer " << node->getName();
 
             if (l->power == 1.0f && l->scale == 1.0f && l->offset == 0.0f) toDrop = true;
         }
 
         if (node->getType() == Depthwise && node->getCnnLayer()->type == "ScaleShift") {
             ScaleShiftLayer* l = dynamic_cast<ScaleShiftLayer*>(node->getCnnLayer().get());
+            if (l == nullptr)
+                THROW_IE_EXCEPTION << "Cannot get scale shift layer " << node->getName();
 
             if (l->_weights == nullptr && l->_biases == nullptr) toDrop = true;
         }
@@ -692,7 +693,11 @@ void MKLDNNGraphOptimizer::DropDoubleReorders(MKLDNNGraph &graph) {
             && node->getChildEdgeAt(0)->getChild()->getType() == Reorder ) {
             auto nextNode = node->getChildEdgeAt(0)->getChild();
             MKLDNNReorderNode* n = dynamic_cast<MKLDNNReorderNode*>(node.get());
+            if (n == nullptr)
+                THROW_IE_EXCEPTION << "Cannot get reorder layer " << node->getName();
             MKLDNNReorderNode* nn = dynamic_cast<MKLDNNReorderNode*>(nextNode.get());
+            if (nn == nullptr)
+                THROW_IE_EXCEPTION << "Cannot get reorder layer " << nextNode->getName();
 
             auto scales = n->_scales;
 
@@ -778,6 +783,8 @@ void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
     for (MKLDNNNodePtr& node : graph.GetNodes()) {
         if (node->getType() == Depthwise && node->getCnnLayer()->type == "ScaleShift") {
             ScaleShiftLayer* l = dynamic_cast<ScaleShiftLayer*>(node->getCnnLayer().get());
+            if (l == nullptr)
+                THROW_IE_EXCEPTION << "Cannot get scale shift layer " << node->getName();
 
             auto cur = l->insData[0].lock();
             if (cur == nullptr) {
index 573ab06..3a1e9d2 100644 (file)
@@ -78,6 +78,8 @@ void MKLDNNPlugin::MKLDNNInferRequest::InferImpl() {
                     convertedInputs.push_back(iconv);
                     iconv->allocate();
                     in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
+                    if (in_f == nullptr)
+                        THROW_IE_EXCEPTION << "Cannot get TBlob";
                     InferenceEngine::copyToFloat<uint16_t>(in_f->data(), input.second.get());
                     pushInput<float>(input.first, iconv);
                     break;
@@ -90,6 +92,8 @@ void MKLDNNPlugin::MKLDNNInferRequest::InferImpl() {
                         convertedInputs.push_back(iconv);
                         iconv->allocate();
                         in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
+                        if (in_f == nullptr)
+                            THROW_IE_EXCEPTION << "Cannot get TBlob";
                         InferenceEngine::copyToFloat<int16_t>(in_f->data(), input.second.get());
                         pushInput<float>(input.first, iconv);
                     } else {
@@ -106,6 +110,8 @@ void MKLDNNPlugin::MKLDNNInferRequest::InferImpl() {
                         convertedInputs.push_back(iconv);
                         iconv->allocate();
                         in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
+                        if (in_f == nullptr)
+                            THROW_IE_EXCEPTION << "Cannot get TBlob";
                         InferenceEngine::copyToFloat<uint8_t>(in_f->data(), input.second.get());
                         pushInput<float>(input.first, iconv);
                     } else {
index 5d9c345..d38cb05 100644 (file)
@@ -391,7 +391,7 @@ std::string MKLDNNMemory::formatToString(memory::format fmt) {
 }
 
 bool MKLDNNMemoryDesc::operator==(const MKLDNNMemoryDesc &rhs) const {
-    auto dims_equal = [] (mkldnn_memory_desc_t ldata, mkldnn_memory_desc_t rdata) {
+    auto dims_equal = [] (const mkldnn_memory_desc_t &ldata, const mkldnn_memory_desc_t &rdata) {
         if (ldata.ndims != rdata.ndims)
             return false;
         for (int i = 0; i < ldata.ndims; i++) {
@@ -400,7 +400,7 @@ bool MKLDNNMemoryDesc::operator==(const MKLDNNMemoryDesc &rhs) const {
         }
         return true;
     };
-    auto blocking_equal = [] (mkldnn_memory_desc_t ldata, mkldnn_memory_desc_t rdata) {
+    auto blocking_equal = [] (const mkldnn_memory_desc_t &ldata, const mkldnn_memory_desc_t &rdata) {
         if (ldata.ndims != rdata.ndims)
             return false;
         mkldnn_blocking_desc_t lblock = ldata.layout_desc.blocking;
index d5a48aa..250b168 100644 (file)
@@ -61,7 +61,11 @@ void Engine::SetConfig(const std::map<std::string, std::string> &config) {
         // ugly casting. can we avoid it?
         auto exe_network =
                 dynamic_cast<ExecutableNetworkBase<ExecutableNetworkInternal>*>(_loadedNetwork.get());
+        if (exe_network == nullptr)
+            THROW_IE_EXCEPTION << "Cannot get executable network!";
         auto exe_network_impl = dynamic_cast<MKLDNNExecNetwork*>(exe_network->getImpl().get());
+        if (exe_network_impl == nullptr)
+            THROW_IE_EXCEPTION << "Cannot get implementation of executable network!";
 
         exe_network_impl->setProperty(config);
     }
index b50552f..ed03f2c 100644 (file)
@@ -218,6 +218,8 @@ void MKLDNNPlugin::MKLDNNGraphlessInferRequest::InferImpl() {
                     convertedInputs.push_back(iconv);
                     iconv->allocate();
                     in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
+                    if (in_f == nullptr)
+                        THROW_IE_EXCEPTION << "Cannot get TBlob";
                     InferenceEngine::copyToFloat<uint16_t>(in_f->data(), input.second.get());
                     graph->PushInputData(input.first, iconv);
                     break;
@@ -230,6 +232,8 @@ void MKLDNNPlugin::MKLDNNGraphlessInferRequest::InferImpl() {
                         convertedInputs.push_back(iconv);
                         iconv->allocate();
                         in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
+                        if (in_f == nullptr)
+                            THROW_IE_EXCEPTION << "Cannot get TBlob";
                         InferenceEngine::copyToFloat<int16_t>(in_f->data(), input.second.get());
                         graph->PushInputData(input.first, iconv);
                     } else {
@@ -246,6 +250,8 @@ void MKLDNNPlugin::MKLDNNGraphlessInferRequest::InferImpl() {
                         convertedInputs.push_back(iconv);
                         iconv->allocate();
                         in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
+                        if (in_f == nullptr)
+                            THROW_IE_EXCEPTION << "Cannot get TBlob";
                         InferenceEngine::copyToFloat<uint8_t>(in_f->data(), input.second.get());
                         graph->PushInputData(input.first, iconv);
                     } else {
index 3b9cc7e..6ccda7f 100644 (file)
@@ -54,7 +54,7 @@ private:
     float beta = 0.0f;
     static InferenceEngine::details::caseless_map<std::string,
             std::function<void(InferenceEngine::GenericLayer*, mkldnn::algorithm&, float&, float&)>> initializers;
-    mkldnn::algorithm algorithm;
+    mkldnn::algorithm algorithm = mkldnn::algorithm::eltwise_relu;
 };
 
 }  // namespace MKLDNNPlugin
index b1e3ac2..8629eaa 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -17,6 +17,7 @@
 #include <mkldnn_types.h>
 #include <mkldnn_extension_utils.h>
 #include <ie_layers_internal.hpp>
+#include "cpu_isa_traits.hpp"
 
 using namespace mkldnn;
 using namespace MKLDNNPlugin;
@@ -234,27 +235,37 @@ void MKLDNNBinaryConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool
                 PostOpsIntBlobMemory[blob_idx]->Create(binarizationDims, memory::data_type::f32, memory::format::x);
 
                 PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
-                                                        &binarizationThresholds[0],
-                                                        binarizationThresholds.size() *
+                                                        quantizeNode->getBinarizationTresholdsPtr(),
+                                                        quantizeNode->getBinarizationTresholdsSize() *
                                                         MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
 
-                ops.append_binarization(binarization_depthwise, (const float*)PostOpsIntBlobMemory[blob_idx]->GetData());
+                PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
+                PostOpsIntBlobMemory[blob_idx+1]->Create(binarizationDims, memory::data_type::f32, memory::format::x);
+
+                PostOpsIntBlobMemory[blob_idx+1]->SetData(memory::data_type::f32, memory::x,
+                                                        quantizeNode->getBinarizationOutputMaskPtr(),
+                                                        quantizeNode->getBinarizationOutputMaskSize() *
+                                                        MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
+
+                ops.append_binarization(binarization_depthwise, (const float*)PostOpsIntBlobMemory[blob_idx+0]->GetData(),
+                                                                (const float*)PostOpsIntBlobMemory[blob_idx+1]->GetData());
 
-                blob_idx += 1;
+                blob_idx += 2;
             } else {
-                ops.append_binarization(binarization_depthwise, nullptr);
+                ops.append_binarization(binarization_depthwise, nullptr, nullptr);
             }
         }
 
         auto* convolutionNode = dynamic_cast<MKLDNNConvolutionNode *>(node.get());
         if (convolutionNode) {
             auto* convLayer = reinterpret_cast<ConvolutionLayer*>(convolutionNode->getCnnLayer().get());
-
             if (initWeights) {
+                auto w_fmt = mkldnn::impl::cpu::mayiuse(impl::cpu::cpu_isa_t::avx512_common)
+                        ? memory::format::Goihw16g : memory::format::Goihw8g;
+
                 PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
                 MKLDNNDims dwWeightsDims({dw_conv_oc, (ptrdiff_t)1, (ptrdiff_t)1, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS]});
-                PostOpsIntBlobMemory[blob_idx]->Create(dwWeightsDims, memory::data_type::f32,
-                                                            memory::format::Goihw8g);
+                PostOpsIntBlobMemory[blob_idx]->Create(dwWeightsDims, memory::data_type::f32, w_fmt);
 
                 PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::goihw,
                                                              convLayer->_weights->buffer(),
@@ -455,7 +466,3 @@ void MKLDNNBinaryConvolutionNode::initDescriptor(const InferenceEngine::LayerCon
     }
     selectedPD->getConfig() = rightConfig;
 }
-
-void MKLDNNBinaryConvolutionNode::pushBinarizationThreshold(float value) {
-    binarizationThresholds.push_back(value);
-}
\ No newline at end of file
index 659345d..0acf6bc 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -28,15 +28,14 @@ public:
         return false;
     }
     void setPostOps(mkldnn::primitive_attr &attr, bool initWeights);
-    void pushBinarizationThreshold(float value);
 
 private:
     static Register<MKLDNNBinaryConvolutionNode> reg;
-    bool withSum;
-    bool withBinarization;
-    bool isDW;
-    bool isMerged;
-    bool isGrouped;
+    bool withSum = false;
+    bool withBinarization = false;
+    bool isDW = false;
+    bool isMerged = false;
+    bool isGrouped = false;
     std::vector<ptrdiff_t> stride;
     std::vector<ptrdiff_t> dilation;
     std::vector<ptrdiff_t> paddingL;
@@ -44,16 +43,14 @@ private:
     InferenceEngine::SizeVector weightDims;
     InferenceEngine::SizeVector biasesDims;
 
-    ptrdiff_t dw_conv_oc;
-    ptrdiff_t dw_conv_ih;
-    ptrdiff_t dw_conv_iw;
+    ptrdiff_t dw_conv_oc = 0;
+    ptrdiff_t dw_conv_ih = 0;
+    ptrdiff_t dw_conv_iw = 0;
     std::vector<ptrdiff_t> dw_conv_kernel;
     std::vector<ptrdiff_t> dw_conv_strides;
     std::vector<MKLDNNMemoryPtr> PostOpsIntBlobMemory;
 
-    float pad_value;
-
-    std::vector<float> binarizationThresholds;
+    float pad_value = 0.f;
 };
 
 }  // namespace MKLDNNPlugin
index aad12ed..ca00a3f 100644 (file)
@@ -31,9 +31,9 @@ public:
     MKLDNNMemoryDesc getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
 
 private:
-    bool withBiases;
-    bool withGroups;
-    bool isDW;
+    bool withBiases = false;
+    bool withGroups = false;
+    bool isDW = false;
     size_t groupNum = 1;
     std::vector<ptrdiff_t> stride;
     std::vector<ptrdiff_t> paddingL;
index 03e4473..4107d08 100644 (file)
@@ -128,6 +128,8 @@ void MKLDNNDepthwiseNode::initValues() {
     CaselessEq<std::string> comparator;
     if (comparator(depthwiseLayer->type, "ScaleShift")) {
         auto *scshLayer = dynamic_cast<ScaleShiftLayer*>(getCnnLayer().get());
+        if (scshLayer == nullptr)
+            THROW_IE_EXCEPTION << "Cannot get scale shift layer " << getName();
         if (scshLayer->_weights == nullptr)
             THROW_IE_EXCEPTION << "ScaleShift without weights is not supported";
 
@@ -136,6 +138,8 @@ void MKLDNNDepthwiseNode::initValues() {
         broadcast = static_cast<bool>(scshLayer->_broadcast);
     } else if (comparator(depthwiseLayer->type, "PReLU")) {
         auto *preluLayer = dynamic_cast<PReLULayer*>(getCnnLayer().get());
+        if (preluLayer == nullptr)
+            THROW_IE_EXCEPTION << "Cannot get PReLU layer " << getName();
         if (preluLayer->_weights == nullptr)
             THROW_IE_EXCEPTION << "PReLU without weights is not supported";
 
index 00b60ab..07365b2 100644 (file)
@@ -48,11 +48,11 @@ private:
 
     static Register<MKLDNNDepthwiseNode> reg;
 
-    mkldnn::algorithm algorithm;
+    mkldnn::algorithm algorithm = mkldnn::algorithm::depthwise_scale_shift;
     size_t realWeightSize = 0;
     size_t realBiasSize = 0;
-    bool withBiases;
-    bool broadcast;
+    bool withBiases = false;
+    bool broadcast = false;
 };
 
 }  // namespace MKLDNNPlugin
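
[Review note] This and the surrounding header hunks replace uninitialized members with in-class default initializers, so a constructor path that skips a field no longer produces an indeterminate read. A minimal illustration (hypothetical type):

    struct Example {
        bool withBiases = false;  // previously uninitialized: reading it was undefined behavior
        float alpha = 1.0f;       // now has a defined value on every path
    };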
index fdb5eeb..10e188a 100644 (file)
@@ -24,11 +24,15 @@ MKLDNNEltwiseNode::MKLDNNEltwiseNode(const InferenceEngine::CNNLayerPtr& layer,
 
 bool MKLDNNEltwiseNode::isSum() {
     auto * eltwiseLayer = dynamic_cast<EltwiseLayer*>(getCnnLayer().get());
+    if (eltwiseLayer == nullptr)
+        THROW_IE_EXCEPTION << "Cannot get eltwise layer " << getName();
     return eltwiseLayer->_operation == EltwiseLayer::Sum;
 }
 
 bool MKLDNNEltwiseNode::isUnitScales() {
     auto * eltwiseLayer = dynamic_cast<EltwiseLayer*>(getCnnLayer().get());
+    if (eltwiseLayer == nullptr)
+        THROW_IE_EXCEPTION << "Cannot get eltwise layer " << getName();
 
     if (eltwiseLayer->coeff.empty())
         return true;
index a777b54..8979039 100644 (file)
@@ -127,13 +127,8 @@ void MKLDNNFullyConnectedNode::createPrimitive() {
 
     std::shared_ptr<mkldnn::primitive_attr> attr = initPrimitiveAttr();
     std::shared_ptr<inner_product_forward::primitive_desc> prim_desc;
-    if (attr == nullptr) {
-        prim_desc = std::make_shared<inner_product_forward::primitive_desc>(
-                createPrimitiveDescriptor<inner_product_forward::primitive_desc, inner_product_forward::desc>(*attr));
-    } else {
-        prim_desc = std::make_shared<inner_product_forward::primitive_desc>(
-                createPrimitiveDescriptor<inner_product_forward::primitive_desc, inner_product_forward::desc>(*attr));
-    }
+    prim_desc = std::make_shared<inner_product_forward::primitive_desc>(
+            createPrimitiveDescriptor<inner_product_forward::primitive_desc, inner_product_forward::desc>(*attr));
 
     if (internalBlobs.size() > 1) {
         prim.reset(new inner_product_forward(*prim_desc,
index 2ff862f..07c5f83 100644 (file)
@@ -52,6 +52,8 @@ void MKLDNNGemmNode::getSupportedDescriptors() {
     xAxis = nDims - 1;
     yAxis = nDims - 2;
 
+        // The check inDims0[xAxis] != inDims1[yAxis] is correct due to the layer's semantics
+    // coverity[copy_paste_error]
     if (inDims0[xAxis] != inDims1[yAxis] || inDims0[yAxis] != outDims[yAxis] || inDims1[xAxis] != outDims[xAxis])
         THROW_IE_EXCEPTION << "Spatial input and output dimensions are incorrect for layer " << getName();
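
[Review note] Worked example for the annotated check above, with hypothetical shapes A = [2, 3], B = [3, 4], C = [2, 4], so xAxis = 1 and yAxis = 0:

    // inDims0[xAxis] = 3 == inDims1[yAxis] = 3  -> inner (reduction) dims match
    // inDims0[yAxis] = 2 == outDims[yAxis] = 2  -> output rows come from A
    // inDims1[xAxis] = 4 == outDims[xAxis] = 4  -> output columns come from B
    // The cross-indexed comparison is therefore intentional, not a copy-paste slip.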
 
index 94c4e15..6168e3d 100644 (file)
@@ -25,15 +25,15 @@ public:
 
 private:
     static Register<MKLDNNGemmNode> reg;
-    float alpha;
-    float beta;
-    bool transposeA;
-    bool transposeB;
+    float alpha = 1.0f;
+    float beta = 1.0f;
+    bool transposeA = false;
+    bool transposeB = false;
 
-    int xAxis;
-    int yAxis;
+    int xAxis = 0;
+    int yAxis = 0;
 
-    bool isThreeInputs;
+    bool isThreeInputs = false;
 
     std::vector<int> aOffsets;
     std::vector<int> bOffsets;
index 52de049..16beb47 100644 (file)
@@ -29,11 +29,11 @@ public:
 
 private:
     static Register<MKLDNNLrnNode> reg;
-    bool isAcrossMaps;
-    int size;
-    int k;
-    float alpha;
-    float beta;
+    bool isAcrossMaps = false;
+    int size = 1;
+    int k = 1;
+    float alpha = 1.0f;
+    float beta = 1.0f;
 };
 
 }  // namespace MKLDNNPlugin
index cee6404..bfaaa0d 100644 (file)
@@ -28,8 +28,8 @@ public:
 
 private:
     static Register<MKLDNNPoolingNode> reg;
-    InferenceEngine::PoolingLayer::PoolType type;
-    bool exclude_pad;
+    InferenceEngine::PoolingLayer::PoolType type = InferenceEngine::PoolingLayer::MAX;
+    bool exclude_pad = false;
     std::vector<ptrdiff_t> stride;
     std::vector<ptrdiff_t> paddingL;
     std::vector<ptrdiff_t> paddingR;
index 85e0067..7318c49 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -19,7 +19,7 @@ using namespace InferenceEngine::details;
 
 MKLDNNQuantizeNode::MKLDNNQuantizeNode(InferenceEngine::CNNLayerPtr layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) {}
 
-void MKLDNNQuantizeNode::getSupportedDescriptors() {
+void MKLDNNQuantizeNode::initValues() {
     InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
     if (precision != InferenceEngine::Precision::FP32)
         THROW_IE_EXCEPTION << "Quantize layer " << getName() << " supports only FP32 precision";
@@ -30,29 +30,95 @@ void MKLDNNQuantizeNode::getSupportedDescriptors() {
 
     levels = quantizeLayer->levels;
     if (levels <= 1)
-        THROW_IE_EXCEPTION << "Quantize layer " << getName() << "supports only parameter levels > 1";
+        THROW_IE_EXCEPTION << "Quantize layer " << getName() << " supports only parameter levels > 1";
+
+    size_t inputDataEdgeIdx = 0;
+    size_t inputLowEdgeIdx = 0;
+    size_t outputLowEdgeIdx = 0;
+    size_t outputHighEdgeIdx = 0;
+    auto parents = getParentEdges();
+    for (size_t i = 0; i < parents.size(); i++) {
+        auto p_edge = parents[i].lock();
+        if (p_edge->getParent()->getType() == Input && p_edge->getParent()->getCnnLayer()->type == "Const") {
+            inputLowEdgeIdx = i;
+            outputLowEdgeIdx = i + 2;
+            outputHighEdgeIdx = i + 3;
+            inputDataEdgeIdx = i == 0 ? 4 : 0;
+            break;
+        }
+    }
+
+    for (size_t i = 0; i < parents.size(); i++) {
+        auto p_edge = parents[i].lock();
+        if (p_edge->getParent()->getType() == Input) {
+            if (p_edge->getDims().ndims() != 1 && p_edge->getDims().ndims() != 4) {
+                THROW_IE_EXCEPTION << "Quantize layer " << getName() << " supports only 1D or 4D inputs at edge " << i;
+            }
+        }
+    }
 
     if (getParentEdges().size() != 5)
         THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
     if (getChildEdges().empty())
         THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
 
-    if (getParentEdgeAt(0)->getDims().ndims() != 4) {
-        THROW_IE_EXCEPTION << "Quantize layer " << getName() << "supports only 4D input at edge 0";
+    if (getParentEdgeAt(inputDataEdgeIdx)->getDims().ndims() != 4) {
+        THROW_IE_EXCEPTION << "Quantize layer " << getName() << " supports only 4D input at edge " << inputDataEdgeIdx;
     }
 
-    for (int i = 1; i < 5; i++) {
-        if (getParentEdgeAt(i)->getDims().ndims() != 1 && getParentEdgeAt(i)->getDims().ndims() != 4) {
-            THROW_IE_EXCEPTION << "Quantize layer " << getName() << "supports only 1D or 4D inputs at edge " << i;
+    auto outputLowBlob = dynamic_cast<TBlob<float>*>(getParentEdgeAt(outputLowEdgeIdx)->getParent()->getCnnLayer()->blobs["custom"].get());
+    auto outputLowData = outputLowBlob->buffer().as<float*>();
+    int outputLowAxis = getParentEdgeAt(outputLowEdgeIdx)->getDims().ndims() == 1 ? 0 : 1;
+    auto outputHighBlob = dynamic_cast<TBlob<float>*>(getParentEdgeAt(outputHighEdgeIdx)->getParent()->getCnnLayer()->blobs["custom"].get());
+    auto outputHighData = outputHighBlob->buffer().as<float*>();
+    int outputHighAxis = getParentEdgeAt(outputHighEdgeIdx)->getDims().ndims() == 1 ? 0 : 1;
+
+    bool isBinarization = levels == 2;
+    for (int i = 0; i < getParentEdgeAt(outputLowEdgeIdx)->getDims()[outputLowAxis]; i++) {
+        if (outputLowData[i] != 1.f && outputLowData[i] != 0.f) {
+            isBinarization = false;
+            break;
         }
     }
 
-    canStorePacked = getChildEdges().size() == 1 && getChildEdgeAt(0)->getChild()->getType() == BinaryConvolution;
+    for (int i = 0; i < getParentEdgeAt(outputHighEdgeIdx)->getDims()[outputHighAxis]; i++) {
+        if (outputHighData[i] != 1.f && outputHighData[i] != 0.f) {
+            isBinarization = false;
+            break;
+        }
+    }
 
-    if (canStorePacked) {
+    canStorePacked = isBinarization && getChildEdges().size() == 1 && getChildEdgeAt(0)->getChild()->getType() == BinaryConvolution;
+
+    InferenceEngine::SizeVector dims;
+    dims.push_back(getParentEdgeAt(inputDataEdgeIdx)->getDims()[1]);
+
+    auto InputLowBlob = dynamic_cast<TBlob<float>*>(getParentEdgeAt(inputLowEdgeIdx)->getParent()->getCnnLayer()->blobs["custom"].get());
+
+    auto inputLowData = InputLowBlob->buffer().as<float*>();
+    int inputLowAxis = getParentEdgeAt(inputLowEdgeIdx)->getDims().ndims() == 1 ? 0 : 1;
+    bool isInputLowBroadcasted = getParentEdgeAt(inputLowEdgeIdx)->getDims()[inputLowAxis] != dims[0];
+
+    for (int i = 0; i < dims[0]; i++) {
+        binarizationThresholds.push_back(inputLowData[isInputLowBroadcasted ? 0 : i]);
+    }
+
+    bool isOutputHighBroadcasted = getParentEdgeAt(outputHighEdgeIdx)->getDims()[outputHighAxis] != dims[0];
+    for (int i = 0; i < dims[0]; i++) {
+        uint32_t mask = outputHighData[isOutputHighBroadcasted ? 0 : i] == 1.f ? 0xffffffff : 0x00000000;
+
+        binarizationOutputMask.push_back(mask);
+    }
+
+    initialized = true;
+}
+
+void MKLDNNQuantizeNode::getSupportedDescriptors() {
+    if (isPackedStore()) {
         mkldnn::memory::data_type idt = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32);
         mkldnn::memory::data_type ddt = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::BIN);
         mkldnn::memory::data_type wdt = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32);
+        mkldnn::memory::data_type omdt = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32);
 
         MKLDNNMemoryDesc in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), idt, memory::nhwc);
         MKLDNNMemoryDesc out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), ddt, memory::nhwc);
@@ -61,26 +127,13 @@ void MKLDNNQuantizeNode::getSupportedDescriptors() {
         weightDims.push_back(getParentEdgeAt(0)->getDims()[1]);
         MKLDNNDims blocked_weightDims(weightDims);
         MKLDNNMemoryDesc wgh_candidate{blocked_weightDims, wdt, memory::x};
-
+        MKLDNNMemoryDesc om_candidate{blocked_weightDims, omdt, memory::x};
 
         std::shared_ptr<mkldnn::binarization_forward::desc> bin_conv_desc;
-        bin_conv_desc.reset(new binarization_forward::desc(prop_kind::forward_scoring, algorithm::binarization_depthwise,
-                                                           in_candidate, wgh_candidate, out_candidate));
+        bin_conv_desc.reset(new binarization_forward::desc(prop_kind::forward_scoring, algorithm::binarization_depthwise,
+                                                           in_candidate, wgh_candidate, om_candidate, out_candidate));
 
         descs.emplace_back(bin_conv_desc);
-
-        InferenceEngine::SizeVector dims;
-        dims.push_back(getParentEdgeAt(0)->getDims()[1]);
-
-        auto InputLowBlob = dynamic_cast<TBlob<float>*>(getParentEdgeAt(1)->getParent()->getCnnLayer()->blobs["custom"].get());
-
-        auto inputLowData = InputLowBlob->buffer().as<float*>();
-        int inputLowAxis = getParentEdgeAt(1)->getDims().ndims() == 1 ? 0 : 1;
-        bool isInputLowBroadcasted = getParentEdgeAt(1)->getDims()[inputLowAxis] != dims[0];
-
-        for (int i = 0; i < dims[0]; i++) {
-            binarizationThresholds.push_back(inputLowData[isInputLowBroadcasted ? 0 : i]);
-        }
     }
 }
 
@@ -121,7 +174,7 @@ void MKLDNNQuantizeNode::initSupportedPrimitiveDescriptors() {
 
     supportedPrimitiveDescriptors.push_back(same(memory::nhwc, ref_any));
 
-    if (canStorePacked) {
+    if (isPackedStore()) {
         primitive_desc_iterator itpd = descs[0].createPrimitiveDescriptorIterator(getEngine());
         do {
             impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
@@ -148,11 +201,17 @@ void MKLDNNQuantizeNode::createPrimitive() {
 
         MKLDNNMemoryDesc binarizationDataDesc = {{getParentEdgeAt(0)->getDims()[1]}, memory::f32, memory::x};
         auto binarizationDataMem = std::make_shared<MKLDNNMemory>(getEngine());
-        binarizationDataMem->Create(binarizationDataDesc, &binarizationThresholds[0]);
+        binarizationDataMem->Create(binarizationDataDesc, getBinarizationTresholdsPtr());
         internalBlobMemory.push_back(binarizationDataMem);
 
+        MKLDNNMemoryDesc binarizationMaskDataDesc = {{getParentEdgeAt(0)->getDims()[1]}, memory::f32, memory::x};
+        auto binarizationMaskDataMem = std::make_shared<MKLDNNMemory>(getEngine());
+        binarizationMaskDataMem->Create(binarizationMaskDataDesc, getBinarizationOutputMaskPtr());
+        internalBlobMemory.push_back(binarizationMaskDataMem);
+
         prim.reset(new binarization_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(),
                                             internalBlobMemory[0]->GetPrimitive(),
+                                            internalBlobMemory[1]->GetPrimitive(),
                                             getChildEdgeAt(0)->getMemory().GetPrimitive()));
     }
 }
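
[Review note] Worked example of the mask encoding introduced in initValues() above; the uint32_t masks are later reinterpreted as float storage (see getBinarizationOutputMaskPtr) so they fit the existing f32 memory descriptors:

    // outputHighData[i] == 1.0f -> mask = 0xffffffff (all bits set)
    // outputHighData[i] == 0.0f -> mask = 0x00000000
    uint32_t mask = outputHigh == 1.f ? 0xffffffff : 0x00000000;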
index 644926c..cbb902b 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -23,14 +23,48 @@ public:
     bool created() const override;
     void execute(mkldnn::stream strm) override;
 
+    const float* getBinarizationTresholdsPtr() {
+        if (!initialized)
+            initValues();
+        return &binarizationThresholds[0];
+    }
+
+    size_t getBinarizationTresholdsSize() {
+        if (!initialized)
+            initValues();
+        return binarizationThresholds.size();
+    }
+
+    const float* getBinarizationOutputMaskPtr() {
+        if (!initialized)
+            initValues();
+        return reinterpret_cast<float*>(&binarizationOutputMask[0]);
+    }
+
+    size_t getBinarizationOutputMaskSize() {
+        if (!initialized)
+            initValues();
+        return binarizationOutputMask.size();
+    }
+
+    bool isPackedStore() {
+        if (!initialized)
+            initValues();
+        return canStorePacked;
+    }
 
 private:
+    void initValues();
+
+    bool initialized = false;
+
     static Register<MKLDNNQuantizeNode> reg;
 
-    bool canStorePacked;
-    int levels;
+    bool canStorePacked = false;
+    int levels = -1;
 
     std::vector<float> binarizationThresholds;
+    std::vector<uint32_t> binarizationOutputMask;
 };
 
 }  // namespace MKLDNNPlugin
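
[Review note] The accessors above lazily run initValues() on first use, so a consumer can query binarization data before getSupportedDescriptors() is called. A hypothetical caller, using only names from this diff (consume is a placeholder; the "Tresholds" spelling is as in the source):

    auto* q = dynamic_cast<MKLDNNQuantizeNode*>(node.get());
    if (q != nullptr && q->isPackedStore())  // triggers initValues() lazily
        consume(q->getBinarizationTresholdsPtr(),
                q->getBinarizationTresholdsSize());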
index 103f49d..d73967b 100644 (file)
@@ -74,7 +74,7 @@ void MKLDNNReorderNode::createPrimitive() {
             dstMemPtr->GetDescriptor(), dstMemPtr->GetPrimitive().get_data_handle());
 }
 
-void MKLDNNReorderNode::createReorderPrimitive(mkldnn::memory::desc srcDesc, void* srcPtr, mkldnn::memory::desc dstDesc, void* dstPtr) {
+void MKLDNNReorderNode::createReorderPrimitive(const mkldnn::memory::desc &srcDesc, void* srcPtr, const mkldnn::memory::desc &dstDesc, void* dstPtr) {
     src_blocked = std::make_shared<MKLDNNMemory>(getEngine());
     src_blocked->Create(srcDesc, srcPtr);
 
index 32c3736..bb70174 100644 (file)
@@ -51,7 +51,7 @@ private:
     MKLDNNMemoryPtr dst_blocked;
     MKLDNNMemoryPtr src_blocked;
 
-    void createReorderPrimitive(mkldnn::memory::desc srcDesc, void* srcPtr, mkldnn::memory::desc dstDesc, void* dstPtr);
+    void createReorderPrimitive(const mkldnn::memory::desc &srcDesc, void* srcPtr, const mkldnn::memory::desc &dstDesc, void* dstPtr);
 };
 
 }  // namespace MKLDNNPlugin
index 7f61fce..f511700 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include "blob_dump.h"
@@ -49,7 +37,7 @@ struct IEB_HEADER {
 };
 
 static IEB_HEADER prepare_header(const TensorDesc& desc) {
-    IEB_HEADER header;
+    IEB_HEADER header = {0};
 
     header.magic[0] = IEB_MAGIC[0];
     header.magic[1] = IEB_MAGIC[1];
index 1390c18..9dc66ea 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #pragma once
diff --git a/inference-engine/src/vpu/CMakeLists.txt b/inference-engine/src/vpu/CMakeLists.txt
new file mode 100644 (file)
index 0000000..f824d89
--- /dev/null
@@ -0,0 +1,43 @@
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+# Locate firmware files
+#
+
+find_file(VPU_FIRMWARE_MA2450_FILE MvNCAPI-ma2450.mvcmd "${VPU_FIRMWARE_MA2450}/mvnc")
+find_file(VPU_FIRMWARE_MA2480_FILE MvNCAPI-ma2480.mvcmd "${VPU_FIRMWARE_MA2480}/mvnc")
+
+if(NOT VPU_FIRMWARE_MA2450_FILE OR NOT VPU_FIRMWARE_MA2480_FILE)
+    message(FATAL_ERROR "[VPU] Missing firmware")
+endif()
+
+#
+# Build common part
+#
+
+add_subdirectory(graph_transformer)
+
+add_subdirectory(
+    "${IE_MAIN_SOURCE_DIR}/thirdparty/movidius"
+    "${CMAKE_CURRENT_BINARY_DIR}/thirdparty/movidius")
+
+#
+# Build plugins
+#
+
+set(plugin_target "")
+
+if(ENABLE_MYRIAD)
+    add_subdirectory(myriad_plugin)
+    set(plugin_target "myriadPlugin")
+endif()
+
+#
+# Copy firmware to the directory with binaries
+#
+
+set(firmware_out_dir "$<TARGET_FILE_DIR:${plugin_target}>")
+add_custom_target(vpu_copy_firmware ALL
+    COMMAND "${CMAKE_COMMAND}" -E copy "${VPU_FIRMWARE_MA2450_FILE}" "${firmware_out_dir}/MvNCAPI-ma2450.mvcmd"
+    COMMAND "${CMAKE_COMMAND}" -E copy "${VPU_FIRMWARE_MA2480_FILE}" "${firmware_out_dir}/MvNCAPI-ma2480.mvcmd"
+    COMMENT "[VPU] Copy firmware")
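
[Review note] The find_file() calls above expect VPU_FIRMWARE_MA2450/VPU_FIRMWARE_MA2480 to point at unpacked firmware packages whose mvnc subdirectory holds the .mvcmd files; these variables are normally set elsewhere in the build. A hypothetical manual configuration (paths are placeholders):

    # Hypothetical cache entries; only the directory layout is implied by this file.
    set(VPU_FIRMWARE_MA2450 "/opt/intel/firmware_ma2450" CACHE PATH "MA2450 firmware package")
    set(VPU_FIRMWARE_MA2480 "/opt/intel/firmware_ma2480" CACHE PATH "MA2480 firmware package")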
diff --git a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
new file mode 100644 (file)
index 0000000..f16f72b
--- /dev/null
@@ -0,0 +1,51 @@
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set(TARGET_NAME "vpu_graph_transformer")
+
+file(GLOB_RECURSE SOURCES *.cpp *.hpp *.h)
+
+add_library(${TARGET_NAME} STATIC ${SOURCES})
+
+set_ie_threading_interface_for(${TARGET_NAME})
+
+# TODO: enable some day and fix all warnings
+# if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+#     target_compile_options(${TARGET_NAME} PRIVATE "-Wall")
+# endif()
+
+target_include_directories(${TARGET_NAME}
+    PUBLIC
+        "${CMAKE_CURRENT_SOURCE_DIR}/include")
+target_include_directories(${TARGET_NAME}
+    SYSTEM PUBLIC
+        "${IE_MAIN_SOURCE_DIR}/thirdparty/pugixml/src"
+        "${IE_MAIN_SOURCE_DIR}/include"
+        "${IE_MAIN_SOURCE_DIR}/src/inference_engine"
+        "${IE_MAIN_SOURCE_DIR}/thirdparty/movidius/mvnc/include")
+
+set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME})
+
+add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
+
+# unit-test support for the graph transformer
+if (WIN32)
+    set(UTEST_SOURCES ${SOURCES})
+
+    add_library(${TARGET_NAME}_test_static STATIC ${UTEST_SOURCES})
+
+    set_ie_threading_interface_for(${TARGET_NAME}_test_static)
+
+    # static linkage to inference_engine library
+    target_compile_definitions(${TARGET_NAME}_test_static PUBLIC -DUSE_STATIC_IE)
+    get_target_property(target_includes ${TARGET_NAME} INTERFACE_INCLUDE_DIRECTORIES)
+
+    target_include_directories(${TARGET_NAME}_test_static SYSTEM PUBLIC ${target_includes})
+
+    set_target_properties(${TARGET_NAME}_test_static PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}_test_static)
+else()
+    add_library(${TARGET_NAME}_test_static ALIAS ${TARGET_NAME})
+endif()
+
+target_link_libraries(${TARGET_NAME} PUBLIC pugixml)
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/allocator.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/allocator.hpp
new file mode 100644 (file)
index 0000000..e6f2cd0
--- /dev/null
@@ -0,0 +1,137 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <unordered_set>
+#include <list>
+#include <vector>
+
+#include <vpu/utils/enums.hpp>
+#include <vpu/model/stage.hpp>
+#include <vpu/model/data.hpp>
+#include <vpu/model/edges.hpp>
+#include <vpu/allocator/structs.hpp>
+#include <vpu/allocator_shaves.hpp>
+
+namespace vpu {
+
+//
+// UsedMemory
+//
+
+struct UsedMemory final {
+    int BSS = 0;
+    int CMX = 0;
+    int blob = 0;
+    int input = 0;
+    int output = 0;
+};
+
+void printTo(std::ostream& os, const UsedMemory& usedMemory);
+void printTo(DotLabel& lbl, const UsedMemory& usedMemory);
+
+//
+// AllocationResult
+//
+
+VPU_DECLARE_ENUM(AllocationStatus,
+    OK,
+    SHAVES_FAILED,
+    DATA_FAILED)
+
+struct AllocationResult final {
+    AllocationStatus status = AllocationStatus::OK;
+    Stage failedStage;
+};
+
+//
+// DeallocationMode
+//
+
+//
+// The following deallocation modes are supported to improve performance:
+//   * JustFree - the usual data deallocation scheme
+//   * MoveFromCMX - checks the tensor and reallocates it to DDR if it does not meet CMX requirements
+//
+
+VPU_DECLARE_ENUM(DeallocationMode,
+    JustFree,
+    MoveFromCMX)
+
+//
+// Allocator
+//
+
+class Allocator final {
+public:
+    Allocator();
+
+    void setBatchSize(int batchSize) { _modelBatchSize = batchSize; }
+
+    void reset();
+
+    /**
+     * Allocates memory for a single data node
+     */
+    bool allocateData(const Data& data);
+    void freeData(const Data& data, DeallocationMode mode = DeallocationMode::JustFree);
+
+    void selfCheck();
+
+    UsedMemory usedMemory() const;
+
+    DataVector getAllocatedDatas(MemoryType memType) const;
+
+    void setNeedToAllocNonIntermData() { _needToAllocNonIntermData = true; }
+    /**
+     * Allocates memory for the whole vector of data nodes
+     */
+    AllocationResult preprocess(const ModelPtr& model);
+
+    DataSet& getCandidatesForCMX() { return _candidatesForCMX; }
+    bool removeCMXCandidates(const Data& data);
+
+    AllocatorForShaves& getAllocatorOfShaves() { return _allocatorOfShaves; }
+
+private:
+    allocator::MemChunk* allocateMem(MemoryType memType, int size, int inUse);
+    void freeMem(allocator::MemChunk* chunk);
+
+    allocator::MemChunk* addNewChunk(allocator::MemoryPool& pool, MemoryType memType, int offset, int pointer, int size, int inUse);
+    allocator::MemChunk* checkMemPool(allocator::MemoryPool& pool, MemoryType memType, int size, int inUse);
+
+    void extractDatas(MemoryType memType, const DataSet& from, DataVector& out) const;
+
+private:
+    int _modelBatchSize = 1;
+
+    int _maxCmxSize = 0;
+
+    allocator::MemoryPool _ddrMemoryPool;
+    allocator::MemoryPool _cmxMemoryPool;
+    EnumMap<MemoryType, allocator::MemoryPool*> _memPools;
+
+    AllocatorForShaves _allocatorOfShaves;
+
+    DataSet _allocatedData;
+    DataSet _allocatedIntermData;
+
+    DataMap<allocator::MemChunk*> _memChunksPerData;
+
+    int _blobMemOffset = 0;
+    int _inputMemOffset = 0;
+    int _outputMemOffset = 0;
+
+    /**
+     * Means that Model::_datas list was changed in some way
+     */
+    bool _needToAllocNonIntermData = true;
+
+    DataSet _candidatesForCMX;
+};
+
+int calcAllocationSize(const Data& data);
+
+}  // namespace vpu
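
[Review note] A hypothetical caller of the Allocator API declared above, using only names from this header; model is assumed to be a valid vpu::ModelPtr:

    vpu::Allocator allocator;
    allocator.setBatchSize(1);
    auto result = allocator.preprocess(model);        // allocates the whole data vector
    if (result.status != vpu::AllocationStatus::OK) {
        // result.failedStage identifies the stage where allocation gave up
    }
    vpu::UsedMemory used = allocator.usedMemory();    // per-section usage counters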
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/allocator/structs.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/allocator/structs.hpp
new file mode 100644 (file)
index 0000000..2f9797f
--- /dev/null
@@ -0,0 +1,62 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <list>
+#include <vector>
+
+#include <vpu/utils/enums.hpp>
+#include <vpu/model/data.hpp>
+
+namespace vpu {
+
+
+//
+// Common allocation constants
+//
+
+const int DDR_MAX_SIZE = 512 * 1024 * 1024;
+const int CMX_SLICE_SIZE = 128 * 1024;
+const int DATA_ALIGNMENT = 64;
+
+//
+// Allocator Structs
+//
+
+namespace allocator {
+
+struct MemChunk final {
+    MemoryType memType = MemoryType::DDR;
+    int pointer = 0;
+    int offset = 0;
+    int size = 0;
+    int inUse = 0;
+
+    std::list<MemChunk>::iterator _posInList;
+};
+
+struct FreeMemory final {
+    int offset = 0;
+    int size = 0;
+};
+
+struct MemoryPool final {
+    int curMemOffset = 0;
+    int memUsed = 0;
+    std::list<MemChunk> allocatedChunks;
+    std::vector<FreeMemory> freePool;
+
+    void clear() {
+        curMemOffset = 0;
+        memUsed = 0;
+        allocatedChunks.clear();
+        freePool.clear();
+    }
+};
+
+}  // namespace allocator
+
+
+}  // namespace vpu
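
[Review note] Worked example with the constants above, assuming chunk sizes are rounded up to DATA_ALIGNMENT (the rounding policy itself is not shown in this header):

    // alignUp(100) == (100 + 63) / 64 * 64 == 128 bytes,
    // so one CMX slice (128 * 1024 == 131072 bytes) fits 1024 such chunks.
    int alignUp(int size) { return (size + DATA_ALIGNMENT - 1) / DATA_ALIGNMENT * DATA_ALIGNMENT; }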
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/allocator_shaves.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/allocator_shaves.hpp
new file mode 100644 (file)
index 0000000..816841f
--- /dev/null
@@ -0,0 +1,44 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <unordered_set>
+#include <list>
+#include <vector>
+
+#include <vpu/utils/enums.hpp>
+#include <vpu/model/stage.hpp>
+#include <vpu/model/data.hpp>
+#include <vpu/model/edges.hpp>
+#include <vpu/allocator/structs.hpp>
+
+namespace vpu {
+
+//
+// AllocatorForShaves
+//
+
+class AllocatorForShaves final {
+public:
+    explicit AllocatorForShaves(allocator::MemoryPool &cmxMemoryPool);
+
+    void reset();
+
+    bool allocateSHAVEs(
+                const Stage& stage,
+                StageSHAVEsRequirements reqs);
+    void freeSHAVEs();
+
+    int getLockedSHAVEs() { return _lockedSHAVEs; }
+
+    void selfCheck();
+
+private:
+    int _lockedSHAVEs = 0;
+
+    allocator::MemoryPool &_cmxMemoryPool;
+};
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/backend/backend.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/backend/backend.hpp
new file mode 100644 (file)
index 0000000..f7b5cab
--- /dev/null
@@ -0,0 +1,58 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <set>
+#include <vector>
+#include <utility>
+
+#include <ie_layers.h>
+
+#include <vpu/graph_transformer.hpp>
+#include <vpu/model/model.hpp>
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+class BackEnd final : public std::enable_shared_from_this<BackEnd> {
+public:
+    using Ptr = std::shared_ptr<BackEnd>;
+
+    CompiledGraph::Ptr build(
+            const Model::Ptr& model,
+            const std::vector<ie::CNNLayerPtr>& allLayers);
+
+    void dumpModel(
+            const Model::Ptr& model,
+            const std::string& postfix = std::string());
+
+private:
+    void serialize(
+            const Model::Ptr& model,
+            std::vector<char>& blob,
+            std::pair<char*, size_t>& blobHeader,
+            int& numActiveStages);
+
+    void getMetaData(
+            const Model::Ptr& model,
+            const std::vector<ie::CNNLayerPtr>& allLayers,
+            std::vector<StageMetaInfo>& metaData);
+
+    void extractDataInfo(
+            const Model::Ptr& model,
+            DataInfo& inputInfo,
+            DataInfo& outputInfo);
+
+#ifndef NDEBUG
+    void dumpModelToDot(
+            const Model::Ptr& model,
+            const std::string& fileName);
+#endif
+};
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/backend/blob_format.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/backend/blob_format.hpp
new file mode 100644 (file)
index 0000000..46a4cf5
--- /dev/null
@@ -0,0 +1,65 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <cstdint>
+#include <cstddef>
+
+#include <vpu/utils/extra.hpp>
+
+namespace vpu {
+
+// This value is both a float NaN bit pattern and a prime number
+const uint32_t STAGE_BORDER_SYMBOL = 0x7f83ff19;
+
+const uint32_t EI_NIDENT = 16;
+
+VPU_PACKED(ElfN_Ehdr {
+    uint8_t  e_ident[EI_NIDENT];
+    uint16_t e_type;
+    uint16_t e_machine;
+    uint32_t e_version;
+    uint32_t e_entry;
+    uint32_t e_phoff;
+    uint32_t e_shoff;
+    uint32_t e_flags;
+    uint16_t e_ehsize;
+    uint16_t e_phentsize;
+    uint16_t e_phnum;
+    uint16_t e_shentsize;
+    uint16_t e_shnum;
+    uint16_t e_shstrndx;
+};)
+
+VPU_PACKED(mv_blob_header {
+    uint32_t magic_number;
+    uint32_t file_size;
+    uint32_t blob_ver_major;
+    uint32_t blob_ver_minor;
+    uint32_t inputs_count;
+    uint32_t outputs_count;
+    uint32_t stages_count;
+    uint32_t inputs_size;
+    uint32_t outputs_size;
+    uint32_t batch_size;
+    uint32_t bss_mem_size;
+    uint32_t number_of_cmx_slices;
+    uint32_t number_of_shaves;
+    uint32_t has_hw_stage;
+    uint32_t has_shave_stage;
+    uint32_t has_dma_stage;
+    uint32_t input_info_section_offset;
+    uint32_t output_info_section_offset;
+    uint32_t stage_section_offset;
+    uint32_t const_data_section_offset;
+};)
+
+VPU_PACKED(mv_stage_header {
+    uint32_t stage_length;
+    uint32_t stage_type;
+    uint32_t numShaves;
+};)
+
+}  //  namespace vpu
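
[Review note] Size check for the packed structs above, assuming VPU_PACKED removes padding as its name suggests: ElfN_Ehdr is 16 + 2*2 + 5*4 + 6*2 = 52 bytes and mv_blob_header is 20 * 4 = 80 bytes, i.e. the 132-byte header span that BlobReader::getHeader() (declared later in this diff) reports:

    // Hypothetical compile-time checks, valid only under the packed-layout assumption.
    static_assert(sizeof(vpu::ElfN_Ehdr) == 52, "packed ELF header");
    static_assert(sizeof(vpu::mv_blob_header) == 80, "packed blob header");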
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/backend/blob_serializer.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/backend/blob_serializer.hpp
new file mode 100644 (file)
index 0000000..6df0ebe
--- /dev/null
@@ -0,0 +1,50 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vector>
+
+#include <vpu/model/base.hpp>
+
+namespace vpu {
+
+class BlobSerializer final {
+public:
+    template <typename T>
+    int append(const T& val) {
+        auto curPos = _data.size();
+
+        _data.insert(
+            _data.end(),
+            reinterpret_cast<const char*>(&val),
+            reinterpret_cast<const char*>(&val) + sizeof(val));
+
+        return checked_cast<int>(curPos);
+    }
+
+    template <typename T>
+    void overWrite(int pos, const T& val) {
+        auto uPos = checked_cast<size_t>(pos);
+        std::copy_n(reinterpret_cast<const char*>(&val), sizeof(val), _data.data() + uPos);
+    }
+
+    // Overwrites the `uint32_t` value in `_data` at position `pos`
+    // with the size of the tail from `pos` to the end of `_data` (the value itself included).
+    void overWriteTailSize(int pos) {
+        auto uPos = checked_cast<size_t>(pos);
+        IE_ASSERT(uPos < _data.size());
+        auto size = checked_cast<uint32_t>(_data.size() - uPos);
+        std::copy_n(reinterpret_cast<const char*>(&size), sizeof(uint32_t), _data.data() + uPos);
+    }
+
+    int size() const { return checked_cast<int>(_data.size()); }
+
+    const char* data() const { return _data.data(); }
+
+private:
+    std::vector<char> _data;
+};
+
+}  // namespace vpu
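
[Review note] Hypothetical use of the placeholder-then-patch pattern described by the comments above; the values are arbitrary:

    vpu::BlobSerializer ser;
    int sizePos = ser.append<uint32_t>(0);  // reserve a placeholder and remember its position
    ser.append<uint32_t>(0xdeadbeef);       // payload
    ser.overWriteTailSize(sizePos);         // patches the placeholder with 8 (both fields counted)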
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/blob_reader.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/blob_reader.hpp
new file mode 100644 (file)
index 0000000..aed52e2
--- /dev/null
@@ -0,0 +1,54 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vector>
+#include <utility>
+
+#include <ie_input_info.hpp>
+#include <ie_icnn_network.hpp>
+
+#include <vpu/backend/blob_format.hpp>
+#include <vpu/model/data_desc.hpp>
+#include <vpu/graph_transformer.hpp>
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+class BlobReader {
+public:
+    BlobReader() = default;
+
+    void parse(const std::vector<char>& blob);
+
+    const ie::InputsDataMap& getNetworkInputs() const { return _networkInputs; }
+    const ie::OutputsDataMap& getNetworkOutputs() const { return _networkOutputs; }
+
+    uint32_t getStageCount() const { return _blobHeader.stages_count; }
+
+    uint32_t getMagicNumber() const { return _blobHeader.magic_number; }
+
+    uint32_t getVersionMajor() const { return _blobHeader.blob_ver_major; }
+    uint32_t getVersionMinor() const { return _blobHeader.blob_ver_minor; }
+
+    const DataInfo& getInputInfo()  const { return _inputInfo; }
+    const DataInfo& getOutputInfo() const { return _outputInfo; }
+
+    std::pair<const char*, size_t> getHeader() const { return {_pBlob, sizeof(ElfN_Ehdr) + sizeof(mv_blob_header)};}
+
+private:
+    const char* _pBlob = nullptr;
+
+    mv_blob_header _blobHeader = {};
+
+    ie::InputsDataMap  _networkInputs;
+    ie::OutputsDataMap _networkOutputs;
+
+    DataInfo _inputInfo;
+    DataInfo _outputInfo;
+};
+
+}  // namespace vpu
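
[Review note] A hypothetical consumer of the BlobReader declared above; blob is assumed to be a std::vector<char> holding a compiled graph:

    vpu::BlobReader reader;
    reader.parse(blob);
    uint32_t stages = reader.getStageCount();
    auto header = reader.getHeader();  // {start pointer, ELF + blob header size}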
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/compile_env.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/compile_env.hpp
new file mode 100644 (file)
index 0000000..a614878
--- /dev/null
@@ -0,0 +1,35 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/graph_transformer.hpp>
+#include <vpu/network_config.hpp>
+#include <vpu/model/model.hpp>
+#include <vpu/utils/logger.hpp>
+
+namespace vpu {
+
+struct CompileEnv final {
+    Platform platform = Platform::UNKNOWN;
+    Resources resources;
+
+    CompilationConfig config;
+    NetworkConfig netConfig;
+
+    Logger::Ptr log;
+
+    bool initialized = false;
+
+    static const CompileEnv& get();
+
+    static void init(
+            Platform platform,
+            const CompilationConfig& config,
+            const Logger::Ptr& log);
+    static void updateConfig(const CompilationConfig& config);
+    static void free();
+};
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/custom_layer.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/custom_layer.hpp
new file mode 100644 (file)
index 0000000..6276f26
--- /dev/null
@@ -0,0 +1,94 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <sstream>
+#include <vector>
+#include <map>
+#include <functional>
+
+#include <details/caseless.hpp>
+
+#include <pugixml.hpp>
+
+#include <vpu/utils/enums.hpp>
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+VPU_DECLARE_ENUM(CustomDataFormat,
+    BYXF = 0,  // HWC used in most software layers
+    BFYX = 1,  // CHW used if HW module is enabled
+    Any  = 2,  // doesn't really matter
+    None = 3
+)
+
+VPU_DECLARE_ENUM(CustomParamType,
+    Input,
+    Output,
+    Data,
+    Int,
+    Float
+)
+
+class CustomLayer final {
+public:
+    using Ptr = std::shared_ptr<CustomLayer>;
+
+    struct KernelParam final {
+        CustomParamType type = CustomParamType::Input;
+        CustomDataFormat format = CustomDataFormat::Any;
+        std::string argName;
+        int portIndex = -1;
+        std::string irSource;
+    };
+
+    static ie::details::caseless_map<std::string, CustomLayer::Ptr> loadFromFile(
+                const std::string& configFile,
+                bool canBeMissed = false);
+
+    const std::string& kernelBinary() const { return _kernelBinary; }
+
+    int kernelAddress(int idx = 1) const;
+
+    const std::vector<KernelParam>& bindings() const { return _kernelParams; }
+    const std::vector<std::string>& parameters() const { return _parameters; }
+
+    const std::vector<std::string>& globalSizeRules() const { return _globalSizeRules; }
+    const std::vector<std::string>& localSizeRules() const { return _localSizeRules; }
+
+    int inputDimSourceIndex() { return _wgDimInputIdx; }
+
+private:
+    explicit CustomLayer(const std::string& dirname) : _configDir(dirname) {}
+
+    void loadSingleLayer(const pugi::xml_node& node);
+    void processKernelNode(const pugi::xml_node& node);
+    void processParametersNode(const pugi::xml_node& node);
+    void processWorkSizesNode(const pugi::xml_node& node);
+
+    static bool isLegalSizeRule(const std::string& rule);
+    static CustomDataFormat formatFromString(const std::string& str);
+
+private:
+    std::string _configDir;
+    std::string _layerName;
+    std::string _kernelEntry;
+    std::string _kernelBinary;
+
+    std::vector<KernelParam> _kernelParams;
+    std::vector<std::string> _globalSizeRules;
+    std::vector<std::string> _localSizeRules;
+    std::vector<std::string> _parameters;
+
+    std::map<uint32_t, uint32_t, std::greater<uint32_t>> _kernelAddress;
+
+    int _wgDimInputIdx = 0;
+};
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/frontend/frontend.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/frontend/frontend.hpp
new file mode 100644 (file)
index 0000000..74ece88
--- /dev/null
@@ -0,0 +1,162 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+#include <tuple>
+#include <set>
+
+#include <cpp/ie_cnn_network.h>
+#include <details/caseless.hpp>
+
+#include <vpu/frontend/stage_builder.hpp>
+#include <vpu/frontend/parse_network.hpp>
+#include <vpu/model/model.hpp>
+#include <vpu/custom_layer.hpp>
+#include <vpu/utils/enums.hpp>
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+
+class FrontEnd final : public std::enable_shared_from_this<FrontEnd> {
+//
+// Public API
+//
+
+public:
+    using Ptr = std::shared_ptr<FrontEnd>;
+
+    explicit FrontEnd(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    Model::Ptr buildInitialModel(const ie::ICNNNetwork& network);
+
+    std::set<std::string> checkSupportedLayers(const ie::ICNNNetwork& network);
+
+    const std::vector<ie::CNNLayerPtr>& allLayers() const { return _ieNetworkParser.orderedLayers; }
+
+//
+// Passes
+//
+
+private:
+    Model::Ptr runCommonPasses(
+            const ie::ICNNNetwork& network,
+            LayersOrder order);
+
+    ie::CNNNetwork detectNetworkBatch(
+            const ie::ICNNNetwork& network,
+            const Model::Ptr& model);
+
+    void RemoveConstLayers(ie::ICNNNetwork& network);
+
+    void parseInputAndOutputData(const Model::Ptr& model);
+    void addDataTypeConvertStages(const Model::Ptr& model);
+    void addPreProcessStages(const Model::Ptr& model);
+
+    void eliminatePriorBoxData(const Model::Ptr& model);
+
+//
+// IR Parsers
+//
+
+public:
+    //
+    // Layers, that might be both SW and HW
+    //
+
+    void parseConvolution(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parsePooling(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseFullyConnected(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+
+    //
+    // SW only layers
+    //
+
+    void parseReLU(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseSoftMax(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseGRN(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseMVN(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseNorm(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parsePower(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseScale(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parsePermute(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseDetectionOutput(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseEltwise(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseSigmoid(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseTanH(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parsePReLU(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseBatchNorm(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseDeconvolution(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseCopy(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseELU(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseCrop(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseTile(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseNormalize(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseRegionYolo(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseReorgYolo(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseBias(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseCTCDecoder(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseInterp(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseClamp(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseProposal(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseROIPooling(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parsePSROIPooling(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseCustom(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseMTCNN(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseLSTMCell(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parsePad(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseResample(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseArgMax(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseRNN(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+
+    //
+    // Special layers
+    //
+
+    void parsePriorBox(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parsePriorBoxClustered(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseReshape(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseConcat(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
+    void parseSplit(const Model::Ptr& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs);
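+
+    // Editorial note (not part of the original header): the frontend typically
+    // dispatches on layer->type to one of the parse* methods above, e.g. via a
+    // table along the lines of
+    //
+    //     parsers["Convolution"] = &FrontEnd::parseConvolution;
+    //
+    // The exact dispatch mechanism and the enclosing class name `FrontEnd` are
+    // assumptions here.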
+
+//
+// Utility
+//
+
+private:
+    Data getVpuData(const ie::DataPtr& ieData);
+    void bindData(const Data& data, const ie::DataPtr& ieData);
+
+    void getInputAndOutputData(
+            const Model::Ptr& model,
+            const ie::CNNLayerPtr& layer,
+            DataVector& inputs,
+            DataVector& outputs);
+
+    std::tuple<Data, Data> getWeightsAndBiases(
+            const Model::Ptr& model,
+            const ie::CNNLayerPtr& layer);
+
+//
+// Internal state
+//
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+
+    std::unordered_set<ie::DataPtr> _unbatchedOutputs;
+    std::unordered_map<ie::DataPtr, Data> _ieToVpuMap;
+
+    ie::details::caseless_map<std::string, CustomLayer::Ptr> _customLayers;
+    vpu::IeNetworkParser _ieNetworkParser;
+};
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/frontend/parse_network.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/frontend/parse_network.hpp
new file mode 100644
index 0000000..127f887
--- /dev/null
@@ -0,0 +1,48 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+#include <tuple>
+#include <set>
+
+#include <cpp/ie_cnn_network.h>
+#include <details/caseless.hpp>
+
+#include <vpu/frontend/stage_builder.hpp>
+#include <vpu/model/model.hpp>
+#include <vpu/custom_layer.hpp>
+#include <vpu/utils/enums.hpp>
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+VPU_DECLARE_ENUM(LayersOrder,
+    DFS,
+    BFS)
+
+class IeNetworkParser final {
+//
+// Public API
+//
+public:
+    void clear();
+    void checkNetwork(const ie::CNNNetwork& network);
+
+    void parseNetworkBFS(const ie::CNNNetwork& network);
+    void parseNetworkDFS(const ie::CNNNetwork& network);
+
+    ie::InputsDataMap networkInputs;
+    ie::OutputsDataMap networkOutputs;
+    std::unordered_map<ie::DataPtr, ie::Blob::Ptr> constDatas;
+    std::vector<ie::CNNLayerPtr> orderedLayers;
+};
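+
+// Illustrative usage sketch (editorial note, not part of the original header);
+// assumes an ie::CNNNetwork `network` is already loaded:
+//
+//     IeNetworkParser parser;
+//     parser.clear();
+//     parser.checkNetwork(network);      // validate the network first
+//     parser.parseNetworkDFS(network);   // fills networkInputs/networkOutputs,
+//                                        // constDatas and orderedLayers
+//     for (const auto& layer : parser.orderedLayers) {
+//         // dispatch each ie::CNNLayerPtr to the matching parse* method
+//     }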
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/frontend/stage_builder.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/frontend/stage_builder.hpp
new file mode 100644
index 0000000..4240bd6
--- /dev/null
@@ -0,0 +1,187 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <ie_layers.h>
+
+#include <vpu/model/model.hpp>
+
+namespace vpu {
+
+class StageBuilder final : public std::enable_shared_from_this<StageBuilder> {
+public:
+    using Ptr = std::shared_ptr<StageBuilder>;
+
+    Stage createConvertStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const Data& input,
+            const Data& output,
+            StageType type,
+            float scale = 1.0f,
+            float bias = 0.0f);
+
+    Stage addSumStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            const Data& input0,
+            const Data& input1,
+            const Data& output);
+
+    Stage addBiasStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            const Data& input,
+            const Data& biases,
+            const Data& output);
+
+    Stage addScaleStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            const Data& input,
+            const Data& scales,
+            const Data& output);
+
+    Stage addCopyStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            const Data& input,
+            const Data& output);
+
+    Stage addPadStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            PadMode padMode,
+            float pad_value,
+            const DimValues& pads_begin,
+            const DimValues& pads_end,
+            const Data& input,
+            const Data& output);
+
+    Stage addNoneStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            const DataVector& inputs,
+            const DataVector& outputs);
+
+    Stage addPowerStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            float scale,
+            float power,
+            float bias,
+            const Data& input,
+            const Data& output);
+
+    Stage addReLUStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            float negativeSlope,
+            const Data& input,
+            const Data& output,
+            const Data& biases = nullptr);
+
+    Stage addReshapeStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            const Data& input,
+            const Data& output);
+
+    Stage addConcatStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            Dim axis,
+            const DataVector& inputs,
+            const Data& output);
+
+    Stage addConcatStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            const std::vector<DimValues>& offsets,
+            const DataVector& inputs,
+            const Data& output);
+
+    Stage addSplitStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            Dim axis,
+            const Data& input,
+            const DataVector& outputs);
+
+    Stage addSplitStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            const std::vector<DimValues>& offsets,
+            const Data& input,
+            const DataVector& outputs);
+
+    Stage addScalingStage(
+            const Model::Ptr& model,
+            const ie::CNNLayerPtr& origLayer,
+            float scale,
+            const Data& input,
+            const Data& output);
+
+    Stage addSwFullyConnectedStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            const Data& input,
+            const Data& weights,
+            const Data& biases,
+            Data output);
+
+    Stage addExpandStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            const Data& input,
+            const Data& output,
+            const DimValues& offset = DimValues());
+
+    Stage addShrinkStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            const Data& input,
+            const Data& output,
+            const DimValues& offset = DimValues());
+
+    Stage addSoftMaxStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            const Data& input,
+            const Data& output,
+            Dim axis);
+
+    Stage addClampStage(
+            const Model::Ptr& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            float min,
+            float max,
+            const Data& input,
+            const Data& output);
+};
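+
+// Illustrative sketch (editorial note, not part of the original header):
+// adding a ReLU stage through the builder, assuming `model`, `layer`,
+// `input` and `output` come from the surrounding frontend code:
+//
+//     StageBuilder::Ptr builder = std::make_shared<StageBuilder>();
+//     builder->addReLUStage(
+//             model, layer->name, layer,
+//             0.0f,            // negativeSlope == 0 -> plain ReLU
+//             input, output);  // optional biases default to nullptr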
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/graph_transformer.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/graph_transformer.hpp
new file mode 100644
index 0000000..a9338bf
--- /dev/null
@@ -0,0 +1,167 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <cstdint>
+
+#include <string>
+#include <vector>
+#include <memory>
+#include <unordered_map>
+#include <set>
+#include <utility>
+
+#include <ie_icnn_network.hpp>
+
+#include <vpu/utils/enums.hpp>
+#include <vpu/utils/perf_report.hpp>
+#include <vpu/utils/logger.hpp>
+#include <vpu/utils/optional.hpp>
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+//
+// CompilationConfig
+//
+
+VPU_DECLARE_ENUM(Platform,
+    UNKNOWN = 0,
+    MYRIAD_2 = 2450,
+    MYRIAD_X = 2480
+)
+
+// Must be synchronized with MvTensor
+VPU_DECLARE_ENUM(ExecutionMode,
+    AUTO = -1,
+    SINGLE = 0,
+    PARALLEL = 1
+)
+
+VPU_DECLARE_ENUM(ComputeLayout,
+    AUTO,
+    NCHW,
+    NHWC
+)
+
+struct CompilationConfig final {
+    //
+    // Main flags
+    //
+
+    int numSHAVEs = -1;
+    int numCMXSlices = -1;
+
+    bool hwOptimization = true;
+
+    bool hwAdaptiveMode = true;
+
+    bool ignoreIRStatistic = false;
+
+    std::string networkConfig;
+
+    std::string customLayers;
+
+    //
+    // Debug flags
+    //
+
+    ComputeLayout forceLayout = ComputeLayout::AUTO;
+
+    bool detectBatch = true;
+
+    bool allowFP32Models = false;
+
+    std::string hwWhiteList;
+    std::string hwBlackList;
+
+    std::string noneLayers;
+
+    bool ignoreUnknownLayers = false;
+
+    Optional<bool> copyOptimization;
+    Optional<bool> injectSwOps;
+    Optional<bool> packDataInCmx;
+
+    bool mergeHwPoolToConv = true;
+
+    int numberOfNodesInOneSubGraph = 1;
+
+    //
+    // Deprecated flags
+    //
+
+    float inputScale = 1.0f;
+    float inputBias = 0.0f;
+};
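+
+// Illustrative sketch (editorial note, not part of the original header): a
+// config tuned for HW-accelerated execution; the values shown are assumptions
+// for illustration, not recommended defaults:
+//
+//     CompilationConfig config;
+//     config.numSHAVEs = 8;          // -1 keeps the automatic choice
+//     config.numCMXSlices = 8;
+//     config.hwOptimization = true;  // already the default
+//     config.customLayers = "/path/to/custom_layers.xml";  // hypothetical path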
+
+
+//
+// DataInfo
+//
+
+struct DataInfo final {
+    std::unordered_map<std::string, int> offset;
+    int totalSize = 0;
+};
+
+//
+// CompiledGraph
+//
+
+struct CompiledGraph final {
+    using Ptr = std::shared_ptr<CompiledGraph>;
+
+    std::vector<char> blob;
+    std::pair<char*, size_t> blobHeader;
+
+    std::string networkName;
+
+    int networkBatch = 0;
+
+    std::vector<StageMetaInfo> stagesMeta;
+    int numActiveStages = 0;
+
+    DataInfo inputInfo;
+    DataInfo outputInfo;
+
+    int inputBufSize = 0;
+    int outputBufSize = 0;
+};
+
+//
+// compileNetwork
+//
+
+CompiledGraph::Ptr compileNetwork(
+        const ie::ICNNNetwork& network,
+        Platform platform,
+        const CompilationConfig& config,
+        const Logger::Ptr& log);
+
+CompiledGraph::Ptr compileSubNetwork(
+        const ie::ICNNNetwork& network,
+        const CompilationConfig& subConfig);
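+
+// Illustrative call sketch (editorial note, not part of the original header),
+// assuming an ie::ICNNNetwork `network` and a Logger::Ptr `log` are available:
+//
+//     CompilationConfig config;
+//     auto compiled = compileNetwork(network, Platform::MYRIAD_X, config, log);
+//     // compiled->blob now holds the serialized graph;
+//     // compiled->inputInfo/outputInfo describe the I/O buffer layout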
+
+//
+// getSupportedLayers
+//
+
+std::set<std::string> getSupportedLayers(
+        const ie::ICNNNetwork& network,
+        Platform platform,
+        const CompilationConfig& config,
+        const Logger::Ptr& log);
+
+//
+// Blob version and checks
+//
+
+const uint32_t BLOB_MAGIC_NUMBER  = 9709;
+const uint32_t BLOB_VERSION_MAJOR = 4;
+const uint32_t BLOB_VERSION_MINOR = 0;
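+
+// Illustrative check sketch (editorial note, not part of the original header):
+// a loader would typically validate the blob header against these constants;
+// the exact header layout is an assumption here:
+//
+//     if (magic != BLOB_MAGIC_NUMBER || versionMajor != BLOB_VERSION_MAJOR) {
+//         THROW_IE_EXCEPTION << "Incompatible blob";
+//     }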
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/hw/mx_stage.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/hw/mx_stage.hpp
new file mode 100644
index 0000000..e5c12df
--- /dev/null
@@ -0,0 +1,34 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/stage.hpp>
+
+namespace vpu {
+
+class MyriadXHwStage final : public StageNode {
+protected:
+    StagePtr cloneImpl() const override;
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override;
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override;
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override;
+
+    void finalizeDataLayoutImpl() override;
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override;
+
+    void finalCheckImpl() const override;
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override;
+
+    void serializeDataImpl(BlobSerializer& serializer) const override;
+};
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/hw/tiling.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/hw/tiling.hpp
new file mode 100644
index 0000000..3f5d978
--- /dev/null
@@ -0,0 +1,244 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vector>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <array>
+#include <limits>
+
+#include <vpu/backend/blob_format.hpp>
+#include <vpu/model/data.hpp>
+#include <vpu/hw/utility.hpp>
+#include <vpu/utils/io.hpp>
+#include <vpu/utils/dot_io.hpp>
+
+namespace vpu {
+
+//
+// Common constants
+//
+
+const HwDataMode CNN_DATA_TYPE = HwDataMode::FP16;
+const HwCoeffMode CNN_COEFF_TYPE = HwCoeffMode::FP16;
+
+const std::array<int, 2> CNN_COEFF_PER_WORD_VALUES{1, 2};
+const std::array<int, 2> CNN_BYTES_PER_PIXEL{2, 1};
+
+const std::array<HwOpMode, 5> CNN_MODES{HwOpMode::MODE_1_256, HwOpMode::MODE_2_128, HwOpMode::MODE_4_64, HwOpMode::MODE_8_32, HwOpMode::MODE_16_16};
+const std::array<int, 5> CNN_MODES_COST{0, 5, 11, 19, 31};
+
+const int CNN_MAX_INPUT_WIDTH = 4096;
+const int CNN_MAX_INPUT_HEIGHT = 4096;
+const int CNN_MAX_INPUT_CHANNELS = 2048;
+const int CNN_MAX_OUTPUT_CHANNELS = 2048;
+
+const int CNN_MAX_BYTES = 128 * 1024;
+const int CNN_MAX_CHANNELS_PER_BLOCK = 2048;
+const int CNN_MAX_COEFF_PER_BLOCK = 256;
+
+//
+// Tiling scheme
+//
+
+struct HwPlaneTileInfo final {
+    int inputWithJunk = 0, outputWithJunk = 0;
+    int outputJunkBefore = 0, outputJunkAfter = 0;
+    int inputStartIndex = 0, inputEndIndex = 0;
+    int outputStartIndex = 0, outputEndIndex = 0;
+};
+
+template <class Tiles> struct HwChannelTile;
+template <class Tiles> using HwChannelTilePtr = std::shared_ptr<HwChannelTile<Tiles>>;
+
+template <class Tiles> struct HwPlaneTile;
+template <class Tiles> using HwPlaneTilePtr = std::shared_ptr<HwPlaneTile<Tiles>>;
+template <class Tiles> using HwPlaneTileWeakPtr = std::weak_ptr<HwPlaneTile<Tiles>>;
+
+template <class Tiles> struct HwTiling;
+template <class Tiles> using HwTilingPtr = std::shared_ptr<HwTiling<Tiles>>;
+template <class Tiles> using HwTilingWeakPtr = std::weak_ptr<HwTiling<Tiles>>;
+
+template <class Tiles>
+struct HwChannelTile final {
+    HwPlaneTileWeakPtr<Tiles> parent;
+
+    int socInd = 0;
+
+    int channelStartIndex = 0;
+    int numInputChannels = 0;
+
+    int extendedInputDimC = 0;
+    int extendedOutputDimC = 0;
+
+    Tiles finalTiles;
+};
+
+template <class Tiles>
+struct HwPlaneTile final {
+    HwTilingWeakPtr<Tiles> parent;
+
+    int sohInd = 0;
+    int sowInd = 0;
+
+    HwPlaneTileInfo heightInfo = {};
+    HwPlaneTileInfo widthInfo = {};
+
+    std::vector<HwChannelTilePtr<Tiles>> channelTiles;
+};
+
+template <class Tiles>
+struct HwTiling final {
+    int sohTiles = 0;
+    int sowTiles = 0;
+    int socTiles = 0;
+
+    std::vector<HwPlaneTilePtr<Tiles>> planeTiles;
+};
+
+template <class Tiles>
+void printTo(std::ostream& os, const HwTilingPtr<Tiles>& tiling) {
+    os << "[" << std::endl;
+    os << "sohTiles=" << tiling->sohTiles << std::endl;
+    os << "sowTiles=" << tiling->sowTiles << std::endl;
+    os << "socTiles=" << tiling->socTiles << std::endl;
+    os << "]";
+}
+
+template <class Tiles>
+void printTo(DotLabel& lbl, const HwTilingPtr<Tiles>& tiling) {
+    DotLabel subLbl(lbl);
+    subLbl.appendPair("sohTiles", tiling->sohTiles);
+    subLbl.appendPair("sowTiles", tiling->sowTiles);
+    subLbl.appendPair("socTiles", tiling->socTiles);
+}
+
+template <class Tiles>
+std::string getChannelTilePostfix(const HwChannelTilePtr<Tiles>& channelTile) {
+    auto planeTile = channelTile->parent.lock();
+    IE_ASSERT(planeTile != nullptr);
+
+    auto tiling = planeTile->parent.lock();
+    IE_ASSERT(tiling != nullptr);
+
+    std::ostringstream ostr;
+
+    if (tiling->socTiles > 1)
+        ostr << "@soc=" << channelTile->socInd + 1 << "/" << tiling->socTiles;
+
+    return ostr.str();
+}
+
+template <class Tiles>
+std::string getPlaneTilePostfix(const HwPlaneTilePtr<Tiles>& planeTile) {
+    auto tiling = planeTile->parent.lock();
+    IE_ASSERT(tiling != nullptr);
+
+    std::ostringstream ostr;
+
+    if (tiling->sohTiles > 1)
+        ostr << "@soh=" << planeTile->sohInd + 1 << "/" << tiling->sohTiles;
+    if (tiling->sowTiles > 1)
+        ostr << "@sow=" << planeTile->sowInd + 1 << "/" << tiling->sowTiles;
+
+    return ostr.str();
+}
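+
+// Example (editorial note): for a 2x2 height/width split, the second plane
+// tile in each direction gets the postfix "@soh=2/2@sow=2/2"; such postfixes
+// are presumably appended to stage/data names for debugging output.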
+
+struct HwConvTileInfo final {
+    HwOpMode mode = HwOpMode::MODE_1_256;
+    int numDescr = 0;
+    int outChansPerDescr = 0;
+    int lastOutChans = 0;
+    int extendedInputDimC = 0;
+    int extendedOutputDimC = 0;
+    double cost = std::numeric_limits<double>::max();
+};
+
+void printTo(std::ostream& os, const HwConvTileInfo& convTiles);
+void printTo(DotLabel& lbl, const HwConvTileInfo& convTiles);
+
+using HwConvChannelTile = HwChannelTile<HwConvTileInfo>;
+using HwConvChannelTilePtr = HwChannelTilePtr<HwConvTileInfo>;
+using HwConvPlaneTile = HwPlaneTile<HwConvTileInfo>;
+using HwConvPlaneTilePtr = HwPlaneTilePtr<HwConvTileInfo>;
+using HwConvTiling = HwTiling<HwConvTileInfo>;
+using HwConvTilingPtr = HwTilingPtr<HwConvTileInfo>;
+
+struct HwPoolTileInfo final {
+    HwOpMode mode = HwOpMode::MODE_1_256;
+    int numDescr = 0;
+    int chansPerDescr = 0;
+};
+
+void printTo(std::ostream& os, const HwPoolTileInfo& poolTiles);
+void printTo(DotLabel& lbl, const HwPoolTileInfo& poolTiles);
+
+using HwPoolChannelTile = HwChannelTile<HwPoolTileInfo>;
+using HwPoolChannelTilePtr = HwChannelTilePtr<HwPoolTileInfo>;
+using HwPoolPlaneTile = HwPlaneTile<HwPoolTileInfo>;
+using HwPoolPlaneTilePtr = HwPlaneTilePtr<HwPoolTileInfo>;
+using HwPoolTiling = HwTiling<HwPoolTileInfo>;
+using HwPoolTilingPtr = HwTilingPtr<HwPoolTileInfo>;
+
+struct HwFullyConnectedTileInfo final {
+    HwOpMode mode = HwOpMode::MODE_1_256;
+    int numOutTiles = 0;
+    int numInSubTiles = 0;
+    int workInN = 0;
+    int workOutN = 0;
+};
+
+void printTo(std::ostream& os, const HwFullyConnectedTileInfo& fcTiles);
+void printTo(DotLabel& lbl, const HwFullyConnectedTileInfo& fcTiles);
+
+using HwFullyConnectedChannelTile = HwChannelTile<HwFullyConnectedTileInfo>;
+using HwFullyConnectedChannelTilePtr = HwChannelTilePtr<HwFullyConnectedTileInfo>;
+using HwFullyConnectedPlaneTile = HwPlaneTile<HwFullyConnectedTileInfo>;
+using HwFullyConnectedPlaneTilePtr = HwPlaneTilePtr<HwFullyConnectedTileInfo>;
+using HwFullyConnectedTiling = HwTiling<HwFullyConnectedTileInfo>;
+using HwFullyConnectedTilingPtr = HwTilingPtr<HwFullyConnectedTileInfo>;
+
+//
+// Input<->Output tile calculation
+//
+
+int calcOutputSize(
+        int inputSize,
+        int kernelSize, int kernelStride,
+        int padBefore, int padAfter,
+        bool useCeil);
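+
+// For reference (editorial note), this is the usual convolution/pooling
+// output-size computation, with `useCeil` presumably selecting ceiling
+// instead of floor division:
+//
+//     outputSize = (inputSize + padBefore + padAfter - kernelSize) / kernelStride + 1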
+
+//
+// Plane tiles calculation.
+//
+
+std::vector<HwPlaneTileInfo> splitIntoPlaneTilesWithPool(
+        int inputSize,
+        int kernelSize, int kernelStride,
+        int pad,
+        int maxOutputSize);
+
+std::vector<HwPlaneTileInfo> splitIntoPlaneTiles(
+        int inputSize, int outputSize,
+        int kernelSize, int kernelStride,
+        int padBefore, int padAfter,
+        int maxOutputSize,
+        bool alignInputTile,
+        bool useCeil);
+
+//
+// HW Convolution tiling over output channels.
+//
+
+// This function tries to split the output over channels.
+HwConvTileInfo splitHwConvIntoOutChannelsTiles(
+        int inTileWidth, int inTileHeight, int inTileChannels,
+        int outTileChannels,
+        int kernelSizeX, int kernelSizeY,
+        int kernelStride);
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/hw/utility.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/hw/utility.hpp
new file mode 100644
index 0000000..a651efb
--- /dev/null
@@ -0,0 +1,161 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <iostream>
+
+#include <vpu/model/data.hpp>
+#include <vpu/backend/blob_format.hpp>
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+//
+// HW Operation parameters
+//
+
+VPU_DECLARE_ENUM(HwOpType,
+    CONV = 0,
+    CONV_POOL = 1,
+    FC = 2,
+    POOL = 4,
+)
+
+VPU_DECLARE_ENUM(HwPoolType,
+    MAX = 0,
+    AVERAGE = 1,
+);
+
+VPU_DECLARE_ENUM(HwOpMode,
+    MODE_1_256 = 0,
+    MODE_2_128 = 1,
+    MODE_4_64 = 2,
+    MODE_8_32 = 3,
+    MODE_16_16 = 4,
+);
+
+VPU_DECLARE_ENUM(HwPadMode,
+    PAD_WITH_ZEROS = 0x00,
+    PAD_REPEAT_RIGHT_EDGE = 0x01,
+    PAD_REPEAT_LEFT_EDGE = 0x08,
+    PAD_REPEAT_TOP_EDGE = 0x04,
+    PAD_REPEAT_BOTTOM_EDGE = 0x02,
+);
+
+inline HwPadMode operator|(HwPadMode m1, HwPadMode m2) {
+    return static_cast<HwPadMode>(static_cast<int32_t>(m1) | static_cast<int32_t>(m2));
+}
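+
+// Example (editorial note): the pad modes are bit flags, so edge-repeat
+// modes can be combined:
+//
+//     auto pad = HwPadMode::PAD_REPEAT_LEFT_EDGE | HwPadMode::PAD_REPEAT_RIGHT_EDGE;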
+
+VPU_DECLARE_ENUM(HwCoeffMode,
+    FP16 = 0,
+    U8F = 1,
+);
+
+VPU_DECLARE_ENUM(HwDataMode,
+    FP16 = 0,
+    U8F = 1,
+);
+
+struct HwOpParams final {
+    HwOpType opType = HwOpType::CONV;
+    HwOpMode opMode = HwOpMode::MODE_1_256;
+
+    HwPoolType poolType = HwPoolType::MAX;
+
+    bool withPad = false;
+    HwPadMode padMode = HwPadMode::PAD_WITH_ZEROS;
+
+    int32_t inputInd = -1;
+    int32_t outputInd = -1;
+    int32_t coeffsInd = -1;
+    int32_t biasesInd = -1;
+    int32_t scalesInd = -1;
+
+    uint32_t outChanOffset = 0;
+    uint32_t outNumChans = 0;
+
+    uint32_t fcInputOffset = 0;
+    uint32_t fcInputNum = 0;
+    uint32_t fcOutputOffset = 0;
+    uint32_t fcOutputNum = 0;
+    bool fcAccum = false;
+
+    uint32_t kernelWidth = 0;
+    uint32_t kernelHeight = 0;
+    uint32_t kernelStride = 0;
+
+    uint32_t poolKernelWidth = 0;
+    uint32_t poolKernelHeight = 0;
+
+    bool withReLU = false;
+    uint32_t t0 = 0;
+    uint32_t a0 = 0;
+    uint32_t a1 = 0;
+
+    bool withClamp = false;
+    float clampMaxVal = 0;
+
+    bool reuseData = false;
+    bool reuseCoeff = false;
+};
+
+struct HwOpList final {
+    std::vector<HwOpParams> vec;
+};
+
+void printTo(std::ostream& os, const HwOpList& hwOps);
+void printTo(DotLabel& lbl, const HwOpList& hwOps);
+
+//
+// HwPaddingInfo
+//
+
+struct HwPaddingInfo final {
+    bool enable = false;
+    int left = 0;
+    int right = 0;
+    int top = 0;
+    int bottom = 0;
+};
+
+HwPaddingInfo getHwPaddingInfo(
+        const DimValues& inDims, const DimValues& outDims,
+        int kernelDimX, int kernelDimY,
+        int kernelStrideX, int kernelStrideY);
+
+void printTo(std::ostream& os, const HwPaddingInfo& hwPad);
+void printTo(DotLabel& lbl, const HwPaddingInfo& hwPad);
+
+
+//
+// HwWeightsContent
+//
+
+class HwWeightsContent final : public CalculatedDataContent {
+public:
+    HwWeightsContent(
+            const DataContent::Ptr& origContent,
+            const DataDesc& origWeightsDesc,
+            int numInputChannels,
+            int channelStartIndex = 0);
+
+protected:
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override;
+
+private:
+    DataDesc _origWeightsDesc;
+    int _numInputChannels = 0;
+    int _channelStartIndex = 0;
+};
+
+//
+// calculateHwBufferSize
+//
+
+int calculateHwBufferSize(const DimValues& dims, DimsOrder order = DimsOrder());
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/base.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/base.hpp
new file mode 100644
index 0000000..b0387cf
--- /dev/null
@@ -0,0 +1,93 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include <list>
+#include <queue>
+#include <stack>
+
+#include <vpu/utils/extra.hpp>
+#include <vpu/utils/enums.hpp>
+#include <vpu/utils/handle.hpp>
+#include <vpu/utils/attributes_map.hpp>
+#include <vpu/utils/range.hpp>
+
+namespace vpu {
+
+//
+// Utility macros
+//
+
+#define VPU_DEFINE_MODEL_TYPES(type, postfix)                                                       \
+    using type = Handle<VPU_COMBINE(type, postfix)>;                                                \
+    \
+    using VPU_COMBINE(type, Vector) = std::vector<type>;                                            \
+    \
+    using VPU_COMBINE(type, List) = IntrusivePtrList<VPU_COMBINE(type, postfix)>;                   \
+    \
+    using VPU_COMBINE(type, Set) = std::unordered_set<type, HandleHash>;                            \
+    \
+    template <typename Val>                                                                         \
+    using VPU_COMBINE(type, Map) = std::unordered_map<type, Val, HandleHash>;                       \
+    \
+    using VPU_COMBINE(type, Ptr) = std::shared_ptr<VPU_COMBINE(type, postfix)>;                     \
+    \
+    using VPU_COMBINE(type, PtrList) = std::list<VPU_COMBINE(type, Ptr)>;
+
+//
+// VPU_MODEL_ATTRIBUTE
+//
+
+#define VPU_MODEL_ATTRIBUTE(type, name, defVal)                                 \
+    protected:                                                                  \
+        type VPU_COMBINE(_, name) = defVal;                                     \
+    public:                                                                     \
+        inline const type& name() const {                                       \
+            return VPU_COMBINE(_, name);                                        \
+        }
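+
+// For example (editorial note): VPU_MODEL_ATTRIBUTE(int, portInd, -1) expands
+// to a protected member `_portInd` initialized to -1 plus a public inline
+// accessor `portInd()` returning a const reference.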
+
+#define VPU_MODEL_ATTRIBUTE_PTR_RANGE(type, name)                               \
+    protected:                                                                  \
+        type VPU_COMBINE(_, name);                                              \
+    public:                                                                     \
+        inline auto name() const -> decltype(contRange(VPU_COMBINE(_, name))) { \
+            return contRange(VPU_COMBINE(_, name));                             \
+        }
+
+//
+// Forward declaration
+//
+
+class GraphTransformerImpl;
+
+class Model;
+using ModelPtr = std::shared_ptr<Model>;
+
+class DataNode;
+VPU_DEFINE_MODEL_TYPES(Data, Node)
+
+class StageNode;
+VPU_DEFINE_MODEL_TYPES(Stage, Node)
+
+class StageInputEdge;
+VPU_DEFINE_MODEL_TYPES(StageInput, Edge)
+
+class StageOutputEdge;
+VPU_DEFINE_MODEL_TYPES(StageOutput, Edge)
+
+class StageTempBufferEdge;
+VPU_DEFINE_MODEL_TYPES(StageTempBuffer, Edge)
+
+class SharedAllocationEdge;
+VPU_DEFINE_MODEL_TYPES(SharedAllocation, Edge)
+
+class InjectedStageEdge;
+VPU_DEFINE_MODEL_TYPES(InjectedStage, Edge)
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data.hpp
new file mode 100644
index 0000000..36709e5
--- /dev/null
@@ -0,0 +1,354 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <functional>
+#include <vector>
+
+#include <ie_data.h>
+#include <ie_blob.h>
+
+#include <vpu/model/base.hpp>
+#include <vpu/model/edges.hpp>
+#include <vpu/model/data_desc.hpp>
+#include <vpu/backend/blob_serializer.hpp>
+#include <vpu/utils/enums.hpp>
+#include <vpu/utils/func_ref.hpp>
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+//
+// DataUsage
+//
+
+//
+// Describes how a Data object is used in the Model:
+//   * Input / Output : network input or output.
+//   * Const : constant values (weights, biases, etc.).
+//   * Intermediate : Data objects that hold intermediate results.
+//   * Temp : temporary buffer.
+//   * Fake : fake Data object used to fill a stage input/output port.
+//
+
+VPU_DECLARE_ENUM(DataUsage,
+    Input,
+    Output,
+    Const,
+    Intermediate,
+    Temp,
+    Fake
+)
+
+//
+// DataLocation
+//
+
+//
+// Describes where a Data object is located.
+//
+
+// Must be synchronized with MvTensor
+VPU_DECLARE_ENUM(DataLocation,
+    None = 0,
+    Input = 1,
+    Output = 2,
+    Blob = 3,
+    BSS = 4,
+    CMX = 5
+)
+
+VPU_DECLARE_ENUM(MemoryType,
+    DDR,
+    CMX)
+
+//
+// DataContent
+//
+
+//
+// Content of the Const Data object.
+//
+
+class DataContent {
+public:
+    using Ptr = std::shared_ptr<DataContent>;
+
+    virtual ~DataContent() = default;
+
+    // TYPED pointer
+    template <typename T>
+    const T* get() const {
+        return static_cast<const T*>(getRaw());
+    }
+
+    const DataDesc& desc() const { return _desc; }
+
+protected:
+    // RAW pointer
+    virtual const void* getRaw() const = 0;
+
+protected:
+    DataDesc _desc;
+    friend class Model;
+};
+
+//
+// Data content that is calculated on the fly, using lazy calculation:
+//
+//   * It performs the calculation on the first call and stores the result in an internal buffer.
+//   * Subsequent accesses return a pointer to the calculated buffer.
+//
+class CalculatedDataContent : public DataContent {
+public:
+    CalculatedDataContent() = default;
+    explicit CalculatedDataContent(std::initializer_list<DataContent::Ptr> baseContents) : _baseContents(baseContents) {}
+
+protected:
+    const void* getRaw() const override;
+
+    virtual size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>& baseContents) const;
+    virtual void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const = 0;
+
+private:
+    mutable SmallVector<DataContent::Ptr, 2> _baseContents;
+    mutable std::vector<uint8_t> _temp;
+};
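+
+// Illustrative subclass sketch (editorial note, not part of the original
+// header): a content that transforms a base content on first access, in the
+// same spirit as the scaleContent factory below; `fp16_t` stands in for the
+// actual half-precision type used by the implementation:
+//
+//     class ScaledContent final : public CalculatedDataContent {
+//     public:
+//         explicit ScaledContent(const DataContent::Ptr& base) :
+//                 CalculatedDataContent({base}) {}
+//     protected:
+//         void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& bases, void* tempBuf) const override {
+//             // read bases[0]->get<fp16_t>(), write scaled values into tempBuf
+//         }
+//     };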
+
+DataContent::Ptr ieBlobContent(
+        const ie::Blob::Ptr& blob,
+        int repeat = 1);
+
+DataContent::Ptr replicateContent(
+        float val,
+        int count);
+
+DataContent::Ptr replicateContent(
+        const DataContent::Ptr& origContent,
+        int count);
+
+DataContent::Ptr scaleContent(
+        const DataContent::Ptr& origContent,
+        float scale);
+
+//
+// DataNode
+//
+
+class DataNode final :
+        public EnableHandleFromThis<DataNode>,
+        public EnableCustomAttributes {
+    //
+    // Main attributes
+    //
+
+    VPU_MODEL_ATTRIBUTE(std::string, name, std::string())
+    VPU_MODEL_ATTRIBUTE(DataUsage, usage, DataUsage::Fake)
+    VPU_MODEL_ATTRIBUTE(DataDesc, desc, DataDesc())
+    VPU_MODEL_ATTRIBUTE(StridesRequirement, requiredStrides, StridesRequirement::empty())
+
+    //
+    // Bindings with IE
+    //
+
+    VPU_MODEL_ATTRIBUTE(ie::DataPtr, origData, nullptr)
+
+    //
+    // Edges
+    //
+
+    VPU_MODEL_ATTRIBUTE(StageOutput, producerEdge, nullptr)
+    VPU_MODEL_ATTRIBUTE_PTR_RANGE(StageInputList, consumerEdges)
+
+    VPU_MODEL_ATTRIBUTE(StageTempBuffer, tempBufferEdge, nullptr)
+
+    /**
+     * The parent data edge actually allocates the memory
+     */
+    VPU_MODEL_ATTRIBUTE(SharedAllocation, parentDataEdge, nullptr)
+
+    /**
+     * Child data edges reuse the parent's memory
+     */
+    VPU_MODEL_ATTRIBUTE_PTR_RANGE(SharedAllocationList, childDataEdges)
+
+    //
+    // Const data content
+    //
+
+    VPU_MODEL_ATTRIBUTE(DataContent::Ptr, content, nullptr)
+
+    //
+    // Allocation info
+    //
+
+    VPU_MODEL_ATTRIBUTE(MemoryType, memReqs, MemoryType::DDR)
+    VPU_MODEL_ATTRIBUTE(DataLocation, location, DataLocation::None)
+    VPU_MODEL_ATTRIBUTE(int, memoryOffset, 0)
+
+    //
+    // Edges wrappers
+    //
+
+private:
+    struct ConsumerAccess final {
+        inline auto operator()(const StageInput& edge) const -> decltype(edge->consumer()) {
+            return edge->consumer();
+        }
+    };
+
+    struct ChildDataAccess final {
+        inline auto operator()(const SharedAllocation& edge) const -> decltype(edge->child()) {
+            return edge->child();
+        }
+    };
+
+public:
+    inline Stage producer() const {
+        return _producerEdge == nullptr ? nullptr : _producerEdge->producer();
+    }
+
+    inline int numConsumers() const {
+        return _consumerEdges.size();
+    }
+    inline auto consumers() const -> decltype(mapRange<ConsumerAccess>(consumerEdges())) {
+        return mapRange<ConsumerAccess>(consumerEdges());
+    }
+    inline StageInput singleConsumerEdge() const {
+        IE_ASSERT(_consumerEdges.size() == 1);
+        return *_consumerEdges.begin();
+    }
+    inline Stage singleConsumer() const {
+        return singleConsumerEdge()->consumer();
+    }
+
+    inline Data parentData() const {
+        return _parentDataEdge == nullptr ? nullptr : _parentDataEdge->parent();
+    }
+
+    inline int numChildDatas() const {
+        return _childDataEdges.size();
+    }
+    inline auto childDatas() const -> decltype(mapRange<ChildDataAccess>(childDataEdges())) {
+        return mapRange<ChildDataAccess>(childDataEdges());
+    }
+
+    Data getTopParentData() const;
+
+    //
+    // DataDesc
+    //
+
+    DimValues strides() const;
+
+    int totalByteSize() const;
+
+    int elemOffset(const DimValues& coord) const;
+    int lastElemOffset() const;
+
+    //
+    // Bindings with IE
+    //
+
+    inline void setOrigData(const ie::DataPtr& origData) { _origData = origData; }
+
+    //
+    // StridesRequirement
+    //
+
+    bool checkStrides(const StridesRequirement& reqs) const;
+
+    inline void resetRequiredStrides() { _requiredStrides = StridesRequirement::empty(); }
+    void updateRequiredStrides(const StridesRequirement& newReqs);
+
+    //
+    // Allocation info
+    //
+
+    void clearAllocation();
+
+    void setMemReqs(MemoryType mem);
+
+    void setIOInfo(DataLocation location, int ioBufferOffset);
+
+    void setAllocationInfo(DataLocation location, int memoryOffset);
+
+    //
+    // Backend utilities
+    //
+
+    // Serialize as-is for new MvTensor kernels that can work with ND data.
+// If `newOrder` is not empty, it is used instead of the original order, and missing dimensions are set to 1.
+    void serializeNewBuffer(
+            BlobSerializer& serializer,
+            DimsOrder newOrder = DimsOrder());
+
+    // Serialize for deprecated MvTensor kernels that can work only with 3D data.
+    //
+    // `dimsReloc` is a map from new dims to original dims.
+// An empty record means the new dim is set to 1 and the previous stride is reused.
+    // For example :
+    //   * Original order : NC
+    //   * `newOrder` : HWC
+    //   * `dimsReloc` : {(C -> C), {H -> N}}
+    // The Data will be serialized as HWC with
+    //   * newDims[H] == origDims[N]
+    //   * newDims[W] == 1
+    //   * newDims[C] == origDims[C]
+// If there are several original dims per new dim, they are multiplied
+// (assuming the original dims are adjacent with no strides between them).
+    void serializeOldBuffer(
+            const Stage& stage,
+            BlobSerializer& serializer,
+            DimsOrder newOrder = DimsOrder(),
+            const EnumMap<Dim, std::vector<Dim>>& dimsReloc = EnumMap<Dim, std::vector<Dim>>());
+
+    void serializeIOInfo(BlobSerializer& serializer) const;
+
+private:
+    void serializeDescImpl(
+            BlobSerializer& serializer,
+            const DataDesc& storedDesc,
+            const DimValues& storedStrides) const;
+
+    void serializeBufferImpl(
+            BlobSerializer& serializer,
+            const DataDesc& storedDesc,
+            const DimValues& storedStrides) const;
+
+private:
+    inline DataNode() :
+        _consumerEdges(&StageInputEdge::_posInData),
+        _childDataEdges(&SharedAllocationEdge::_posInData),
+        _posInModel(this) {
+    }
+
+private:
+    Handle<Model> _model;
+    DataPtrList::iterator _ptrPosInModel;
+    IntrusivePtrListNode<DataNode> _posInModel;
+
+    friend class Model;
+};
+
+void printTo(std::ostream& os, const Data& data);
+
+//
+// loopOverData
+//
+
+VPU_DECLARE_ENUM(DataLoopStatus,
+    NextChild,
+    NextSibling,
+    Stop)
+
+void loopOverData(
+        const Data& data,
+        const FuncRef<DataLoopStatus(const Data&)>& op);
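+
+// Illustrative sketch (editorial note, not part of the original header):
+// walking a data tree, stopping descent below CMX-located children:
+//
+//     loopOverData(rootData, [](const Data& d) {
+//         if (d->location() == DataLocation::CMX)
+//             return DataLoopStatus::NextSibling;
+//         return DataLoopStatus::NextChild;
+//     });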
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_desc.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_desc.hpp
new file mode 100644
index 0000000..3f82e8a
--- /dev/null
@@ -0,0 +1,610 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+#include <limits>
+#include <map>
+#include <type_traits>
+#include <functional>
+#include <utility>
+
+#include <ie_layouts.h>
+
+#include <vpu/model/base.hpp>
+#include <vpu/utils/enums.hpp>
+#include <vpu/utils/io.hpp>
+#include <vpu/utils/dot_io.hpp>
+#include <vpu/utils/checked_cast.hpp>
+
+//
+// Description (type, layout, dimensions, strides) for Data objects inside the VPU Model.
+//
+// The VPU Model uses its own representation of Data layout and dimensions.
+// The dimensions are stored in a special container in memory-independent order.
+// Each dimension has a unique name, which can be represented as an index (e.g. `width` : 0, `height` : 1, etc.).
+// The DimsOrder parameter provides information about the actual layout in memory.
+// During Fathom Blob serialization, the VPU Graph Transformer converts the dimensions from
+// memory-independent order to memory order, from minor to major dimension.
+//
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+//
+// DataType
+//
+
+// Must be synchronized with MvTensor
+VPU_DECLARE_ENUM(DataType,
+    FP16 = 0,
+    U8 = 1,
+//     S32 = 2,  // TODO: remove from MvTensor
+    FP32 = 3
+)
+
+//
+// Dim
+//
+
+//
+// Named dimensions for better readability.
+//
+
+VPU_DECLARE_ENUM(Dim,
+    Invalid = -1,
+    W = 0,
+    H = 1,
+    C = 2,
+    N = 3,
+    _5 = 4,
+    _6 = 5,
+    _7 = 6,
+    _8 = 7
+)
+
+//
+// StorageOrder
+//
+
+//
+// Types that are used to store order permutation in packed format.
+//
+
+using StorageOrder64 = uint64_t;
+using StorageOrder32 = uint32_t;
+
+// High-order digit excluded.
+const int MAX_DIMS_64 = std::numeric_limits<StorageOrder64>::digits / 4 - 1;
+
+const int MAX_DIMS_32 = std::numeric_limits<StorageOrder32>::digits / 4;
+
+//
+// DimValues
+//
+
+//
+// Container to store dimension values (sizes, offsets, strides).
+// Internally it is a map from Dim to `int`.
+// Should be used together with DimsOrder to get the permutation array.
+//
+
+template <typename T>
+class DimValues_ final {
+    static_assert(std::is_trivial<T>::value, "std::is_trivial<T>::value");
+
+    using ValuesCont = std::array<std::pair<Dim, T>, MAX_DIMS_64>;
+    using FlagsCont = std::array<bool, MAX_DIMS_64>;
+
+public:
+    template <bool IsConst>
+    class Iterator final {
+    public:
+        using ValuesContInner = typename std::conditional<IsConst, const ValuesCont, ValuesCont>::type;
+        using FlagsContInner = const FlagsCont;
+
+        using value_type = typename std::conditional<IsConst, const std::pair<Dim, T>, std::pair<Dim, T>>::type;
+        using pointer = value_type*;
+        using reference = value_type&;
+        using iterator_category = std::bidirectional_iterator_tag;
+        using difference_type = std::ptrdiff_t;
+
+        Iterator() = default;
+
+        Iterator(const Iterator&) = default;
+        Iterator& operator=(const Iterator&) = default;
+
+        Iterator(Dim cur, ValuesContInner& values, FlagsContInner& flags) : _cur(cur), _values(&values), _flags(&flags) {
+            advance();
+        }
+
+        reference operator*() const {
+            auto curInd = static_cast<int32_t>(_cur);
+            IE_ASSERT(curInd >= 0 && curInd < MAX_DIMS_64);
+            IE_ASSERT((*_flags)[curInd]);
+
+            return (*_values)[curInd];
+        }
+
+        Iterator& operator++() {
+            auto curInd = static_cast<int32_t>(_cur);
+            IE_ASSERT(curInd >= 0 && curInd < MAX_DIMS_64);
+            IE_ASSERT((*_flags)[curInd]);
+
+            _cur = static_cast<Dim>(static_cast<int32_t>(_cur) + 1);
+            advance();
+            return *this;
+        }
+        Iterator operator++(int) {
+            auto curInd = static_cast<int32_t>(_cur);
+            IE_ASSERT(curInd >= 0 && curInd < MAX_DIMS_64);
+            IE_ASSERT((*_flags)[curInd]);
+
+            auto tmp(*this);
+            _cur = static_cast<Dim>(static_cast<int32_t>(_cur) + 1);
+            advance();
+            return tmp;
+        }
+
+        Iterator& operator--() {
+            auto curInd = static_cast<int32_t>(_cur);
+            IE_ASSERT(curInd >= 0 && curInd < MAX_DIMS_64);
+            IE_ASSERT((*_flags)[curInd]);
+
+            _cur = static_cast<Dim>(static_cast<int32_t>(_cur) - 1);
+            moveBack();
+            return *this;
+        }
+        Iterator operator--(int) {
+            auto curInd = static_cast<int32_t>(_cur);
+            IE_ASSERT(curInd >= 0 && curInd < MAX_DIMS_64);
+            IE_ASSERT((*_flags)[curInd]);
+
+            auto tmp(*this);
+            _cur = static_cast<Dim>(static_cast<int32_t>(_cur) - 1);
+            moveBack();
+            return tmp;
+        }
+
+        bool operator==(const Iterator& other) const { return _cur == other._cur; }
+        bool operator!=(const Iterator& other) const { return _cur != other._cur; }
+
+    private:
+        void advance() {
+            auto curInd = static_cast<int32_t>(_cur);
+            while (curInd >= 0 && curInd < MAX_DIMS_64 && !(*_flags)[curInd]) {
+                ++curInd;
+            }
+
+            if (curInd == MAX_DIMS_64) {
+                curInd = -1;
+            }
+
+            _cur = static_cast<Dim>(curInd);
+        }
+
+        void moveBack() {
+            auto curInd = static_cast<int32_t>(_cur);
+            while (curInd >= 0 && curInd < MAX_DIMS_64 && !(*_flags)[curInd]) {
+                --curInd;
+            }
+
+            _cur = static_cast<Dim>(curInd);
+        }
+
+    private:
+        Dim _cur = Dim::Invalid;
+        ValuesContInner* _values;
+        FlagsContInner* _flags;
+    };
+
+    using value_type = std::pair<Dim, T>;
+    using iterator = Iterator<false>;
+    using const_iterator = Iterator<true>;
+
+    DimValues_() {
+        _flags.fill(false);
+    }
+    explicit DimValues_(std::initializer_list<value_type> data) {
+        _flags.fill(false);
+
+        for (const auto& p : data) {
+            auto ind = static_cast<int32_t>(p.first);
+            IE_ASSERT(ind >= 0 && ind < MAX_DIMS_64);
+            IE_ASSERT(!_flags[ind]);
+
+            _values[ind] = p;
+            _flags[ind] = true;
+        }
+
+        _size = data.size();
+    }
+
+    DimValues_(const DimValues_&) = default;
+    DimValues_& operator=(const DimValues_&) = default;
+
+    size_t size() const { return _size; }
+    bool empty() const { return _size == 0; }
+
+    void clear() {
+        _flags.fill(false);
+        _size = 0;
+    }
+    void erase(Dim d) {
+        auto ind = static_cast<int32_t>(d);
+        IE_ASSERT(ind >= 0 && ind < MAX_DIMS_64);
+
+        if (_flags[ind]) {
+            IE_ASSERT(_size > 0);
+
+            _flags[ind] = false;
+            --_size;
+        }
+    }
+
+    bool has(Dim d) const {
+        auto ind = static_cast<int32_t>(d);
+        IE_ASSERT(ind >= 0 && ind < MAX_DIMS_64);
+
+        return _flags[ind];
+    }
+
+    const T& operator[](Dim d) const {
+        auto ind = static_cast<int32_t>(d);
+        IE_ASSERT(ind >= 0 && ind < MAX_DIMS_64);
+        IE_ASSERT(_flags[ind]);
+
+        return _values[ind].second;
+    }
+    const T& get(Dim d, const T& def) const {
+        auto ind = static_cast<int32_t>(d);
+        IE_ASSERT(ind >= 0 && ind < MAX_DIMS_64);
+
+        return _flags[ind] ? _values[ind].second : def;
+    }
+
+    void set(Dim d, const T& val) {
+        auto ind = static_cast<int32_t>(d);
+        IE_ASSERT(ind >= 0 && ind < MAX_DIMS_64);
+
+        if (!_flags[ind]) {
+            _flags[ind] = true;
+            ++_size;
+        }
+
+        _values[ind] = std::make_pair(d, val);
+    }
+
+    iterator begin() { return iterator(Dim::W, _values, _flags); }
+    iterator end() { return iterator(Dim::Invalid, _values, _flags); }
+
+    const_iterator begin() const { return const_iterator(Dim::W, _values, _flags); }
+    const_iterator end() const { return const_iterator(Dim::Invalid, _values, _flags); }
+
+    const_iterator cbegin() const { return const_iterator(Dim::W, _values, _flags); }
+    const_iterator cend() const { return const_iterator(Dim::Invalid, _values, _flags); }
+
+    std::array<T, MAX_DIMS_64> toVector(const T& emptyValue) const {
+        std::array<T, MAX_DIMS_64> out;
+        out.fill(emptyValue);
+
+        for (int ind = 0; ind < MAX_DIMS_64; ++ind) {
+            if (_flags[ind]) {
+                out[ind] = _values[ind].second;
+            }
+        }
+
+        return out;
+    }
+
+    bool operator==(const DimValues_& other) const {
+        for (int ind = 0; ind < MAX_DIMS_64; ++ind) {
+            if (_flags[ind] != other._flags[ind]) {
+                return false;
+            }
+            if (_flags[ind] && _values[ind].second != other._values[ind].second) {
+                return false;
+            }
+        }
+        return true;
+    }
+    bool operator!=(const DimValues_& other) const {
+        for (int ind = 0; ind < MAX_DIMS_64; ++ind) {
+            if (_flags[ind] != other._flags[ind]) {
+                return true;
+            }
+            if (_flags[ind] && _values[ind].second != other._values[ind].second) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    void printTo(std::ostream& os) const {
+        os << "[";
+
+        int realInd = 0;
+        for (int ind = 0; ind < MAX_DIMS_64; ++ind) {
+            if (_flags[ind]) {
+                vpu::printTo(os, _values[ind].first);
+                os << " : ";
+                vpu::printTo(os, _values[ind].second);
+                if (realInd + 1 < _size) {
+                    os << ", ";
+                }
+                ++realInd;
+            }
+        }
+
+        os << "]";
+    }
+
+private:
+    ValuesCont _values = {};
+    FlagsCont _flags;
+    size_t _size = 0;
+};
+
+template <typename T>
+void printTo(std::ostream& os, const DimValues_<T>& dims) {
+    dims.printTo(os);
+}
+
+using DimValues = DimValues_<int>;
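+
+// Usage example (editorial note): entries are keyed by Dim, so insertion
+// order does not matter:
+//
+//     DimValues dims({{Dim::W, 224}, {Dim::H, 224}, {Dim::C, 3}});
+//     dims.set(Dim::N, 1);
+//     int width = dims[Dim::W];          // 224
+//     int depth = dims.get(Dim::_5, 1);  // 1 - default for a missing dim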
+
+//
+// DimsOrder
+//
+
+StorageOrder64 maskOrder(StorageOrder64 fullOrder, int size);
+
+class DimsOrder final {
+public:
+    //
+    // Predefined orders
+    //
+
+    static DimsOrder C;
+    static DimsOrder NC;
+    static DimsOrder CHW;
+    static DimsOrder HWC;
+    static DimsOrder HCW;
+    static DimsOrder NCHW;
+    static DimsOrder NHWC;
+    static DimsOrder NHCW;
+
+    //
+    // Constructor
+    //
+
+    DimsOrder() = default;
+    static DimsOrder fromCode(StorageOrder64 code);
+    static DimsOrder fromNumDims(int numDims);
+    static DimsOrder fromPermutation(const std::vector<Dim>& perm);
+
+    //
+    // Accessors
+    //
+
+    bool empty() const { return _code == 0; }
+
+    int numDims() const;
+
+    bool hasDim(Dim d) const;
+    int dimInd(Dim d) const;
+
+    StorageOrder64 code() const { return _code; }
+
+    //
+    // Information about dimension order
+    //
+
+    // Convert from packed format to array of dimensions from minor to major.
+    std::vector<Dim> toPermutation() const;
+
+    // Get memory indices for each dimension.
+    DimValues toIndices() const;
+
+    //
+    // Relayout helpers
+    //
+
+    // In-place modification.
+    void moveDim(Dim dim, int newPos);
+
+    // Makes new object.
+    DimsOrder createMovedDim(Dim dim, int newPos) const;
+
+private:
+    StorageOrder64 _code = 0;
+};
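+
+// Example (editorial note): toPermutation() lists dims from minor to major,
+// so DimsOrder::NCHW yields {W, H, C, N} and DimsOrder::NHWC yields {C, W, H, N}.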
+
+bool isOrdersCompatible(DimsOrder order1, DimsOrder order2);
+
+inline bool operator==(DimsOrder order1, DimsOrder order2) {
+    return order1.code() == order2.code();
+}
+inline bool operator!=(DimsOrder order1, DimsOrder order2) {
+    return order1.code() != order2.code();
+}
+
+void printTo(std::ostream& os, DimsOrder order);
+
+struct DimsOrderHash final {
+    size_t operator()(DimsOrder order) const {
+        return std::hash<StorageOrder64>()(order.code());
+    }
+};
+
+using DimsOrderSet = std::unordered_set<DimsOrder, DimsOrderHash>;
+template <typename Val>
+using DimsOrderMap = std::unordered_map<DimsOrder, Val, DimsOrderHash>;
+
+//
+// DataDesc
+//
+
+class DataDesc final {
+public:
+    //
+    // Constructors
+    //
+
+    DataDesc() = default;
+
+    template <typename IntValue, typename = typename std::enable_if<std::is_integral<IntValue>::value>::type>
+    DataDesc(DataType type, DimsOrder dimsOrder, std::initializer_list<IntValue> dims) :
+            _type(type), _dimsOrder(dimsOrder) {
+        auto perm = _dimsOrder.toPermutation();
+        IE_ASSERT(dims.size() == perm.size());
+
+        int ind = 0;
+        for (auto val : dims) {
+            _dims.set(perm[ind], val);
+            ++ind;
+        }
+    }
+
+    template <typename IntValue, typename = typename std::enable_if<std::is_integral<IntValue>::value>::type>
+    DataDesc(DimsOrder dimsOrder, std::initializer_list<IntValue> dims) : DataDesc(DataType::FP16, dimsOrder, dims) {}
+
+    template <typename IntValue, typename = typename std::enable_if<std::is_integral<IntValue>::value>::type>
+    explicit DataDesc(std::initializer_list<IntValue> dims) : DataDesc(DataType::FP16, DimsOrder::fromNumDims(dims.size()), dims) {}
+
+    explicit DataDesc(const ie::TensorDesc& ieDesc);
+
+    DataDesc(DataType type, DimsOrder dimsOrder, const DimValues& dims);
+
+    //
+    // DataType
+    //
+
+    DataType type() const { return _type; }
+
+    void setType(DataType type) { _type = type; }
+
+    int elemSize() const;
+
+    //
+    // Dims
+    //
+
+    int numDims() const { return _dimsOrder.numDims(); }
+
+    const DimValues& dims() const { return _dims; }
+
+    int dim(Dim d) const { return _dims[d]; }
+    int dim(Dim d, int defVal) const { return _dims.has(d) ? _dims[d] : defVal; }
+
+    void setDim(Dim d, int val);
+
+    int totalDimSize() const;
+
+    //
+    // DimsOrder
+    //
+
+    DimsOrder dimsOrder() const { return _dimsOrder; }
+
+    void moveDim(Dim dim, int newPos) {
+        _dimsOrder.moveDim(dim, newPos);
+    }
+
+    void reorder(DimsOrder dimsOrder);
+
+private:
+    DataType _type = DataType::FP16;
+    DimsOrder _dimsOrder;
+    DimValues _dims;
+};
+
+void printTo(std::ostream& os, const DataDesc& desc);
+void printTo(DotLabel& lbl, const DataDesc& desc);
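+
+// Construction example (editorial note): with the initializer-list
+// constructor the values are given in permutation (minor-to-major) order,
+// since each value is bound to perm[i]:
+//
+//     DataDesc desc(DataType::FP16, DimsOrder::NCHW, {224, 224, 3, 1});
+//     // desc.dim(Dim::W) == 224, desc.dim(Dim::C) == 3, desc.dim(Dim::N) == 1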
+
+//
+// DimStride
+//
+
+VPU_DECLARE_ENUM(DimStride,
+    Any,
+    Compact,
+    Aligned
+)
+
+const int STRIDE_ALIGNMENT = 16;
+
+//
+// StridesRequirement
+//
+
+//
+// Container for the stride requirement of each dimension (in memory order).
+//
+
+class StridesRequirement final {
+public:
+    StridesRequirement() { _map[0] = DimStride::Compact; }
+
+    static StridesRequirement empty() { return StridesRequirement().add(0, DimStride::Any); }
+    static StridesRequirement compact();
+
+    StridesRequirement& add(int index, DimStride stride) {
+        IE_ASSERT(index >= 0 && index < MAX_DIMS_64);
+        _map[index] = stride;
+        return *this;
+    }
+
+    StridesRequirement& remove(int index) {
+        IE_ASSERT(index >= 0 && index < MAX_DIMS_64);
+        _map[index] = DimStride::Any;
+        return *this;
+    }
+
+    DimStride get(int index) const {
+        IE_ASSERT(index >= 0 && index < MAX_DIMS_64);
+        return _map[index];
+    }
+
+    bool operator==(const StridesRequirement& other) const {
+        return (_map == other._map);
+    }
+    bool operator!=(const StridesRequirement& other) const {
+        return (_map != other._map);
+    }
+
+private:
+    std::array<DimStride, MAX_DIMS_64> _map{{DimStride::Any}};
+};
+
+void printTo(std::ostream& os, const StridesRequirement& reqs);
+void printTo(DotLabel& lbl, const StridesRequirement& reqs);
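+
+// Example (editorial note, assuming index 0 refers to the minor dimension):
+// require the minor dimension to stay compact (the default) and the next one
+// to be aligned to STRIDE_ALIGNMENT bytes:
+//
+//     auto reqs = StridesRequirement().add(1, DimStride::Aligned);
+//     // index 0 is already DimStride::Compact by default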
+
+DimValues calcStrides(const DataDesc& desc, const StridesRequirement& reqs);
+
+bool checkStride(
+        const DimValues& strides,
+        const DataDesc& desc,
+        int ind,
+        DimStride req);
+bool checkStrides(
+        const DataDesc& desc,
+        const DimValues& strides,
+        const StridesRequirement& reqs);
+
+int calcTotalByteSize(const DataDesc& desc, const DimValues& strides);
+
+//
+// BatchSupport
+//
+
+VPU_DECLARE_ENUM(BatchSupport,
+    Split,
+    ReplicateConstContent
+)
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/edges.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/edges.hpp
new file mode 100644
index 0000000..4d5af66
--- /dev/null
@@ -0,0 +1,161 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/base.hpp>
+
+namespace vpu {
+
+//
+// StageInputEdge
+//
+
+//
+// Data -> Stage edge.
+//
+
+class StageInputEdge final :
+        public EnableHandleFromThis<StageInputEdge>,
+        public EnableCustomAttributes {
+    VPU_MODEL_ATTRIBUTE(Data, input, nullptr)
+    VPU_MODEL_ATTRIBUTE(Stage, consumer, nullptr)
+    VPU_MODEL_ATTRIBUTE(int, portInd, -1)
+    VPU_MODEL_ATTRIBUTE(StageInput, parentEdge, nullptr)
+    VPU_MODEL_ATTRIBUTE(StageInput, childEdge, nullptr)
+
+private:
+    StageInputEdge() : _posInData(this) {}
+
+private:
+    Handle<Model> _model;
+    StageInputPtrList::iterator _ptrPosInModel;
+    IntrusivePtrListNode<StageInputEdge> _posInData;
+
+    friend class Model;
+    friend class DataNode;
+};
+
+//
+// StageOutputEdge
+//
+
+//
+// Stage -> Data edge.
+//
+
+class StageOutputEdge final :
+        public EnableHandleFromThis<StageOutputEdge>,
+        public EnableCustomAttributes {
+    VPU_MODEL_ATTRIBUTE(Stage, producer, nullptr)
+    VPU_MODEL_ATTRIBUTE(Data, output, nullptr)
+    VPU_MODEL_ATTRIBUTE(int, portInd, -1)
+    VPU_MODEL_ATTRIBUTE(StageOutput, parentEdge, nullptr)
+    VPU_MODEL_ATTRIBUTE(StageOutput, childEdge, nullptr)
+
+private:
+    Handle<Model> _model;
+    StageOutputPtrList::iterator _ptrPosInModel;
+
+    friend class Model;
+};
+
+//
+// StageTempBufferEdge
+//
+
+class StageTempBufferEdge final :
+        public EnableHandleFromThis<StageTempBufferEdge>,
+        public EnableCustomAttributes {
+    VPU_MODEL_ATTRIBUTE(Stage, stage, nullptr)
+    VPU_MODEL_ATTRIBUTE(Data, tempBuffer, nullptr)
+    VPU_MODEL_ATTRIBUTE(int, portInd, -1)
+    VPU_MODEL_ATTRIBUTE(StageTempBuffer, parentEdge, nullptr)
+    VPU_MODEL_ATTRIBUTE(StageTempBuffer, childEdge, nullptr)
+
+private:
+    Handle<Model> _model;
+    StageTempBufferPtrList::iterator _ptrPosInModel;
+
+    friend class Model;
+};
+
+//
+// SharedAllocationEdge
+//
+
+//
+// Data <-> Data edges - used to share a memory buffer between Data objects.
+// The parent Data object owns the memory, while the child reuses it.
+//
+// SharedDataMode defines the relationship between the Data objects:
+//    * ROI : the child is a sub-tensor of the parent.
+//      They have the same layout and strides, but the child has smaller dimensions.
+//    * Reshape : used for the Reshape operation.
+//      The child shares the same memory buffer, but has a completely different layout.
+//
+// SharedDataOrder defines the data flow order between parent and child:
+//    * ParentWritesToChild :
+//      (Producer) -> [Parent] -> [Child] -> (Consumer)
+//    * ChildWritesToParent :
+//      (Producer) -> [Child] -> [Parent] -> (Consumer)
+//
+
+VPU_DECLARE_ENUM(SharedDataMode,
+    ROI,
+    Reshape)
+
+VPU_DECLARE_ENUM(SharedDataOrder,
+    ParentWritesToChild,
+    ChildWritesToParent)
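+
+// Example (illustrative): the inputs of a Concat stage could be ROI children
+// of its output buffer; each producer then writes its slice directly into
+// the parent, which corresponds to the ChildWritesToParent order.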
+
+class SharedAllocationEdge final :
+        public EnableHandleFromThis<SharedAllocationEdge>,
+        public EnableCustomAttributes {
+    VPU_MODEL_ATTRIBUTE(Data, parent, nullptr)
+    VPU_MODEL_ATTRIBUTE(Data, child, nullptr)
+    VPU_MODEL_ATTRIBUTE(Stage, connection, nullptr)
+    VPU_MODEL_ATTRIBUTE(SharedDataMode, mode, SharedDataMode::ROI)
+    VPU_MODEL_ATTRIBUTE(SharedDataOrder, order, SharedDataOrder::ParentWritesToChild)
+
+private:
+    SharedAllocationEdge() : _posInData(this) {}
+
+private:
+    Handle<Model> _model;
+    SharedAllocationPtrList::iterator _ptrPosInModel;
+    IntrusivePtrListNode<SharedAllocationEdge> _posInData;
+
+    friend class Model;
+    friend class DataNode;
+};
+
+//
+// InjectedStageEdge
+//
+
+//
+// Stage <-> Stage edges - used to inject SW operations into HW stages.
+//
+
+class InjectedStageEdge final :
+        public EnableHandleFromThis<InjectedStageEdge>,
+        public EnableCustomAttributes {
+    VPU_MODEL_ATTRIBUTE(Stage, parent, nullptr)
+    VPU_MODEL_ATTRIBUTE(StagePtr, child, nullptr)
+    VPU_MODEL_ATTRIBUTE(int, portInd, -1)
+
+private:
+    InjectedStageEdge() : _posInStage(this) {}
+
+private:
+    Handle<Model> _model;
+    InjectedStagePtrList::iterator _ptrPosInModel;
+    IntrusivePtrListNode<InjectedStageEdge> _posInStage;
+
+    friend class Model;
+    friend class StageNode;
+};
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/model.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/model.hpp
new file mode 100644 (file)
index 0000000..22af3de
--- /dev/null
@@ -0,0 +1,370 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <functional>
+#include <set>
+
+#include <ie_icnn_network.hpp>
+
+#include <vpu/model/base.hpp>
+#include <vpu/model/edges.hpp>
+#include <vpu/model/data.hpp>
+#include <vpu/model/stage.hpp>
+#include <vpu/utils/enums.hpp>
+#include <vpu/utils/io.hpp>
+#include <vpu/utils/dot_io.hpp>
+#include <vpu/allocator.hpp>
+
+namespace vpu {
+
+//
+// Resources
+//
+
+// TODO: get rid of `cmxLimit`.
+
+struct Resources final {
+    int numCMXSlices = 0;
+    int numSHAVEs = 0;
+    int cmxLimit = 0;
+};
+
+void printTo(std::ostream& os, const Resources& res);
+void printTo(DotLabel& lbl, const Resources& res);
+
+//
+// Model
+//
+
+VPU_DECLARE_ENUM(BuildStageOrder,
+                 DFS,
+                 BFS)
+
+class Model final :
+        public EnableHandleFromThis<Model>,
+        public EnableCustomAttributes {
+private:
+    // Need to declare here to use decltype
+    DataList _dataList;
+    mutable StageList _orderedStageList;
+
+    struct SubGraphFilter final {
+        int subGraphNumber = -1;
+
+        inline bool operator()(const Stage& stage) const {
+            return stage->subGraphNumber() == subGraphNumber;
+        }
+    };
+
+    //
+    // Main attributes
+    //
+
+    VPU_MODEL_ATTRIBUTE(std::string, name, std::string())
+
+    VPU_MODEL_ATTRIBUTE(int, batchSize, 1)
+
+    VPU_MODEL_ATTRIBUTE(InferenceEngine::NetworkStatsMap, nodesStats, {})
+
+    VPU_MODEL_ATTRIBUTE(int, numberOfSubGraphs, 1)
+
+public:
+    using Ptr = ModelPtr;
+
+    //
+    // Constructor
+    //
+
+    inline explicit Model(const std::string& name) :
+            _dataList(&DataNode::_posInModel),
+            _orderedStageList(&StageNode::_posInModel),
+            _name(name) {
+    }
+
+    //
+    // Main attributes
+    //
+
+    void setBatchSize(int batchSize);
+
+    inline void setNodesStats(const ie::NetworkStatsMap& stats) { _nodesStats = stats; }
+
+    void setNumberOfSubGraphs(int numberOfSubGraphs);
+
+    //
+    // Data nodes
+    //
+
+    Data addInputData(
+            const std::string& name,
+            const DataDesc& desc);
+
+    Data addOutputData(
+            const std::string& name,
+            const DataDesc& desc);
+
+    Data addConstData(
+            const std::string& name,
+            const DataDesc& desc,
+            const DataContent::Ptr& content);
+
+    Data addNewData(
+            const std::string& name,
+            const DataDesc& desc);
+
+    Data addFakeData();
+
+    Data duplicateData(
+            const Data& origData,
+            const std::string& postfix,
+            const DataDesc& newDesc = DataDesc(),
+            const DataContent::Ptr& newContent = nullptr);
+
+    //
+    // Stage nodes
+    //
+
+    template <class StageImpl>
+    Stage addNewStage(
+            const std::string& name,
+            StageType type,
+            const ie::CNNLayerPtr& origLayer,
+            const DataVector& inputs,
+            const DataVector& outputs);
+
+    Stage duplicateStage(
+            const std::string& name,
+            const Stage& origStage,
+            const DataVector& inputs,
+            const DataVector& outputs);
+
+    //
+    // Stage <-> Data edges
+    //
+
+    StageInput addStageInput(
+            const Stage& stage,
+            const Data& data);
+
+    StageOutput addStageOutput(
+            const Stage& stage,
+            const Data& data);
+
+    StageTempBuffer addTempBuffer(
+            const Stage& stage,
+            const DataDesc& desc);
+
+    void replaceStageInput(
+            const StageInput& edge,
+            const Data& newInput);
+
+    void replaceStageOutput(
+            const StageOutput& edge,
+            const Data& newOutput);
+
+    //
+    // Stage <-> Stage edges
+    //
+
+    class InjectStageHelper final {
+    public:
+        inline InjectStageHelper(InjectStageHelper&&) = default;
+
+        InjectStageHelper(const InjectStageHelper&) = delete;
+        InjectStageHelper& operator=(const InjectStageHelper&) = delete;
+        InjectStageHelper& operator=(InjectStageHelper&&) = delete;
+
+        ~InjectStageHelper();
+
+        InjectStageHelper& parentHW(const Stage& parent);
+        InjectStageHelper& childSW(const Stage& child);
+
+        InjectedStage done();
+
+    private:
+        inline explicit InjectStageHelper(const Handle<Model>& model) : _model(model) {}
+
+    private:
+        Handle<Model> _model;
+
+        Stage _parent;
+        Stage _child;
+
+        friend class Model;
+    };
+
+    inline InjectStageHelper injectStage() { return InjectStageHelper(handle_from_this()); }
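+
+    // Example (illustrative): injecting a SW stage into a HW stage with the
+    // injectStage() builder above might look like:
+    //
+    //     auto edge = model->injectStage()
+    //         .parentHW(hwStage)
+    //         .childSW(swStage)
+    //         .done();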
+
+    void revertInjection(const InjectedStage& edge);
+
+    //
+    // Data<->Data edges
+    //
+
+    class DataEdgeHelper final {
+    public:
+        inline DataEdgeHelper(DataEdgeHelper&&) = default;
+
+        DataEdgeHelper(const DataEdgeHelper&) = delete;
+        DataEdgeHelper& operator=(const DataEdgeHelper&) = delete;
+        DataEdgeHelper& operator=(DataEdgeHelper&&) = delete;
+
+        ~DataEdgeHelper();
+
+        DataEdgeHelper& parent(const Data& parent);
+        DataEdgeHelper& child(const Data& child);
+
+        DataEdgeHelper& mode(SharedDataMode mode);
+        DataEdgeHelper& order(SharedDataOrder order);
+
+        DataEdgeHelper& offset(const DimValues& offset);
+
+        SharedAllocation done();
+
+    private:
+        inline explicit DataEdgeHelper(const Handle<Model>& model) : _model(model) {}
+
+    private:
+        Handle<Model> _model;
+
+        Data _parent;
+        Data _child;
+
+        SharedDataMode _mode = SharedDataMode::ROI;
+        bool _modeSet = false;
+
+        SharedDataOrder _order = SharedDataOrder::ParentWritesToChild;
+        bool _orderSet = false;
+
+        DimValues _offset;
+        bool _offsetSet = false;
+
+        friend class Model;
+    };
+
+    inline DataEdgeHelper connectDatas() { return DataEdgeHelper(handle_from_this()); }
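+
+    // Example (illustrative): sharing memory between a parent tensor and an
+    // ROI child with the connectDatas() builder above, assuming `parentData`
+    // and `childData` were created on this Model:
+    //
+    //     auto edge = model->connectDatas()
+    //         .parent(parentData)
+    //         .child(childData)
+    //         .mode(SharedDataMode::ROI)
+    //         .order(SharedDataOrder::ChildWritesToParent)
+    //         .done();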
+
+    //
+    // Nodes removal
+    //
+
+    void disconnectStageDatas(const Stage& stage);
+
+    void removeStage(const Stage& stage);
+
+    void removeUnusedData(const Data& data);
+
+    void cleanUpDatas();
+
+    //
+    // Stage order
+    //
+
+    // TODO: allow overriding the stage order.
+    void buildStageOrder(BuildStageOrder order = BuildStageOrder::DFS) const;
+
+    //
+    // Nodes accessors
+    //
+
+    inline int numDatas() const { return _dataPtrList.size(); }
+    inline auto datas() const -> decltype(contRange(_dataList)) {
+        return contRange(_dataList);
+    }
+
+    inline int numStages() const { return _stagePtrList.size(); }
+    inline auto getStages(BuildStageOrder order = BuildStageOrder::DFS) const -> decltype(contRange(_orderedStageList)) {
+        buildStageOrder(order);
+        return contRange(_orderedStageList);
+    }
+    inline auto getSubGraphStages(int subGraphNumber) const -> decltype(filterRange<SubGraphFilter>(getStages(BuildStageOrder::DFS))) {
+        SubGraphFilter f;
+        f.subGraphNumber = subGraphNumber;
+        return filterRange(getStages(), f);
+    }
+
+    //
+    // Allocator
+    //
+
+    inline Allocator& getAllocator() { return _allocator; }
+
+private:
+    Stage addNewStageImpl(
+        const std::string& name,
+        StageType type,
+        const ie::CNNLayerPtr& origLayer,
+        const DataVector& inputs,
+        const DataVector& outputs,
+        const FuncRef<StagePtr()>& creator);
+
+    InjectedStage injectStageImpl(
+            const Stage& parent,
+            const Stage& child);
+
+    SharedAllocation connectDatasImpl(
+            const Data& parent,
+            const Data& child,
+            SharedDataMode mode,
+            SharedDataOrder order,
+            const DimValues& offset);
+
+    void runDFS(
+            const Stage& stage,
+            StageMap<bool>& visitedMap) const;
+
+    void runBFS(
+            StageList& queue,
+            StageMap<bool>& visitedMap) const;
+
+private:
+    DataPtrList _dataPtrList;
+    StagePtrList _stagePtrList;
+
+    StageInputPtrList _inEdgePtrList;
+    StageOutputPtrList _outEdgePtrList;
+    StageTempBufferPtrList _tempBufferEdgePtrList;
+    SharedAllocationPtrList _dataEdgePtrList;
+    InjectedStagePtrList _stageEdgePtrList;
+
+    Allocator _allocator;
+
+    std::set<Stage, StageNode::StageCmp> _initialStages;
+
+    mutable bool _resetStageOrder = true;
+    mutable BuildStageOrder _stageOrder = BuildStageOrder::DFS;
+
+    friend class InjectStageHelper;
+    friend class DataEdgeHelper;
+};
+
+template <class StageImpl>
+inline Stage Model::addNewStage(
+        const std::string& name,
+        StageType type,
+        const ie::CNNLayerPtr& origLayer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    return addNewStageImpl(
+        name,
+        type,
+        origLayer,
+        inputs,
+        outputs,
+        []() { return std::make_shared<StageImpl>(); });
+}
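+
+// Example (illustrative): a frontend or pass could create a stage for a
+// hypothetical StageNode subclass `MyCopyStage` as follows:
+//
+//     auto stage = model->addNewStage<MyCopyStage>(
+//         "myCopy", StageType::Copy, origLayer,
+//         {input}, {output});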
+
+//
+// runAllocator
+//
+
+AllocationResult runAllocator(
+        const Model::Ptr& model,
+        bool onlyCheckCMX = false);
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/stage.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/stage.hpp
new file mode 100644 (file)
index 0000000..34308c2
--- /dev/null
@@ -0,0 +1,450 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <map>
+#include <utility>
+
+#include <ie_layers.h>
+
+#include <vpu/model/base.hpp>
+#include <vpu/model/edges.hpp>
+#include <vpu/model/data.hpp>
+#include <vpu/model/data_desc.hpp>
+#include <vpu/backend/blob_serializer.hpp>
+#include <vpu/utils/enums.hpp>
+
+namespace vpu {
+
+//
+// StageType
+//
+
+// Must be synchronized with MvTensor
+VPU_DECLARE_ENUM(StageType,
+    //
+    // These are special operations that are not present in MvTensor,
+    // but are used for internal Model processing and optimization.
+    //
+
+    __SPECIAL_START__ = -100000,
+
+    //
+    // Stages that have both HW and SW variants.
+    // These are stubs that will be replaced with a concrete implementation during model optimization.
+    //
+
+    StubConv,
+    StubMaxPool,
+    StubAvgPool,
+    StubFullyConnected,
+    StubDeconv,
+
+    Concat,
+    Split,
+    Reshape,
+    Expand,
+    Shrink,
+
+    //
+    // Normal operations
+    //
+
+    Conv = 0,
+    MaxPool = 1,
+    AvgPool = 2,
+    SoftMax = 3,
+    FC = 4,
+    None = 5,
+    Relu = 6,
+    DepthConv = 8,
+    Bias = 9,
+    PRelu = 10,
+    LRN = 11,
+    Sum = 12,
+    Prod = 13,
+    Max = 14,
+    Scale = 15,
+    InnerLRN = 18,
+    Copy = 19,
+    Sigmoid = 20,
+    Tanh = 21,
+    Deconvolution = 22,
+    Elu = 23,
+    Power = 26,
+    Crop = 27,
+    Tile = 28,
+    RegionYolo = 29,
+    ReorgYolo = 30,
+    Convert_u8f16 = 31,
+    Convert_f32f16 = 32,
+    Convert_f16f32 = 33,
+    Permute = 34,
+    Normalize = 35,
+    DetectionOutput = 37,
+    MyriadXHwOp = 38,
+    CTCDecoder = 43,
+    LeakyRelu = 44,
+    BiasRelu = 45,
+    BiasLeakyRelu = 46,
+    ScaleShift = 47,
+    Im2ColConvolution = 49,
+    HwFcRelayout = 56,
+    Clamp = 57,
+    RefConvolution = 58,
+    GlobalAvgPool = 59,
+    GlobalMaxPool = 60,
+    GRN = 61,
+    MVN = 62,
+    DepthDeconv = 63,
+    Proposal = 64,
+    ROIPooling = 65,
+    PSROIPooling = 66,
+    Interp = 67,
+    Custom = 68,
+    MTCNN = 69,
+    LSTMCell = 70,
+    Pad = 71,
+    Resample = 72,
+    Upsampling = 73,
+    ArgMax = 74,
+)
+
+//
+// StageCategory
+//
+
+VPU_DECLARE_ENUM(StageCategory,
+    SHAVE,
+    HW,
+    DMA,
+    Special)
+
+//
+// PadMode
+//
+
+// Must be aligned with ie::PadLayer::ePadMode
+VPU_DECLARE_ENUM(PadMode,
+    Constant = 0,
+    Edge = 1,
+    Reflect = 2,
+    Symmetric = 3)
+
+//
+// StageSHAVEsRequirements
+//
+
+VPU_DECLARE_ENUM(StageSHAVEsRequirements,
+    NotNeeded,
+    OnlyOne,
+    TwoOrOne,
+    CanBeLimited,
+    NeedMax
+);
+
+//
+// StageNode
+//
+
+VPU_DECLARE_ENUM(ScalePropagationStep,
+    Check,
+    ScaleInput,
+    Propagate
+);
+
+class StageNode :
+        public EnableHandleFromThis<StageNode>,
+        public EnableCustomAttributes {
+    //
+    // Main attributes
+    //
+
+    VPU_MODEL_ATTRIBUTE(std::string, name, "")
+    VPU_MODEL_ATTRIBUTE(StageType, type, StageType::None)
+    VPU_MODEL_ATTRIBUTE(int, index, -1)
+    VPU_MODEL_ATTRIBUTE(int, subGraphNumber, -1)
+
+    //
+    // Bindings with IE
+    //
+
+    VPU_MODEL_ATTRIBUTE(ie::CNNLayerPtr, origLayer, nullptr)
+
+    //
+    // Edges
+    //
+
+    VPU_MODEL_ATTRIBUTE_PTR_RANGE(StageInputVector, inputEdges)
+    VPU_MODEL_ATTRIBUTE_PTR_RANGE(StageOutputVector, outputEdges)
+
+    VPU_MODEL_ATTRIBUTE_PTR_RANGE(StageTempBufferVector, tempBufferEdges)
+
+    VPU_MODEL_ATTRIBUTE(InjectedStage, parentStageEdge, nullptr)
+    VPU_MODEL_ATTRIBUTE_PTR_RANGE(InjectedStageList, injectedStageEdges)
+
+    //
+    // SHAVEs allocation
+    //
+
+    VPU_MODEL_ATTRIBUTE(int, numSHAVEs, 0)
+
+    //
+    // Edges wrappers
+    //
+
+private:
+    struct InputAccess final {
+        inline auto operator()(const StageInput& edge) const -> decltype(edge->input()) {
+            return edge->input();
+        }
+    };
+
+    struct OutputAccess final {
+        inline auto operator()(const StageOutput& edge) const -> decltype(edge->output()) {
+            return edge->output();
+        }
+    };
+
+    struct ProducerAccess final {
+        inline auto operator()(const Data& data) const -> decltype(data->producer()) {
+            return data->producer();
+        }
+    };
+
+    struct ConsumersAccess final {
+        inline auto operator()(const Data& data) const -> decltype(data->consumers()) {
+            return data->consumers();
+        }
+    };
+
+    struct TempBufferAccess final {
+        inline auto operator()(const StageTempBuffer& edge) const -> decltype(edge->tempBuffer()) {
+            return edge->tempBuffer();
+        }
+    };
+
+    struct InjectedStageAccess final {
+        inline auto operator()(const InjectedStage& edge) const -> decltype(edge->child()) {
+            return edge->child();
+        }
+    };
+
+    struct StageCmp final {
+        inline bool operator()(const Stage& left, const Stage& right) const {
+            auto res = left->name().compare(right->name());
+            if (res > 0) {
+                return true;
+            }
+            if (res < 0) {
+                return false;
+            }
+            // Different Stage nodes might have equal names, compare pointers instead
+            return reinterpret_cast<uintptr_t>(left.get()) > reinterpret_cast<uintptr_t>(right.get());
+        }
+    };
+
+    // It holds the number of separate edges for each prev/next stage.
+    using StageOrderMap = std::map<Stage, int, StageCmp>;
+
+    struct StageOrderMapAccess final {
+        inline const Stage& operator()(const StageOrderMap::value_type& p) const {
+            return p.first;
+        }
+    };
+
+    StageOrderMap _prevStages;
+    StageOrderMap _nextStages;
+
+public:
+    inline int numInputs() const { return _inputEdges.size(); }
+    inline StageInput inputEdge(int ind) const {
+        IE_ASSERT(ind >= 0 && ind < _inputEdges.size());
+        return _inputEdges[ind];
+    }
+    inline Data input(int ind) const {
+        IE_ASSERT(ind >= 0 && ind < _inputEdges.size());
+        return _inputEdges[ind]->input();
+    }
+    inline auto inputs() const -> decltype(mapRange<InputAccess>(inputEdges())) {
+        return mapRange<InputAccess>(inputEdges());
+    }
+
+    inline int numOutputs() const { return _outputEdges.size(); }
+    inline StageOutput outputEdge(int ind) const {
+        IE_ASSERT(ind >= 0 && ind < _outputEdges.size());
+        return _outputEdges[ind];
+    }
+    inline Data output(int ind) const {
+        IE_ASSERT(ind >= 0 && ind < _outputEdges.size());
+        return _outputEdges[ind]->output();
+    }
+    inline auto outputs() const -> decltype(mapRange<OutputAccess>(outputEdges())) {
+        return mapRange<OutputAccess>(outputEdges());
+    }
+
+    inline auto prevStages() const -> decltype(mapRange<StageOrderMapAccess>(contRange(_prevStages))) {
+        return mapRange<StageOrderMapAccess>(contRange(_prevStages));
+    }
+    template <class Cond>
+    inline auto prevStages(Cond&& cond) -> decltype(filterRange(prevStages(), std::forward<typename std::remove_reference<Cond>::type>(cond))) {
+        return filterRange(prevStages(), std::forward<typename std::remove_reference<Cond>::type>(cond));
+    }
+    inline auto nextStages() const -> decltype(mapRange<StageOrderMapAccess>(contRange(_nextStages))) {
+        return mapRange<StageOrderMapAccess>(contRange(_nextStages));
+    }
+    template <class Cond>
+    inline auto nextStages(Cond&& cond) -> decltype(filterRange(nextStages(), std::forward<typename std::remove_reference<Cond>::type>(cond))) {
+        return filterRange(nextStages(), std::forward<typename std::remove_reference<Cond>::type>(cond));
+    }
+
+    inline int numTempBuffers() const { return _tempBufferEdges.size(); }
+    inline StageTempBuffer tempBufferEdge(int ind) const {
+        IE_ASSERT(ind >= 0 && ind < _tempBufferEdges.size());
+        return _tempBufferEdges[ind];
+    }
+    inline Data tempBuffer(int ind) const {
+        IE_ASSERT(ind >= 0 && ind < _tempBufferEdges.size());
+        return _tempBufferEdges[ind]->tempBuffer();
+    }
+    inline auto tempBuffers() const -> decltype(mapRange<TempBufferAccess>(tempBufferEdges())) {
+        return mapRange<TempBufferAccess>(tempBufferEdges());
+    }
+
+    inline Stage parentStage() const { return _parentStageEdge == nullptr ? nullptr : _parentStageEdge->parent(); }
+
+    inline int numInjectedStages() const {
+        return _injectedStageEdges.size();
+    }
+    inline auto injectedStages() const -> decltype(mapRange<InjectedStageAccess>(injectedStageEdges())) {
+        return mapRange<InjectedStageAccess>(injectedStageEdges());
+    }
+
+public:
+    inline virtual ~StageNode() = default;
+
+    //
+    // Stage category
+    //
+
+    inline StageCategory category() const {
+        if (static_cast<int>(_type) < 0) {
+            return StageCategory::Special;
+        } else if (_type == StageType::MyriadXHwOp) {
+            return StageCategory::HW;
+        } else if (_type == StageType::Copy) {
+            return StageCategory::DMA;
+        } else {
+            return StageCategory::SHAVE;
+        }
+    }
+
+    //
+    // Bindings with IE
+    //
+
+    inline std::string origLayerName() const { return _origLayer != nullptr ? _origLayer->name : std::string(); }
+
+    //
+    // Main attributes
+    //
+
+    void setSubGraphNumber(int subGraphNumber);
+
+    //
+    // SHAVEs allocation
+    //
+
+    void setNumSHAVEs(int numSHAVEs);
+
+    //
+    // Passes utilities
+    //
+
+    // Scale factor propagation from inputs to outputs
+    DataMap<float> propagateScaleFactors(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step);
+
+    // Data order propagation from inputs to outputs.
+    DataMap<DimsOrder> propagateDataOrder() const;
+
+    // Get Data strides requirements
+    DataMap<StridesRequirement> getDataStridesRequirements() const;
+
+    // Finalize internal parameters to the final Data layout.
+    void finalizeDataLayout();
+
+    // Information about batch support.
+    DataMap<BatchSupport> getBatchSupportInfo() const;
+
+    // Resource requirements.
+    StageSHAVEsRequirements getSHAVEsRequirements() const;
+
+    // Final check.
+    void finalCheck() const;
+
+    //
+    // Backend utilities
+    //
+
+    void serialize(BlobSerializer& serializer) const;
+
+protected:
+    //
+    // Interfaces for Stages implementations
+    //
+
+    virtual StagePtr cloneImpl() const = 0;
+
+    virtual DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step);
+
+    virtual DataMap<DimsOrder> propagateDataOrderImpl() const = 0;
+
+    virtual DataMap<StridesRequirement> getDataStridesRequirementsImpl() const = 0;
+
+    virtual void finalizeDataLayoutImpl() = 0;
+
+    virtual DataMap<BatchSupport> getBatchSupportInfoImpl() const = 0;
+
+    virtual StageSHAVEsRequirements getSHAVEsRequirementsImpl() const;
+
+    virtual void finalCheckImpl() const = 0;
+
+    virtual void serializeParamsImpl(BlobSerializer& serializer) const = 0;
+
+    virtual void serializeDataImpl(BlobSerializer& serializer) const = 0;
+
+protected:
+    inline StageNode() :
+            _injectedStageEdges(&InjectedStageEdge::_posInStage),
+            _posInModel(this),
+            _posInBfsQueue(this) {
+    }
+    inline StageNode(const StageNode& other) :
+            EnableCustomAttributes(other),
+            _injectedStageEdges(&InjectedStageEdge::_posInStage),
+            _posInModel(this),
+            _posInBfsQueue(this) {
+    }
+
+protected:
+    Handle<Model> _model;
+
+private:
+    StagePtrList::iterator _ptrPosInModel;
+    IntrusivePtrListNode<StageNode> _posInModel;
+    IntrusivePtrListNode<StageNode> _posInBfsQueue;
+
+    friend class Model;
+};
+
+void printTo(std::ostream& os, const Stage& stage);
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/network_config.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/network_config.hpp
new file mode 100644 (file)
index 0000000..a262494
--- /dev/null
@@ -0,0 +1,40 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/graph_transformer.hpp>
+
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+
+#include <details/caseless.hpp>
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+class NetworkConfig final {
+public:
+    void parse(const CompilationConfig& config);
+
+    bool skipAllLayers() const;
+    bool skipLayerType(const std::string& layerType) const { return _noneLayers.count(layerType) != 0; }
+
+    bool hasManualDataScale() const { return !_dataScale.empty(); }
+    const std::unordered_map<std::string, float>& dataScale() const { return _dataScale; }
+
+    bool hwDisabled(const std::string& layerName) const;
+
+private:
+    ie::details::caseless_set<std::string> _noneLayers;
+
+    std::unordered_map<std::string, float> _dataScale;
+
+    std::unordered_set<std::string> _hwWhiteList;
+    std::unordered_set<std::string> _hwBlackList;
+};
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/parsed_config.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/parsed_config.hpp
new file mode 100644 (file)
index 0000000..0e62c01
--- /dev/null
@@ -0,0 +1,101 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <map>
+#include <unordered_map>
+#include <unordered_set>
+#include <string>
+
+#include <vpu/vpu_plugin_config.hpp>
+#include <vpu/private_plugin_config.hpp>
+
+#include <vpu/graph_transformer.hpp>
+#include <vpu/utils/perf_report.hpp>
+#include <vpu/utils/logger.hpp>
+#include <vpu/utils/enums.hpp>
+
+namespace vpu {
+
+VPU_DECLARE_ENUM(ConfigMode,
+    DEFAULT_MODE = 0,
+    RUNTIME_MODE = 1,
+    COMPILE_MODE = 2,
+)
+
+struct ParsedConfig {
+    CompilationConfig compileConfig;
+
+    bool printReceiveTensorTime = false;
+    bool exclusiveAsyncRequests = false;
+    bool perfCount              = false;
+    bool forceReset             = false;
+
+    LogLevel vpuLogLevel = LogLevel::None;
+    LogLevel logLevel = LogLevel::None;
+
+    PerfReport perfReport = PerfReport::PerStage;
+
+    std::map<std::string, std::string> getDefaultConfig() const;
+
+    virtual ~ParsedConfig() = default;
+
+protected:
+    explicit ParsedConfig(ConfigMode configMode = ConfigMode::DEFAULT_MODE);
+
+    void checkUnknownOptions(const std::map<std::string, std::string> &config) const;
+    virtual void checkInvalidValues(const std::map<std::string, std::string> &config) const;
+    std::unordered_set<std::string> getKnownOptions() const;
+
+    std::map<std::string, std::string> parse(const std::map<std::string, std::string> &config) {
+        checkInvalidValues(config);
+        checkUnknownOptions(config);
+        checkOptionsAccordingToMode(config);
+
+        auto defaultConfig = getDefaultConfig();
+        for (auto &&entry : config) {
+            defaultConfig[entry.first] = entry.second;
+        }
+
+        return defaultConfig;
+    }
+
+    void configure(const std::map<std::string, std::string> &config);
+    void checkSupportedValues(const std::unordered_map<std::string, std::unordered_set<std::string>> &supported,
+                              const std::map<std::string, std::string> &config) const;
+
+    virtual void checkOptionsAccordingToMode(const std::map<std::string, std::string> &config) const;
+    virtual std::unordered_set<std::string> getCompileOptions() const;
+    virtual std::unordered_set<std::string> getRuntimeOptions() const;
+
+private:
+    ConfigMode _mode = ConfigMode::DEFAULT_MODE;
+    Logger::Ptr _log;
+};
+
+template<typename T, typename V>
+inline void setOption(T &dst, const V &supported, const std::map<std::string, std::string> &config, const std::string &key) {
+    auto value = config.find(key);
+    if (value != config.end()) {
+        dst = supported.at(value->second);
+    }
+}
+
+inline void setOption(std::string &dst, const std::map<std::string, std::string> &config, const std::string &key) {
+    auto value = config.find(key);
+    if (value != config.end()) {
+        dst = value->second;
+    }
+}
+
+template<typename T, typename C>
+inline void setOption(T &dst, const std::map<std::string, std::string> &config, const std::string &key, const C &preprocess) {
+    auto value = config.find(key);
+    if (value != config.end()) {
+        dst = preprocess(value->second);
+    }
+}
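+
+// Example (illustrative): mapping a YES/NO switch onto a boolean member with
+// the first overload, assuming the usual IE CONFIG_KEY/CONFIG_VALUE macros:
+//
+//     static const std::unordered_map<std::string, bool> switches = {
+//         { CONFIG_VALUE(YES), true },
+//         { CONFIG_VALUE(NO),  false }
+//     };
+//     setOption(perfCount, switches, config, CONFIG_KEY(PERF_COUNT));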
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/pass_manager.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/pass_manager.hpp
new file mode 100644 (file)
index 0000000..5d28e56
--- /dev/null
@@ -0,0 +1,204 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include <string>
+#include <utility>
+
+#include <vpu/model/model.hpp>
+#include <vpu/frontend/stage_builder.hpp>
+#include <vpu/backend/backend.hpp>
+
+namespace vpu {
+
+//
+// Pass
+//
+
+class Pass : public std::enable_shared_from_this<Pass> {
+public:
+    using Ptr = std::shared_ptr<Pass>;
+
+    virtual ~Pass() = default;
+
+    virtual void run(const Model::Ptr& model) = 0;
+};
+
+//
+// PerStagePass
+//
+
+class PerStagePass : public Pass {
+public:
+    explicit PerStagePass(std::initializer_list<StageType> types) : _types(types) {}
+
+    void run(const Model::Ptr& model) override;
+
+protected:
+    virtual void runForStage(const Model::Ptr& model, const Stage& stage) = 0;
+
+private:
+    EnumSet<StageType> _types;
+};
+
+//
+// PassSet
+//
+
+class PassSet final : public std::enable_shared_from_this<PassSet> {
+public:
+    using Ptr = std::shared_ptr<PassSet>;
+
+    void run(const Model::Ptr& model) const;
+
+    void addPass(const Pass::Ptr& pass) { _passes.emplace_back(pass); }
+    void addPass(Pass::Ptr&& pass) { _passes.emplace_back(std::move(pass)); }
+
+private:
+    std::vector<Pass::Ptr> _passes;
+};
+
+//
+// PassManager
+//
+
+class PassManager final : public std::enable_shared_from_this<PassManager> {
+public:
+    using Ptr = std::shared_ptr<PassManager>;
+
+    PassManager(
+            const StageBuilder::Ptr& stageBuilder,
+            const BackEnd::Ptr& backEnd) :
+            _stageBuilder(stageBuilder), _backEnd(backEnd) {
+    }
+
+    PassSet::Ptr buildMiddleEnd();
+
+public:
+    //
+    // Find SubGraphs for allocation
+    //
+
+    Pass::Ptr findSubGraphs();
+
+    //
+    // To overcome fp16 limitations
+    //
+
+    Pass::Ptr analyzeWeightableLayers();
+    Pass::Ptr estimateSingleNetworkScale();
+    Pass::Ptr propagateDataScale();
+
+    //
+    // Model common adaptation
+    //
+
+    Pass::Ptr splitGroupedConv();
+
+    //
+    // Model HW-specific optimizations
+    //
+
+    Pass::Ptr replaceFCbyConv();
+    Pass::Ptr replaceDeconvByConv();
+    Pass::Ptr swapConcatAndHwOps();
+    Pass::Ptr mergeHwStages();
+    Pass::Ptr splitHwDepthConv();
+    Pass::Ptr splitHwConvAndPool();
+    Pass::Ptr hwPadding();
+
+    //
+    // Batch support
+    //
+
+    Pass::Ptr adjustDataBatch();
+
+    //
+    // HW stages tiling
+    //
+
+    Pass::Ptr hwConvTiling();
+    Pass::Ptr hwPoolTiling();
+    Pass::Ptr hwFullyConnectedTiling();
+
+    //
+    // Model SW-specific adaptation
+    //
+
+    Pass::Ptr swConvAdaptation();
+    Pass::Ptr swDeconvAdaptation();
+    Pass::Ptr swPoolAdaptation();
+    Pass::Ptr swFullyConnectedAdaptation();
+
+    //
+    // Model SW-specific optimizations
+    //
+
+    Pass::Ptr mergeReLUAndBias();
+
+    //
+    // Data layout adjustment
+    //
+
+    Pass::Ptr adjustDataLayout();
+
+    //
+    // Model special stages processing
+    //
+
+    Pass::Ptr processSpecialStages();
+
+    //
+    // Data location adjustment
+    //
+
+    Pass::Ptr adjustDataLocation();
+
+    //
+    // Model common optimizations
+    //
+
+    Pass::Ptr eliminateCopyStages();
+
+    //
+    // HW/SW injection
+    //
+
+    Pass::Ptr injectSw();
+
+    //
+    // Final resource allocation
+    //
+
+    Pass::Ptr allocateResources();
+
+    //
+    // HW stages finalization
+    //
+
+    Pass::Ptr finalizeHwOps();
+
+    //
+    // Final check
+    //
+
+    Pass::Ptr finalCheck();
+
+    //
+    // Debug passes
+    //
+
+    Pass::Ptr dumpModel(const std::string& postfix);
+
+protected:
+    StageBuilder::Ptr _stageBuilder;
+    BackEnd::Ptr _backEnd;
+
+    int _dumpInd = 0;
+};
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/private_plugin_config.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/private_plugin_config.hpp
new file mode 100644 (file)
index 0000000..664ed21
--- /dev/null
@@ -0,0 +1,54 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <string>
+
+#include <vpu/vpu_plugin_config.hpp>
+
+namespace InferenceEngine {
+namespace VPUConfigParams {
+
+//
+// Main options
+//
+
+DECLARE_VPU_CONFIG_KEY(NUMBER_OF_SHAVES);
+DECLARE_VPU_CONFIG_KEY(NUMBER_OF_CMX_SLICES);
+
+DECLARE_VPU_CONFIG_KEY(HW_ADAPTIVE_MODE);
+
+DECLARE_VPU_CONFIG_KEY(WATCHDOG);
+
+DECLARE_VPU_CONFIG_KEY(PERF_REPORT_MODE);
+DECLARE_VPU_CONFIG_VALUE(PER_LAYER);
+DECLARE_VPU_CONFIG_VALUE(PER_STAGE);
+
+//
+// Optimizations
+//
+
+DECLARE_VPU_CONFIG_KEY(COPY_OPTIMIZATION);
+DECLARE_VPU_CONFIG_KEY(HW_INJECT_STAGES);
+DECLARE_VPU_CONFIG_KEY(HW_POOL_CONV_MERGE);
+DECLARE_VPU_CONFIG_KEY(PACK_DATA_IN_CMX);
+
+//
+// Debug options
+//
+
+DECLARE_VPU_CONFIG_KEY(DETECT_NETWORK_BATCH);
+
+DECLARE_VPU_CONFIG_KEY(ALLOW_FP32_MODELS);
+
+DECLARE_VPU_CONFIG_KEY(HW_WHITE_LIST);
+DECLARE_VPU_CONFIG_KEY(HW_BLACK_LIST);
+
+DECLARE_VPU_CONFIG_KEY(NONE_LAYERS);
+
+DECLARE_VPU_CONFIG_KEY(IGNORE_UNKNOWN_LAYERS);
+
+}  // namespace VPUConfigParams
+}  // namespace InferenceEngine
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/stub_stage.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/stub_stage.hpp
new file mode 100644 (file)
index 0000000..9b2a698
--- /dev/null
@@ -0,0 +1,34 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/stage.hpp>
+
+namespace vpu {
+
+class StubStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override;
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override;
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override;
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override;
+
+    void finalizeDataLayoutImpl() override;
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override;
+
+    void finalCheckImpl() const override;
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override;
+
+    void serializeDataImpl(BlobSerializer& serializer) const override;
+};
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/sw/post_op_stage.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/sw/post_op_stage.hpp
new file mode 100644 (file)
index 0000000..7b2f26f
--- /dev/null
@@ -0,0 +1,32 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/stage.hpp>
+
+namespace vpu {
+
+class PostOpStage : public StageNode {
+protected:
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override;
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override;
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override;
+
+    void finalizeDataLayoutImpl() override;
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override;
+
+    StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override;
+
+    void finalCheckImpl() const override;
+
+    void serializeDataImpl(BlobSerializer& serializer) const override;
+};
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/sw/utility.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/sw/utility.hpp
new file mode 100644 (file)
index 0000000..049affa
--- /dev/null
@@ -0,0 +1,105 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vector>
+#include <unordered_set>
+
+#include <ie_parallel.hpp>
+
+#include <vpu/model/data.hpp>
+#include <vpu/model/stage.hpp>
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+//
+// Relayout
+//
+
+template <typename T>
+void kchw_to_hwck(const T* src, T* dst, const DataDesc& desc) {
+    IE_ASSERT(desc.numDims() >= 3);
+
+    auto W = desc.dim(Dim::W);
+    auto H = desc.dim(Dim::H);
+    auto C = desc.dim(Dim::C);
+
+    ie::parallel_for3d(W, H, C, [=](int w, int h, int c) {
+        auto inInd  = w + W * h + W * H * c;
+        auto outInd = c + C * h + C * H * w;
+        dst[outInd] = src[inInd];
+    });
+}
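+
+// In kchw_to_hwck above the innermost source dimension (W) becomes the
+// outermost destination dimension and vice versa:
+//     src layout [C][H][W]: inInd  = w + W * h + W * H * c
+//     dst layout [W][H][C]: outInd = c + C * h + C * H * w
+// The remaining relayout helpers below follow the same pattern with the
+// dimensions permuted as their names suggest.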
+
+template <typename T>
+void kchw_to_khwc(const T* src, T* dst, const DataDesc& desc) {
+    IE_ASSERT(desc.numDims() >= 3);
+
+    auto W = desc.dim(Dim::W);
+    auto H = desc.dim(Dim::H);
+    auto C = desc.dim(Dim::C);
+
+    ie::parallel_for3d(W, H, C, [=](int w, int h, int c) {
+        auto inInd  = w + W * h + W * H * c;
+        auto outInd = h + H * w + H * W * c;
+        dst[outInd] = src[inInd];
+    });
+}
+
+template <typename T>
+void kchw_to_hwkc(const T* src, T* dst, const DataDesc& desc) {
+    IE_ASSERT(desc.numDims() >= 3);
+
+    auto W = desc.dim(Dim::W);
+    auto H = desc.dim(Dim::H);
+    auto C = desc.dim(Dim::C);
+
+    ie::parallel_for3d(W, H, C, [=](int w, int h, int c) {
+        auto inInd  = w + W * h + W * H * c;
+        auto outInd = h + H * c + C * H * w;
+        dst[outInd] = src[inInd];
+    });
+}
+
+template <typename T>
+void deconv_to_conv(const T* src, T* dst, const DataDesc& desc) {
+    IE_ASSERT(desc.numDims() >= 4);
+
+    auto KX = desc.dim(Dim::W);
+    auto KY = desc.dim(Dim::H);
+    auto IC = desc.dim(Dim::C);
+    auto OC = desc.dim(Dim::N);
+
+    ie::parallel_for4d(OC, IC, KY, KX, [=](int oc, int ic, int ky, int kx) {
+        auto inInd = kx + ky * KX + oc * KX * KY + ic * KX * KY * OC;
+        auto outInd = (KX - kx - 1) + (KY - ky - 1) * KX + ic * KX * KY + oc * KX * KY * IC;
+        dst[outInd] = src[inInd];
+    });
+}
+
+//
+// DefaultSwWeightsContent
+//
+
+class DefaultSwWeightsContent final : public CalculatedDataContent {
+public:
+    explicit DefaultSwWeightsContent(const DataContent::Ptr& origContent);
+
+protected:
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override;
+};
+
+//
+// getNextStage
+//
+
+Stage getNextStage(
+        const Stage& curStage,
+        const std::unordered_set<StageType, EnumClassHash>& supportedTypes);
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/any.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/any.hpp
new file mode 100644 (file)
index 0000000..368c817
--- /dev/null
@@ -0,0 +1,91 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <iosfwd>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include <vpu/utils/io.hpp>
+#include <vpu/utils/dot_io.hpp>
+
+namespace vpu {
+
+class Any final {
+    struct Holder {
+        using Ptr = std::unique_ptr<Holder>;
+
+        virtual ~Holder() = default;
+
+        virtual Holder::Ptr clone() const = 0;
+
+        virtual void printImpl(std::ostream& os) const = 0;
+        virtual void printImpl(DotLabel& lbl) const = 0;
+    };
+
+    template <typename T>
+    struct HolderImpl final : Holder {
+        T val;
+
+        template <typename U>
+        explicit HolderImpl(U&& val) : val(std::forward<U>(val)) {}
+
+        Holder::Ptr clone() const override { return Holder::Ptr(new HolderImpl(val)); }
+
+        void printImpl(std::ostream& os) const override { printTo(os, val); }
+        void printImpl(DotLabel& lbl) const override { printTo(lbl, val); }
+    };
+
+public:
+    Any() = default;
+    Any(Any&&) = default;
+    Any& operator=(Any&&) = default;
+
+    template <typename T>
+    explicit Any(T&& arg) : _impl(new HolderImpl<typename std::decay<T>::type>(std::forward<T>(arg))) {}
+
+    Any(const Any& other) : _impl(other._impl != nullptr ? other._impl->clone() : nullptr) {}
+
+    Any& operator=(const Any& other) {
+        Any temp(other);
+        swap(temp);
+        return *this;
+    }
+
+    void swap(Any& other) {
+        std::swap(_impl, other._impl);
+    }
+
+    template <typename T>
+    const T& cast() const {
+        auto casted = dynamic_cast<const HolderImpl<typename std::decay<T>::type>*>(_impl.get());
+        IE_ASSERT(casted != nullptr);
+        return casted->val;
+    }
+
+    template <typename T>
+    T& cast() {
+        auto casted = dynamic_cast<HolderImpl<typename std::decay<T>::type>*>(_impl.get());
+        IE_ASSERT(casted != nullptr);
+        return casted->val;
+    }
+
+    void printImpl(std::ostream& os) const {
+        if (_impl != nullptr)
+            _impl->printImpl(os);
+    }
+
+    void printImpl(DotLabel& lbl) const {
+        if (_impl != nullptr)
+            _impl->printImpl(lbl);
+    }
+
+private:
+    Holder::Ptr _impl;
+};
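+
+// Example (illustrative): Any is a copyable type-erased holder; values are
+// retrieved with an explicit cast to the exact decayed type:
+//
+//     Any a(42);
+//     int& v = a.cast<int>();   // OK
+//     a.cast<float>();          // would fail the IE_ASSERT inside cast()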
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/attributes_map.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/attributes_map.hpp
new file mode 100644 (file)
index 0000000..faedb93
--- /dev/null
@@ -0,0 +1,122 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <map>
+#include <string>
+
+#include <vpu/utils/any.hpp>
+#include <vpu/utils/io.hpp>
+#include <vpu/utils/dot_io.hpp>
+
+namespace vpu {
+
+//
+// AttributesMap
+//
+
+class AttributesMap final {
+public:
+    using BaseMap = std::map<std::string, Any>;
+
+    using iterator = BaseMap::iterator;
+    using const_iterator = BaseMap::const_iterator;
+
+    AttributesMap() = default;
+    AttributesMap(const AttributesMap&) = default;
+    AttributesMap(AttributesMap&&) = default;
+    ~AttributesMap() = default;
+    AttributesMap& operator=(const AttributesMap&) = default;
+    AttributesMap& operator=(AttributesMap&&) = default;
+
+    bool empty() const { return _tbl.empty(); }
+
+    bool has(const std::string& name) const { return _tbl.count(name) != 0; }
+
+    template <typename T>
+    void set(const std::string& name, const T& val) { _tbl[name] = Any(val); }
+
+    void erase(const std::string& name) { _tbl.erase(name); }
+
+    template <typename T>
+    const T& get(const std::string& name) const {
+        auto it = _tbl.find(name);
+        IE_ASSERT(it != _tbl.end());
+        return it->second.cast<T>();
+    }
+
+    template <typename T>
+    T& get(const std::string& name) {
+        auto it = _tbl.find(name);
+        IE_ASSERT(it != _tbl.end());
+        return it->second.cast<T>();
+    }
+
+    template <typename T>
+    const T& getOrDefault(const std::string& name, const T& def) const {
+        auto it = _tbl.find(name);
+        if (it != _tbl.end())
+            return it->second.cast<T>();
+        return def;
+    }
+
+    template <typename T>
+    T& getOrSet(const std::string& name, const T& def) {
+        auto it = _tbl.find(name);
+        if (it != _tbl.end())
+            return it->second.cast<T>();
+        set(name, def);
+        return get<T>(name);
+    }
+
+    iterator begin() { return _tbl.begin(); }
+    iterator end() { return _tbl.end(); }
+
+    const_iterator begin() const { return _tbl.begin(); }
+    const_iterator end() const { return _tbl.end(); }
+
+    const_iterator cbegin() const { return _tbl.cbegin(); }
+    const_iterator cend() const { return _tbl.cend(); }
+
+    void copyFrom(const AttributesMap& other) {
+        for (const auto& p : other._tbl) {
+            _tbl[p.first] = p.second;
+        }
+    }
+
+    void printImpl(std::ostream& os) const {
+        printTo(os, _tbl);
+    }
+
+    void printImpl(DotLabel& lbl) const {
+        printTo(lbl, _tbl);
+    }
+
+private:
+    BaseMap _tbl;
+};
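+
+// Example (illustrative):
+//
+//     AttributesMap attrs;
+//     attrs.set<int>("scale", 2);
+//     int s = attrs.get<int>("scale");            // 2
+//     int d = attrs.getOrDefault<int>("pad", 0);  // 0, "pad" was never set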
+
+//
+// EnableCustomAttributes
+//
+
+class EnableCustomAttributes {
+public:
+    const AttributesMap& attrs() const { return _attrs; }
+    AttributesMap& attrs() { return _attrs; }
+
+protected:
+    EnableCustomAttributes() = default;
+    EnableCustomAttributes(const EnableCustomAttributes&) = default;
+    EnableCustomAttributes(EnableCustomAttributes&&) = default;
+    ~EnableCustomAttributes() = default;
+    EnableCustomAttributes& operator=(const EnableCustomAttributes&) = default;
+    EnableCustomAttributes& operator=(EnableCustomAttributes&&) = default;
+
+private:
+    AttributesMap _attrs;
+};
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/auto_scope.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/auto_scope.hpp
new file mode 100644 (file)
index 0000000..99ebe95
--- /dev/null
@@ -0,0 +1,41 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <functional>
+
+#include <details/ie_exception.hpp>
+
+namespace vpu {
+
+class AutoScope final {
+public:
+    explicit AutoScope(const std::function<void()>& func) : _func(func) {}
+
+    ~AutoScope() {
+        if (_func != nullptr) {
+            _func();
+        }
+    }
+
+    void callAndRelease() {
+        if (_func != nullptr) {
+            _func();
+            _func = nullptr;
+        }
+    }
+
+    void release() {
+        _func = nullptr;
+    }
+
+    AutoScope(const AutoScope& other) = delete;
+    AutoScope& operator=(const AutoScope&) = delete;
+
+private:
+    std::function<void()> _func;
+};
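+
+// Example (illustrative): AutoScope is a small scope guard; the callback
+// runs on destruction unless release() is called first:
+//
+//     FILE* f = std::fopen(path, "rb");
+//     AutoScope fileGuard([f]() { std::fclose(f); });
+//     // ... early returns and exceptions still close the file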
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/checked_cast.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/checked_cast.hpp
new file mode 100644 (file)
index 0000000..ccd444c
--- /dev/null
@@ -0,0 +1,84 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#ifdef __INTEL_COMPILER
+#pragma warning disable: 54
+#endif
+
+#include <type_traits>
+#include <limits>
+
+#include <details/ie_exception.hpp>
+
+namespace vpu {
+
+template <typename I, typename J>
+typename std::enable_if<
+        std::is_same<I, J>::value,
+    I>::type checked_cast(J value) {
+    return value;
+}
+
+template <typename I, typename J>
+typename std::enable_if<
+        std::is_integral<I>::value && std::is_integral<J>::value &&
+        std::is_signed<I>::value && std::is_signed<J>::value &&
+        !std::is_same<I, J>::value,
+    I>::type checked_cast(J value) {
+    IE_ASSERT(value >= std::numeric_limits<I>::lowest()) << value;
+    IE_ASSERT(value <= std::numeric_limits<I>::max()) << value;
+    return static_cast<I>(value);
+}
+
+template <typename I, typename J>
+typename std::enable_if<
+        std::is_integral<I>::value && std::is_integral<J>::value &&
+        std::is_signed<I>::value && std::is_unsigned<J>::value,
+    I>::type checked_cast(J value) {
+    IE_ASSERT(value <= static_cast<typename std::make_unsigned<I>::type>(std::numeric_limits<I>::max())) << value;
+    return static_cast<I>(value);
+}
+
+template <typename I, typename J>
+typename std::enable_if<
+        std::is_integral<I>::value && std::is_integral<J>::value &&
+        std::is_unsigned<I>::value && std::is_signed<J>::value,
+    I>::type checked_cast(J value) {
+    IE_ASSERT(value >= 0) << value;
+    // coverity[result_independent_of_operands]
+    IE_ASSERT(static_cast<typename std::make_unsigned<J>::type>(value) <= std::numeric_limits<I>::max()) << value;
+    return static_cast<I>(value);
+}
+
+template <typename I, typename J>
+typename std::enable_if<
+        std::is_integral<I>::value && std::is_integral<J>::value &&
+        std::is_unsigned<I>::value && std::is_unsigned<J>::value &&
+        !std::is_same<I, J>::value,
+    I>::type checked_cast(J value) {
+    // coverity[result_independent_of_operands]
+    IE_ASSERT(value <= std::numeric_limits<I>::max()) << value;
+    return static_cast<I>(value);
+}
+
+template <typename I, typename J>
+typename std::enable_if<
+        std::is_integral<I>::value && std::is_floating_point<J>::value,
+    I>::type checked_cast(J value) {
+    IE_ASSERT(value <= static_cast<J>(std::numeric_limits<I>::max())) << value;
+    IE_ASSERT(value >= static_cast<J>(std::numeric_limits<I>::lowest())) << value;
+    return static_cast<I>(value);
+}
+
+template <typename I, typename J>
+typename std::enable_if<
+        std::is_same<float, I>::value && std::is_same<double, J>::value,
+    I>::type checked_cast(J value) {
+    IE_ASSERT(static_cast<double>(static_cast<float>(value)) == value) << value;
+    return static_cast<I>(value);
+}
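+
+// Example (illustrative): checked_cast narrows only when the value fits the
+// destination type, otherwise the corresponding IE_ASSERT fires:
+//
+//     auto a = checked_cast<uint8_t>(200);   // OK
+//     auto b = checked_cast<int8_t>(200);    // asserts: 200 > 127
+//     auto c = checked_cast<uint32_t>(-1);   // asserts: negative source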
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/containers.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/containers.hpp
new file mode 100644 (file)
index 0000000..7c738ce
--- /dev/null
@@ -0,0 +1,915 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+
+#include <limits>
+#include <utility>
+#include <algorithm>
+#include <type_traits>
+#include <functional>
+#include <iterator>
+#include <vector>
+#include <list>
+#include <set>
+#include <unordered_set>
+#include <map>
+#include <unordered_map>
+#include <memory>
+#include <array>
+
+#include <vpu/utils/numeric.hpp>
+#include <vpu/utils/handle.hpp>
+
+namespace vpu {
+
+//
+// Small containers
+//
+
+namespace impl {
+
+class IntBufferBase {
+public:
+    virtual ~IntBufferBase() = default;
+
+    virtual void* getData() const = 0;
+    virtual int* getAvailable() const = 0;
+};
+
+template <typename T, size_t ExpandBytes, int Capacity>
+class SmallBufAllocator {
+    static_assert(Capacity > 0, "Capacity > 0");
+
+public:
+    struct ExpandedData final {
+        static constexpr const size_t FINAL_BYTE_SIZE = alignVal<alignof(size_t)>(sizeof(T) + ExpandBytes);
+
+        std::array<uint8_t, FINAL_BYTE_SIZE> data = {};
+
+        ExpandedData() = default;
+
+        ExpandedData(const ExpandedData&) = delete;
+        ExpandedData& operator=(const ExpandedData&) = delete;
+
+        ExpandedData(ExpandedData&&) = delete;
+        ExpandedData& operator=(ExpandedData&&) = delete;
+    };
+
+    class IntBuffer final : public IntBufferBase {
+    public:
+        IntBuffer() {
+            clear();
+        }
+
+        IntBuffer(const IntBuffer&) = delete;
+        IntBuffer& operator=(const IntBuffer&) = delete;
+
+        IntBuffer(IntBuffer&&) = delete;
+        IntBuffer& operator=(IntBuffer&&) = delete;
+
+        void clear() {
+            for (int i = 0; i < Capacity; ++i) {
+                _available[i] = Capacity - i;
+            }
+            _available[Capacity] = 0;
+        }
+
+        void* getData() const override {
+            return _data.data();
+        }
+        int* getAvailable() const override {
+            return _available.data();
+        }
+
+    private:
+        mutable std::array<ExpandedData, Capacity> _data = {};
+        mutable std::array<int, Capacity + 1> _available = {};
+    };
+
+public:
+    using value_type = T;
+
+    using pointer = T*;
+    using const_pointer = const T*;
+    using reference = T&;
+    using const_reference = const T&;
+
+    using size_type = std::size_t;
+    using difference_type = std::ptrdiff_t;
+
+    using propagate_on_container_copy_assignment = std::false_type;
+    using propagate_on_container_move_assignment = std::false_type;
+    using propagate_on_container_swap = std::false_type;
+
+    template <typename T2> struct rebind {
+        static_assert(sizeof(ExpandedData) >= sizeof(T2), "sizeof(ExpandedData) >= sizeof(T2)");
+
+        typedef SmallBufAllocator<T2, sizeof(ExpandedData) - sizeof(T2), Capacity> other;
+    };
+
+    SmallBufAllocator() noexcept = delete;
+
+    SmallBufAllocator(const SmallBufAllocator&) noexcept = default;
+    SmallBufAllocator& operator=(const SmallBufAllocator&) noexcept = default;
+
+    SmallBufAllocator(SmallBufAllocator&& other) noexcept : _intBuf(other._intBuf) {
+        other._intBuf = nullptr;
+    }
+    SmallBufAllocator& operator=(SmallBufAllocator&& other) noexcept {
+        if (&other != this) {
+            _intBuf = other._intBuf;
+            other._intBuf = nullptr;
+        }
+        return *this;
+    }
+
+    explicit SmallBufAllocator(IntBuffer& intBuf) noexcept : _intBuf(&intBuf) {}
+
+    template <typename T2, size_t ExpandBytes2, int Capacity2>
+    SmallBufAllocator(const SmallBufAllocator<T2, ExpandBytes2, Capacity2>& other) noexcept : _intBuf(other._intBuf) {
+        static_assert(sizeof(ExpandedData) == sizeof(typename SmallBufAllocator<T2, ExpandBytes2, Capacity2>::ExpandedData),
+                      "sizeof(ExpandedData) == sizeof(typename SmallBufAllocator<T2, ExpandBytes2, Capacity2>::ExpandedData)");
+        static_assert(Capacity <= Capacity2, "Capacity <= Capacity2");
+    }
+
+    T* allocate(std::size_t n) {
+        assert(_intBuf != nullptr);
+
+        auto data = static_cast<ExpandedData*>(_intBuf->getData());
+        auto available = _intBuf->getAvailable();
+
+        if (n <= Capacity) {
+            int pos = -1;
+            int minAvailable = std::numeric_limits<int>::max();
+            for (int i = 0; i < Capacity; ++i) {
+                if (available[i] >= static_cast<int>(n) && available[i] < minAvailable) {
+                    pos = i;
+                    minAvailable = available[i];
+                }
+            }
+
+            if (pos >= 0) {
+                for (int i = pos - 1; (i >= 0) && available[i] > 0; --i) {
+                    assert(available[i] > available[pos]);
+                    available[i] -= available[pos];
+                }
+
+                std::fill_n(available + pos, n, 0);
+
+                return reinterpret_cast<T*>(data + pos);
+            }
+        }
+
+        return static_cast<T*>(::operator new (n * sizeof(T)));
+    }
+
+    void deallocate(T* ptr, std::size_t n) noexcept {
+        assert(_intBuf != nullptr);
+
+        auto data = static_cast<ExpandedData*>(_intBuf->getData());
+        auto available = _intBuf->getAvailable();
+
+        auto tempPtr = reinterpret_cast<ExpandedData*>(ptr);
+
+        if (tempPtr < data || tempPtr >= data + Capacity) {
+            ::operator delete(tempPtr);
+        } else {
+            auto pos = static_cast<int>(tempPtr - data);
+
+            for (int i = static_cast<int>(static_cast<std::size_t>(pos) + n - 1); i >= pos; --i) {
+                assert(available[i] == 0);
+                available[i] = available[i + 1] + 1;
+            }
+            for (int i = pos; (i >= 0) && available[i] > 0; --i) {
+                available[i] += available[i + 1];
+            }
+        }
+    }
+
+    T* allocate(std::size_t n, const void*) noexcept {
+        return allocate(n);
+    }
+
+    template <class U, class ...Args>
+    void construct(U* p, Args&& ...args) {
+        ::new(p) U(std::forward<Args>(args)...);
+    }
+
+    template <class U>
+    void destroy(U* p) noexcept {
+        p->~U();
+    }
+
+    std::size_t max_size() const noexcept {
+        return std::numeric_limits<std::size_t>::max() / sizeof(T);
+    }
+
+    const IntBufferBase* intBuf() const { return _intBuf; }
+
+private:
+    template <typename T2, size_t ExpandBytes2, int Capacity2>
+    friend class SmallBufAllocator;
+
+    const IntBufferBase* _intBuf = nullptr;
+};
+
+template <typename T1, size_t ExpandBytes1, int Capacity1, typename T2, size_t ExpandBytes2, int Capacity2>
+bool operator==(const SmallBufAllocator<T1, ExpandBytes1, Capacity1>& a1, const SmallBufAllocator<T2, ExpandBytes2, Capacity2>& a2) noexcept {
+    return a1.intBuf() == a2.intBuf();
+}
+template <typename T1, size_t ExpandBytes1, int Capacity1, typename T2, size_t ExpandBytes2, int Capacity2>
+bool operator!=(const SmallBufAllocator<T1, ExpandBytes1, Capacity1>& a1, const SmallBufAllocator<T2, ExpandBytes2, Capacity2>& a2) noexcept {
+    return a1.intBuf() != a2.intBuf();
+}
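+
+// Usage sketch (illustrative only): the allocator carves slots out of an
+// externally owned IntBuffer and falls back to ::operator new once the
+// in-place capacity is exhausted.
+//
+//     using A = SmallBufAllocator<int, 0, 8>;
+//     A::IntBuffer buf;
+//     A alloc(buf);
+//     std::vector<int, A> v(alloc);   // element storage starts inside `buf`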
+
+}  // namespace impl
+
+template <typename T, int Capacity>
+class SmallVector {
+#if defined(_WIN32)
+    static constexpr const size_t ExpandBytes = 8;
+#else
+    static constexpr const size_t ExpandBytes = 0;
+#endif
+
+    using Alloc = impl::SmallBufAllocator<T, ExpandBytes, Capacity>;
+    using BaseCont = std::vector<T, Alloc>;
+
+public:
+    using value_type = typename BaseCont::value_type;
+
+    using iterator = typename BaseCont::iterator;
+    using const_iterator = typename BaseCont::const_iterator;
+
+    SmallVector() : _base(Alloc(_intBuf)) {
+        _base.reserve(Capacity);
+    }
+
+    ~SmallVector() = default;
+
+    explicit SmallVector(std::size_t count) : _base(count, Alloc(_intBuf)) {}
+    SmallVector(std::size_t count, const T& value) : _base(count, value, Alloc(_intBuf)) {}
+    SmallVector(std::initializer_list<T> init) : _base(init, Alloc(_intBuf)) {}
+
+    template <class InputIt>
+    SmallVector(InputIt first, InputIt last) : _base(first, last, Alloc(_intBuf)) {}
+
+    SmallVector(const SmallVector& other) :
+            _base(other._base, Alloc(_intBuf)) {
+    }
+    SmallVector& operator=(const SmallVector& other) {
+        if (&other != this) {
+            _base = other._base;
+        }
+        return *this;
+    }
+
+    template <typename T2, int Capacity2>
+    SmallVector(const SmallVector<T2, Capacity2>& other) :  // NOLINT
+            _base(other._base.begin(), other._base.end(), Alloc(_intBuf)) {
+    }
+    template <typename T2, int Capacity2>
+    SmallVector& operator=(const SmallVector<T2, Capacity2>& other) {
+        if (&other != this) {
+            _base.assign(other._base.begin(), other._base.end());
+        }
+        return *this;
+    }
+
+    template <class Alloc2>
+    SmallVector(const std::vector<T, Alloc2>& other) :  // NOLINT
+            _base(other.begin(), other.end(), Alloc(_intBuf)) {
+    }
+    template <class Alloc2>
+    SmallVector& operator=(const std::vector<T, Alloc2>& other) {
+        if (&other != this) {
+            _base.assign(other.begin(), other.end());
+        }
+        return *this;
+    }
+
+    operator const BaseCont&() {
+        return _base;
+    }
+    template <class Alloc2>
+    operator std::vector<T, Alloc2>() {
+        return std::vector<T, Alloc2>(_base.begin(), _base.end());
+    }
+
+    T& operator[](std::size_t pos) { return _base[pos]; }
+    const T& operator[](std::size_t pos) const { return _base[pos]; }
+
+    T& at(std::size_t pos) { return _base.at(pos); }
+    const T& at(std::size_t pos) const { return _base.at(pos); }
+
+    T& front() { return _base.front(); }
+    const T& front() const { return _base.front(); }
+    T& back() { return _base.back(); }
+    const T& back() const { return _base.back(); }
+
+    T* data() noexcept { return _base.data(); }
+    const T* data() const noexcept { return _base.data(); }
+
+    iterator begin() noexcept { return _base.begin(); }
+    iterator end() noexcept { return _base.end(); }
+    const_iterator begin() const noexcept { return _base.begin(); }
+    const_iterator end() const noexcept { return _base.end(); }
+    const_iterator cbegin() const noexcept { return _base.cbegin(); }
+    const_iterator cend() const noexcept { return _base.cend(); }
+
+    bool empty() const noexcept { return _base.empty(); }
+    std::size_t size() const noexcept { return _base.size(); }
+
+    void reserve(std::size_t cap) { _base.reserve(cap); }
+
+    void clear() noexcept { _base.clear(); }
+
+    void resize(std::size_t count) { _base.resize(count); }
+    void resize(std::size_t count, const T& value) { _base.resize(count, value); }
+
+    void push_back(const T& value) { _base.push_back(value); }
+    void push_back(T&& value) { _base.push_back(std::move(value)); }
+
+    template <class... Args>
+    void emplace_back(Args&&... args) { _base.emplace_back(std::forward<Args>(args)...); }
+
+    void insert(iterator pos, const T& value) { _base.insert(pos, value); }
+    void insert(iterator pos, T&& value) { _base.insert(pos, std::move(value)); }
+    void insert(iterator pos, std::size_t count, const T& value) { _base.insert(pos, count, value); }
+    template <class InputIt>
+    void insert(iterator pos, InputIt first, InputIt last) { _base.insert(pos, first, last); }
+    void insert(iterator pos, std::initializer_list<T> ilist) { _base.insert(pos, ilist); }
+
+    template <class... Args>
+    iterator emplace(iterator pos, Args&&... args) { return _base.emplace(pos, std::forward<Args>(args)...); }
+
+    void pop_back() { _base.pop_back(); }
+
+    iterator erase(iterator pos) { return _base.erase(pos); }
+    iterator erase(iterator first, iterator last) { return _base.erase(first, last); }
+
+    void swap(SmallVector& other) { std::swap(*this, other); }
+
+    bool operator==(const SmallVector& other) const { return _base == other._base; }
+    bool operator!=(const SmallVector& other) const { return _base != other._base; }
+    bool operator<(const SmallVector& other) const { return _base < other._base; }
+    bool operator<=(const SmallVector& other) const { return _base <= other._base; }
+    bool operator>(const SmallVector& other) const { return _base > other._base; }
+    bool operator>=(const SmallVector& other) const { return _base >= other._base; }
+
+private:
+    template <typename T2, int Capacity2>
+    friend class SmallVector;
+
+    typename Alloc::IntBuffer _intBuf;
+    BaseCont _base;
+};
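+
+// Usage sketch (illustrative only): SmallVector mirrors the std::vector
+// interface but keeps up to Capacity elements in an in-object buffer, so
+// short small vectors avoid heap allocation.
+//
+//     SmallVector<int, 4> dims;
+//     dims.push_back(1);                  // stored in the internal buffer
+//     dims.insert(dims.end(), {2, 3, 4});
+//     dims.push_back(5);                  // beyond Capacity, spills to the heap
+//     std::vector<int> plain = dims;      // converts by copying the elements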
+
+//
+// IntrusivePtrList
+//
+
+template <class Base>
+class IntrusivePtrListNode;
+
+template <class Base>
+class IntrusivePtrList;
+
+template <class Base>
+class IntrusivePtrListNode final {
+public:
+    inline explicit IntrusivePtrListNode(Base* owner) :
+            _owner(owner) {
+        assert(_owner != nullptr);
+    }
+
+    ~IntrusivePtrListNode();
+
+    IntrusivePtrListNode(const IntrusivePtrListNode&) = delete;
+    IntrusivePtrListNode& operator=(const IntrusivePtrListNode&) = delete;
+
+private:
+    inline Handle<Base> owner() const {
+        return _owner->handle_from_this();
+    }
+
+    inline bool belongTo(const IntrusivePtrList<Base>* list) const {
+        return _list == list;
+    }
+    inline void setList(IntrusivePtrList<Base>* list) {
+        assert(_list == nullptr);
+        _list = list;
+    }
+
+    inline bool hasIter(const typename IntrusivePtrList<Base>::Iterator* iter) const {
+        return _iter == iter;
+    }
+    inline void setIter(typename IntrusivePtrList<Base>::Iterator* iter) {
+        _iter = iter;
+    }
+
+    inline IntrusivePtrListNode* prevNode() const {
+        return _prev;
+    }
+    inline IntrusivePtrListNode* nextNode() const {
+        return _next;
+    }
+
+    void unlink();
+    void linkBefore(IntrusivePtrListNode& nextNode);
+    void linkAfter(IntrusivePtrListNode& prevNode);
+    void updateFront(IntrusivePtrListNode& frontNode);
+
+private:
+    Base* _owner = nullptr;
+
+    IntrusivePtrList<Base>* _list = nullptr;
+    typename IntrusivePtrList<Base>::Iterator* _iter = nullptr;
+
+    IntrusivePtrListNode* _prev = nullptr;
+    IntrusivePtrListNode* _next = nullptr;
+
+    friend class IntrusivePtrList<Base>;
+};
+
+template <class Base>
+class IntrusivePtrList final {
+public:
+    class Iterator final {
+    public:
+        using iterator_category = std::input_iterator_tag;
+        using value_type = Handle<Base>;
+        using difference_type = std::ptrdiff_t;
+        using pointer = const Handle<Base>*;
+        using reference = const Handle<Base>&;
+
+        inline Iterator() = default;
+
+        inline Iterator(const Iterator&) = delete;
+        inline Iterator& operator=(const Iterator&) = delete;
+
+        inline explicit Iterator(IntrusivePtrListNode<Base> Base::* nodeField) :
+                _nodeField(nodeField) {
+            assert(_nodeField != nullptr);
+        }
+
+        Iterator(Iterator&& other);
+        Iterator& operator=(Iterator&& other);
+
+        Iterator(
+                const std::shared_ptr<Base>& cur,
+                IntrusivePtrListNode<Base> Base::* nodeField);
+
+        ~Iterator();
+
+        inline Handle<Base> operator*() const {
+            return Handle<Base>(_cur);
+        }
+
+        inline Iterator& operator++() {
+            if (!_skipNextAdvanced) {
+                advance();
+            }
+            _skipNextAdvanced = false;
+            return *this;
+        }
+
+        inline bool operator==(const Iterator& other) const { return _cur == other._cur; }
+        inline bool operator!=(const Iterator& other) const { return _cur != other._cur; }
+
+    private:
+        inline void itemUnlinked() {
+            advance();
+            _skipNextAdvanced = true;
+        }
+
+        void advance();
+
+    private:
+        IntrusivePtrListNode<Base> Base::* _nodeField = nullptr;
+        std::shared_ptr<Base> _cur;
+        bool _skipNextAdvanced = false;
+
+        friend class IntrusivePtrList;
+        friend class IntrusivePtrListNode<Base>;
+    };
+
+    using value_type = Handle<Base>;
+    using iterator = Iterator;
+    using const_iterator = Iterator;
+
+    inline explicit IntrusivePtrList(IntrusivePtrListNode<Base> Base::* nodeField) :
+            _nodeField(nodeField) {
+        assert(_nodeField != nullptr);
+    }
+
+    IntrusivePtrList(const IntrusivePtrList&) = delete;
+    IntrusivePtrList& operator=(const IntrusivePtrList&) = delete;
+
+    inline ~IntrusivePtrList() {
+        try {
+            clear();
+        }
+        catch (...) {
+            std::cerr << "ERROR ~IntrusivePtrList(): cannot clear data\n";
+            std::abort();
+        }
+    }
+
+    inline Iterator begin() const { return Iterator(_front.lock(), _nodeField); }
+    inline Iterator end() const { return Iterator(_nodeField); }
+
+    inline Iterator cbegin() const { return Iterator(_front.lock(), _nodeField); }
+    inline Iterator cend() const { return Iterator(_nodeField); }
+
+    inline size_t size() const { return _size; }
+    inline bool empty() const { return _front == nullptr; }
+
+    inline void clear() {
+        while (!empty()) {
+            pop_front();
+        }
+    }
+
+    inline Handle<Base> front() const { return _front; }
+    inline Handle<Base> back() const { return _back; }
+
+    void push_back(const Handle<Base>& item);
+    void push_back(const std::shared_ptr<Base>& item);
+
+    void push_front(const Handle<Base>& item);
+    void push_front(const std::shared_ptr<Base>& item);
+
+    inline void erase(const Handle<Base>& item) {
+        erase(item.get());
+    }
+    inline void erase(const std::shared_ptr<Base>& item) {
+        erase(item.get());
+    }
+    inline void erase(const Iterator& it) {
+        erase(it._cur.get());
+    }
+
+    inline void pop_front() {
+        erase(_front);
+    }
+    inline void pop_back() {
+        erase(_back);
+    }
+
+    inline bool has(const Handle<Base>& item) const {
+        assert(!item.expired());
+
+        const auto& itemNode = item.getPlain()->*_nodeField;
+        return itemNode.belongTo(this);
+    }
+    inline bool has(const std::shared_ptr<Base>& item) const {
+        const auto& itemNode = item.get()->*_nodeField;
+        return itemNode.belongTo(this);
+    }
+
+private:
+    void erase(Base* item);
+
+private:
+    IntrusivePtrListNode<Base> Base::* _nodeField = nullptr;
+
+    Handle<Base> _front;
+    Handle<Base> _back;
+
+    size_t _size = 0;
+
+    friend class IntrusivePtrListNode<Base>;
+};
+
+//
+// Implementation
+//
+
+template <class Base>
+inline IntrusivePtrListNode<Base>::~IntrusivePtrListNode() {
+    try {
+        if (_list != nullptr) {
+            _list->erase(_owner);
+            _list = nullptr;
+        }
+    }
+    catch (...) {
+        std::cerr << "ERROR ~IntrusivePtrListNode(): cannot clear data\n";
+        std::abort();
+    }
+}
+
+template <class Base>
+void IntrusivePtrListNode<Base>::unlink() {
+    assert(_list != nullptr);
+
+    if (_iter != nullptr) {
+        _iter->itemUnlinked();
+    }
+
+    if (_prev != nullptr) {
+        if (_prev->_next == this) {
+            _prev->_next = _next;
+        }
+    }
+
+    if (_next != nullptr) {
+        if (_next->_prev == this) {
+            _next->_prev = _prev;
+        }
+    }
+
+    _list = nullptr;
+    _iter = nullptr;
+    _prev = nullptr;
+    _next = nullptr;
+}
+
+template <class Base>
+void IntrusivePtrListNode<Base>::linkBefore(IntrusivePtrListNode& nextNode) {
+    assert(&nextNode != this);
+    assert(_list == nullptr);
+    assert(nextNode._list != nullptr);
+
+    _prev = nextNode._prev;
+    _next = &nextNode;
+    nextNode._prev = this;
+    if (_prev != nullptr) {
+        _prev->_next = this;
+    }
+
+    _list = nextNode._list;
+}
+
+template <class Base>
+void IntrusivePtrListNode<Base>::linkAfter(IntrusivePtrListNode& prevNode) {
+    assert(&prevNode != this);
+    assert(_list == nullptr);
+    assert(prevNode._list != nullptr);
+
+    _prev = &prevNode;
+    _next = prevNode._next;
+    prevNode._next = this;
+    if (_next != nullptr) {
+        _next->_prev = this;
+    }
+
+    _list = prevNode._list;
+}
+
+template <class Base>
+void IntrusivePtrListNode<Base>::updateFront(IntrusivePtrListNode& frontNode) {
+    assert(&frontNode != this);
+    assert(_list != nullptr);
+    assert(frontNode._list == _list);
+
+    _prev = &frontNode;
+    frontNode._next = this;
+}
+
+template <class Base>
+inline IntrusivePtrList<Base>::Iterator::Iterator(Iterator&& other) {
+    _nodeField = other._nodeField;
+    _cur = std::move(other._cur);
+    _skipNextAdvanced = other._skipNextAdvanced;
+
+    if (_cur != nullptr) {
+        assert(_nodeField != nullptr);
+
+        auto& curNode = _cur.get()->*_nodeField;
+
+        assert(curNode.hasIter(&other));
+        curNode.setIter(this);
+    }
+
+    other._nodeField = nullptr;
+    other._skipNextAdvanced = false;
+}
+
+template <class Base>
+inline typename IntrusivePtrList<Base>::Iterator& IntrusivePtrList<Base>::Iterator::operator=(Iterator&& other) {
+    if (this != &other) {
+        if (_cur != nullptr) {
+            auto& curNode = _cur.get()->*_nodeField;
+
+            assert(curNode.hasIter(this));
+            curNode.setIter(nullptr);
+        }
+
+        _nodeField = other._nodeField;
+        _cur = std::move(other._cur);
+        _skipNextAdvanced = other._skipNextAdvanced;
+
+        if (_cur != nullptr) {
+            assert(_nodeField != nullptr);
+
+            auto& curNode = _cur.get()->*_nodeField;
+
+            assert(curNode.hasIter(&other));
+            curNode.setIter(this);
+        }
+
+        other._nodeField = nullptr;
+        other._skipNextAdvanced = false;
+    }
+    return *this;
+}
+
+template <class Base>
+inline IntrusivePtrList<Base>::Iterator::Iterator(
+        const std::shared_ptr<Base>& cur,
+        IntrusivePtrListNode<Base> Base::* nodeField) :
+            _nodeField(nodeField),
+            _cur(cur) {
+    assert(_nodeField != nullptr);
+    if (_cur != nullptr) {
+        auto& curNode = _cur.get()->*_nodeField;
+
+        assert(curNode.hasIter(nullptr));
+        curNode.setIter(this);
+    }
+}
+
+template <class Base>
+inline IntrusivePtrList<Base>::Iterator::~Iterator() {
+    if (_cur != nullptr) {
+        auto& curNode = _cur.get()->*_nodeField;
+
+        assert(curNode.hasIter(this));
+        curNode.setIter(nullptr);
+    }
+}
+
+template <class Base>
+void IntrusivePtrList<Base>::Iterator::advance() {
+    assert(_cur != nullptr);
+
+    auto& curNode = _cur.get()->*_nodeField;
+    assert(curNode.hasIter(this));
+
+    curNode.setIter(nullptr);
+
+    auto next = curNode.nextNode();
+    if (next == nullptr) {
+        _cur.reset();
+    } else {
+        auto nextOwner = next->owner();
+        assert(!nextOwner.expired());
+
+        auto& nextNode = nextOwner.get()->*_nodeField;
+        assert(nextNode.hasIter(nullptr));
+
+        nextNode.setIter(this);
+
+        _cur = nextOwner.lock();
+    }
+}
+
+template <class Base>
+void IntrusivePtrList<Base>::push_back(const Handle<Base>& item) {
+    IE_ASSERT(!item.expired());
+
+    auto& itemNode = item.getPlain()->*_nodeField;
+
+    if (_back == nullptr) {
+        assert(_front == nullptr);
+
+        _front = _back = item;
+        itemNode.setList(this);
+    } else {
+        assert(_front != nullptr);
+
+        auto& backNode = _back.get()->*_nodeField;
+        itemNode.linkAfter(backNode);
+
+        if (_front == _back) {
+            itemNode.updateFront(backNode);
+        }
+
+        _back = item;
+    }
+
+    ++_size;
+}
+
+template <class Base>
+void IntrusivePtrList<Base>::push_back(const std::shared_ptr<Base>& item) {
+    auto& itemNode = item.get()->*_nodeField;
+
+    if (_back == nullptr) {
+        assert(_front == nullptr);
+
+        _front = _back = item;
+        itemNode.setList(this);
+    } else {
+        assert(_front != nullptr);
+
+        auto& backNode = _back.get()->*_nodeField;
+        itemNode.linkAfter(backNode);
+
+        if (_front == _back) {
+            itemNode.updateFront(backNode);
+        }
+
+        _back = item;
+    }
+
+    ++_size;
+}
+
+template <class Base>
+void IntrusivePtrList<Base>::push_front(const Handle<Base>& item) {
+    IE_ASSERT(!item.expired());
+
+    auto& itemNode = item.getPlain()->*_nodeField;
+
+    if (_front == nullptr) {
+        assert(_back == nullptr);
+
+        _front = _back = item;
+        itemNode.setList(this);
+    } else {
+        assert(_back != nullptr);
+
+        auto& frontNode = _front.get()->*_nodeField;
+        itemNode.linkBefore(frontNode);
+
+        _front = item;
+    }
+
+    ++_size;
+}
+
+template <class Base>
+void IntrusivePtrList<Base>::push_front(const std::shared_ptr<Base>& item) {
+    auto& itemNode = item.get()->*_nodeField;
+
+    if (_front == nullptr) {
+        assert(_back == nullptr);
+
+        _front = _back = item;
+        itemNode.setList(this);
+    } else {
+        assert(_back != nullptr);
+
+        auto& frontNode = _front.get()->*_nodeField;
+        itemNode.linkBefore(frontNode);
+
+        _front = item;
+    }
+
+    ++_size;
+}
+
+template <class Base>
+void IntrusivePtrList<Base>::erase(Base* item) {
+    assert(item != nullptr);
+    if (item == nullptr) {
+        return;
+    }
+    assert(_size > 0);
+
+    auto& itemNode = item->*_nodeField;
+    assert(itemNode.belongTo(this));
+
+    if (_front.getPlain() == item) {
+        auto next = itemNode.nextNode();
+
+        if (next == nullptr) {
+            _front = nullptr;
+        } else {
+            assert(next->belongTo(this));
+
+            auto nextOwner = next->owner();
+            assert(!nextOwner.expired());
+
+            _front = nextOwner;
+        }
+    }
+    if (_back.getPlain() == item) {
+        auto prev = itemNode.prevNode();
+
+        if (prev == nullptr) {
+            _back = nullptr;
+        } else {
+            assert(prev->belongTo(this));
+
+            auto prevOwner = prev->owner();
+            assert(!prevOwner.expired());
+
+            _back = prevOwner;
+        }
+    }
+
+    itemNode.unlink();
+
+    --_size;
+}
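+
+// Usage sketch (illustrative only; `Stage` is a hypothetical element type):
+// each element embeds its own list node, so linking and unlinking never
+// allocates, and erasing the current element during iteration is safe
+// because the node notifies the active iterator.
+//
+//     struct Stage final : EnableHandleFromThis<Stage> {
+//         IntrusivePtrListNode<Stage> poolNode;
+//         Stage() : poolNode(this) {}
+//     };
+//
+//     IntrusivePtrList<Stage> pool(&Stage::poolNode);
+//     auto stage = std::make_shared<Stage>();
+//     pool.push_back(stage);
+//     for (const auto& h : pool) {
+//         // h is a non-owning Handle<Stage>
+//     }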
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/dot_io.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/dot_io.hpp
new file mode 100644 (file)
index 0000000..33e1be3
--- /dev/null
@@ -0,0 +1,234 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <iosfwd>
+#include <string>
+#include <vector>
+#include <set>
+#include <map>
+#include <unordered_set>
+#include <unordered_map>
+#include <utility>
+
+#include <ie_data.h>
+#include <ie_blob.h>
+#include <ie_layers.h>
+
+#include <details/ie_exception.hpp>
+
+#include <vpu/utils/extra.hpp>
+#include <vpu/utils/io.hpp>
+#include <vpu/utils/containers.hpp>
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+//
+// DotSerializer
+//
+
+class DotSerializer final {
+public:
+    class Ident final {
+    public:
+        explicit Ident(DotSerializer& out) : _out(out) {
+            ++_out._ident;
+        }
+
+        ~Ident() {
+            --_out._ident;
+        }
+
+    private:
+        DotSerializer& _out;
+    };
+
+public:
+    explicit DotSerializer(std::ostream& os) : _os(os) {}
+
+    DotSerializer(const DotSerializer& other) = delete;
+    DotSerializer& operator=(const DotSerializer&) = delete;
+
+    template <typename... Args>
+    void append(const char* format, const Args&... args) {
+        for (size_t i = 0; i < _ident; ++i)
+            _os << "    ";
+
+        formatPrint(_os, format, args...);
+
+        _os << std::endl;
+    }
+
+private:
+    std::ostream& _os;
+    size_t _ident = 0;
+
+    friend class Ident;
+};
+
+#define VPU_DOT_IDENT(dotOut) vpu::DotSerializer::Ident VPU_COMBINE(dotIdent, __LINE__) (dotOut)
+
+//
+// DotLabel
+//
+
+class DotLabel final {
+public:
+    DotLabel(const std::string& caption, DotSerializer& out);
+    explicit DotLabel(DotLabel& other);
+    ~DotLabel();
+
+    DotLabel(const DotLabel& other) = delete;
+    DotLabel& operator=(const DotLabel&) = delete;
+
+    template <typename K, typename V>
+    void appendPair(const K& key, const V& val);
+
+    template <typename... Args>
+    void appendValue(const char* format, const Args&... args);
+
+    void addIdent();
+
+private:
+    DotSerializer& _out;
+    DotLabel* _parent = nullptr;
+    size_t _ident = 0;
+    std::ostringstream _ostr;
+};
+
+//
+// printTo
+//
+
+template <typename T>
+void printTo(DotLabel& lbl, const T& val);
+
+template <typename T1, typename T2>
+void printTo(DotLabel& lbl, const std::pair<T1, T2>& p);
+
+template <typename T>
+void printTo(DotLabel& lbl, const std::vector<T>& cont);
+
+template <typename T>
+void printTo(DotLabel& lbl, const std::set<T>& cont);
+
+template <typename T, class H>
+void printTo(DotLabel& lbl, const std::unordered_set<T, H>& cont);
+
+template <typename K, typename V>
+void printTo(DotLabel& lbl, const std::map<K, V>& map);
+
+template <typename K, typename V, class H>
+void printTo(DotLabel& lbl, const std::unordered_map<K, V, H>& map);
+
+void printTo(DotLabel& lbl, const ie::DataPtr& ieData);
+
+void printTo(DotLabel& lbl, const ie::Blob::Ptr& ieBlob);
+
+void printTo(DotLabel& lbl, const ie::CNNLayerPtr& ieLayer);
+
+class Any;
+void printTo(DotLabel& lbl, const Any& any);
+
+class AttributesMap;
+void printTo(DotLabel& lbl, const AttributesMap& attrs);
+
+template <typename T, int Capacity>
+void printTo(DotLabel& lbl, const SmallVector<T, Capacity>& cont);
+
+//
+// Implementation
+//
+
+template <typename T>
+void printTo(DotLabel& lbl, const T& val) {
+    lbl.appendValue("%s", val);
+}
+
+template <typename T1, typename T2>
+void printTo(DotLabel& lbl, const std::pair<T1, T2>& p) {
+    DotLabel subLbl(lbl);
+    subLbl.appendPair("first", p.first);
+    subLbl.appendPair("second", p.second);
+}
+
+template <class Cont>
+void printContainer(DotLabel& lbl, const Cont& cont) {
+    if (cont.size() > 4) {
+        DotLabel subLbl(lbl);
+
+        size_t ind = 0;
+        for (const auto& val : cont) {
+            subLbl.addIdent();
+            subLbl.appendValue("%s", val);
+            if (ind + 1 < cont.size()) {
+                subLbl.appendValue(",\\l");
+            }
+            if (ind > 8) {
+                subLbl.appendValue("...");
+                break;
+            }
+            ++ind;
+        }
+    } else {
+        lbl.appendValue("%s", cont);
+    }
+}
+
+template <typename T>
+void printTo(DotLabel& lbl, const std::vector<T>& cont) {
+    printContainer(lbl, cont);
+}
+
+template <typename T>
+void printTo(DotLabel& lbl, const std::set<T>& cont) {
+    printContainer(lbl, cont);
+}
+
+template <typename T, class H>
+void printTo(DotLabel& lbl, const std::unordered_set<T, H>& cont) {
+    printContainer(lbl, cont);
+}
+
+template <class Map>
+void printMap(DotLabel& lbl, const Map& map) {
+    DotLabel subLbl(lbl);
+    for (const auto& p : map) {
+        subLbl.appendPair(p.first, p.second);
+    }
+}
+
+template <typename K, typename V>
+void printTo(DotLabel& lbl, const std::map<K, V>& map) {
+    printMap(lbl, map);
+}
+
+template <typename K, typename V, class H>
+void printTo(DotLabel& lbl, const std::unordered_map<K, V, H>& map) {
+    printMap(lbl, map);
+}
+
+template <typename T, int Capacity>
+void printTo(DotLabel& lbl, const SmallVector<T, Capacity>& cont) {
+    printContainer(lbl, cont);
+}
+
+template <typename K, typename V>
+void DotLabel::appendPair(const K& key, const V& val) {
+    addIdent();
+    printTo(*this, key);
+    appendValue(" = ");
+    printTo(*this, val);
+    appendValue("\\l");
+}
+
+template <typename... Args>
+void DotLabel::appendValue(const char* format, const Args&... args) {
+    formatPrint(_ostr, format, args...);
+}
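+
+// Usage sketch (illustrative only): DotSerializer writes indented lines of a
+// GraphViz dot file; DotLabel collects "key = value" pairs into a label
+// string via the printTo overloads above.
+//
+//     std::ofstream file("graph.dot");
+//     DotSerializer out(file);
+//     out.append("digraph ie_graph {");
+//     {
+//         VPU_DOT_IDENT(out);
+//         out.append("layer%s [shape=box];", 0);
+//     }
+//     out.append("}");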
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/enums.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/enums.hpp
new file mode 100644 (file)
index 0000000..99f73d8
--- /dev/null
@@ -0,0 +1,52 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <cstdint>
+
+#include <unordered_map>
+#include <unordered_set>
+#include <string>
+#include <iostream>
+#include <sstream>
+
+#include <vpu/utils/checked_cast.hpp>
+
+namespace vpu {
+
+std::unordered_map<int32_t, std::string> generateEnumMap(const std::string& strMap);
+
+#define VPU_DECLARE_ENUM(EnumName, ...)                                         \
+    enum class EnumName : int32_t {                                             \
+        __VA_ARGS__                                                             \
+    };                                                                          \
+    inline std::ostream& operator<<(std::ostream& os, EnumName val) {           \
+        static const auto mapName = vpu::generateEnumMap(#__VA_ARGS__);         \
+        auto it = mapName.find(static_cast<int32_t>(val));                      \
+        if (it != mapName.end())                                                \
+            os << it->second;                                                   \
+        else                                                                    \
+            os << static_cast<int32_t>(val);                                    \
+        return os;                                                              \
+    }                                                                           \
+    template <typename I>                                                       \
+    inline I checked_cast(EnumName val) {                                       \
+        return vpu::checked_cast<I>(static_cast<int32_t>(val));                 \
+    }
+
+struct EnumClassHash final {
+    template <typename E>
+    size_t operator()(E t) const {
+        return std::hash<int32_t>()(static_cast<int32_t>(t));
+    }
+};
+
+template <typename E>
+using EnumSet = std::unordered_set<E, EnumClassHash>;
+
+template <typename E, typename V>
+using EnumMap = std::unordered_map<E, V, EnumClassHash>;
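+
+// Usage sketch (illustrative only; `Color` is a hypothetical enum): the macro
+// generates the enum itself, a stream operator that prints the enumerator
+// name, and a checked_cast overload.
+//
+//     VPU_DECLARE_ENUM(Color, Red, Green, Blue)
+//
+//     std::cout << Color::Green;        // expected to print "Green"
+//     EnumSet<Color> palette{Color::Red, Color::Blue};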
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/extra.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/extra.hpp
new file mode 100644 (file)
index 0000000..8a8bfba
--- /dev/null
@@ -0,0 +1,48 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <details/ie_exception.hpp>
+#include <ie_profiling.hpp>
+
+namespace vpu {
+
+//
+// VPU_COMBINE
+//
+
+#define VPU_COMBINE_HELPER2(X, Y)  X##Y
+#define VPU_COMBINE_HELPER3(X, Y, Z)  X##Y##Z
+
+#define VPU_COMBINE(X, Y)   VPU_COMBINE_HELPER2(X, Y)
+#define VPU_COMBINE3(X, Y, Z)   VPU_COMBINE_HELPER3(X, Y, Z)
+
+//
+// Exceptions
+//
+
+#define VPU_THROW_EXCEPTION \
+    THROW_IE_EXCEPTION << "[VPU] "
+
+#define VPU_THROW_UNLESS(EXPRESSION) \
+    if (!(EXPRESSION)) VPU_THROW_EXCEPTION << "AssertionFailed: " << #EXPRESSION  // NOLINT
+
+//
+// Packed structure declaration
+//
+
+#ifdef _MSC_VER
+#   define VPU_PACKED(body) __pragma(pack(push, 1)) struct body __pragma(pack(pop))
+#elif defined(__GNUC__)
+#   define VPU_PACKED(body) struct __attribute__((packed)) body
+#endif
+
+//
+// Profiling
+//
+
+#define VPU_PROFILE(NAME) IE_PROFILING_AUTO_SCOPE(VPU_ ## NAME)
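+
+// Usage sketch (illustrative only; names are made up):
+//
+//     VPU_THROW_UNLESS(numInputs > 0);   // throws "[VPU] AssertionFailed: numInputs > 0"
+//
+//     VPU_PACKED(BlobHeader {            // byte-packed, no padding
+//         uint32_t magic;
+//         uint16_t version;
+//     };)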
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/file_system.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/file_system.hpp
new file mode 100644 (file)
index 0000000..09de4d7
--- /dev/null
@@ -0,0 +1,13 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <string>
+
+namespace vpu {
+
+std::string fileNameNoExt(const std::string& filePath);
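+
+// Usage sketch (illustrative only):
+//     fileNameNoExt("networks/model.xml");   // expected to yield "networks/model"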
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/func_ref.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/func_ref.hpp
new file mode 100644 (file)
index 0000000..58454f9
--- /dev/null
@@ -0,0 +1,49 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <utility>
+#include <type_traits>
+
+namespace vpu {
+
+//
+// Non-owning alternative to std::function
+//
+
+template <typename> class FuncRef;
+
+template <typename R, typename... Args>
+class FuncRef<R(Args...)> {
+public:
+    template <class Func>
+    FuncRef(const Func& func) :
+            _realFuncPtr(&func),
+            _impl(&caller<typename std::remove_reference<Func>::type>) {
+        using actual_result_type = typename std::result_of<Func(Args...)>::type;
+        static_assert(
+            !std::is_reference<R>::value || std::is_reference<actual_result_type>::value,
+            "Mismatch between Func and FuncRef prototype");
+    }
+
+    R operator()(Args... args) const {
+        return _impl(_realFuncPtr, std::forward<Args>(args)...);
+    }
+
+private:
+    template <class Func>
+    static R caller(const void* realFuncPtr, Args... args) {
+        const auto& realFunc = *static_cast<const Func*>(realFuncPtr);
+        return realFunc(std::forward<Args>(args)...);
+    }
+
+private:
+    using ImplFunc = R(*)(const void*, Args...);
+
+    const void* _realFuncPtr = nullptr;
+    ImplFunc _impl = nullptr;
+};
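+
+// Usage sketch (illustrative only; `sumIf` is a made-up helper): FuncRef is a
+// non-owning view of a callable, suited to callback parameters where the
+// callable is guaranteed to outlive the call.
+//
+//     int sumIf(const std::vector<int>& v, FuncRef<bool(int)> pred) {
+//         int sum = 0;
+//         for (int x : v) {
+//             if (pred(x)) sum += x;
+//         }
+//         return sum;
+//     }
+//
+//     // sumIf({1, 2, 3, 4}, [](int x) { return x % 2 == 0; }) == 6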
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/handle.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/handle.hpp
new file mode 100644 (file)
index 0000000..574a26c
--- /dev/null
@@ -0,0 +1,153 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <utility>
+
+#include <details/ie_exception.hpp>
+
+namespace vpu {
+
+template <typename T>
+class Handle final {
+public:
+    inline Handle() = default;
+
+    inline ~Handle() = default;
+
+    inline Handle(std::nullptr_t) {}  // NOLINT
+
+    template <typename U>
+    inline Handle(const std::shared_ptr<U>& ptr) : _weak(ptr), _plain(ptr.get()) {  // NOLINT
+        IE_ASSERT(_plain != nullptr);
+    }
+
+    template <typename U>
+    inline Handle(const Handle<U>& other) : _weak(other._weak), _plain(other._plain) {}  // NOLINT
+
+    inline Handle(const Handle&) = default;
+    inline Handle& operator=(const Handle&) = default;
+
+    inline Handle(Handle&& other) : _weak(std::move(other._weak)), _plain(other._plain) {
+        other._plain = nullptr;
+    }
+    inline Handle& operator=(Handle&& other) {
+        if (&other != this) {
+            _weak = std::move(other._weak);
+            _plain = other._plain;
+            other._plain = nullptr;
+        }
+        return *this;
+    }
+
+    inline Handle& operator=(std::nullptr_t) {
+        _weak.reset();
+        _plain = nullptr;
+        return *this;
+    }
+
+    inline std::shared_ptr<T> lock() const {
+        return _weak.lock();
+    }
+
+    inline bool expired() const {
+        return _weak.expired();
+    }
+
+    inline T* get() const {
+        return _weak.expired() ? nullptr : _plain;
+    }
+    inline T* getPlain() const {
+        return _plain;
+    }
+
+    inline T& operator*() const {
+        IE_ASSERT(!_weak.expired());
+        return *_plain;
+    }
+
+    inline T* operator->() const {
+        IE_ASSERT(!_weak.expired());
+        return _plain;
+    }
+
+    template <typename U>
+    inline Handle<U> dynamicCast() const {
+        if (auto newPtr = std::dynamic_pointer_cast<U>(_weak.lock())) {
+            return Handle<U>(newPtr);
+        }
+        return nullptr;
+    }
+
+    inline explicit operator bool() const {
+        return !_weak.expired();
+    }
+
+private:
+    std::weak_ptr<T> _weak;
+    T* _plain = nullptr;
+
+    template <typename U>
+    friend class Handle;
+};
+
+template <typename T>
+inline bool operator==(const Handle<T>& first, const Handle<T>& second) {
+    return first.get() == second.get();
+}
+template <typename T>
+inline bool operator!=(const Handle<T>& first, const Handle<T>& second) {
+    return first.get() != second.get();
+}
+template <typename T>
+inline bool operator==(const Handle<T>& first, const std::shared_ptr<T>& second) {
+    return first.get() == second.get();
+}
+template <typename T>
+inline bool operator!=(const Handle<T>& first, const std::shared_ptr<T>& second) {
+    return first.get() != second.get();
+}
+template <typename T>
+inline bool operator==(std::nullptr_t, const Handle<T>& h) {
+    return h.get() == nullptr;
+}
+template <typename T>
+inline bool operator==(const Handle<T>& h, std::nullptr_t) {
+    return h.get() == nullptr;
+}
+template <typename T>
+inline bool operator!=(std::nullptr_t, const Handle<T>& h) {
+    return h.get() != nullptr;
+}
+template <typename T>
+inline bool operator!=(const Handle<T>& h, std::nullptr_t) {
+    return h.get() != nullptr;
+}
+
+struct HandleHash final {
+    template <typename T>
+    inline size_t operator()(const Handle<T>& handle) const {
+        return std::hash<T*>()(handle.getPlain());
+    }
+};
+
+template <class Base>
+class EnableHandleFromThis : public std::enable_shared_from_this<Base> {
+public:
+    inline Handle<Base> handle_from_this() const {
+        return Handle<Base>(std::const_pointer_cast<Base>(this->shared_from_this()));
+    }
+
+protected:
+    inline EnableHandleFromThis() = default;
+    inline EnableHandleFromThis(const EnableHandleFromThis&) = default;
+    inline EnableHandleFromThis(EnableHandleFromThis&&) = default;
+    inline ~EnableHandleFromThis() = default;
+    inline EnableHandleFromThis& operator=(const EnableHandleFromThis&) = default;
+    inline EnableHandleFromThis& operator=(EnableHandleFromThis&&) = default;
+};
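+
+// Usage sketch (illustrative only; `Data` stands for any shared type): Handle
+// copies like a raw pointer but can detect that its object was destroyed.
+//
+//     auto data = std::make_shared<Data>();
+//     Handle<Data> h = data;
+//     assert(h && h.get() == data.get());
+//     data.reset();                       // last owner released
+//     assert(h.expired() && h.get() == nullptr);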
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/ie_helpers.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/ie_helpers.hpp
new file mode 100644 (file)
index 0000000..4b79798
--- /dev/null
@@ -0,0 +1,19 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ie_blob.h>
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+ie::Blob::Ptr getBlobFP16(const ie::Blob::Ptr& in);
+
+ie::Blob::Ptr copyBlob(const ie::Blob::Ptr& in);
+ie::Blob::Ptr copyBlob(const ie::Blob::Ptr& in, ie::Layout outLayout);
+void copyBlob(const ie::Blob::Ptr& in, const ie::Blob::Ptr& out);
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/io.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/io.hpp
new file mode 100644 (file)
index 0000000..1b222b4
--- /dev/null
@@ -0,0 +1,247 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <set>
+#include <map>
+#include <unordered_set>
+#include <unordered_map>
+#include <utility>
+#include <string>
+#include <array>
+
+#include <ie_data.h>
+#include <ie_blob.h>
+#include <ie_layers.h>
+
+#include <vpu/utils/extra.hpp>
+#include <vpu/utils/containers.hpp>
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+//
+// printTo
+//
+
+template <typename T>
+void printTo(std::ostream& os, const T& val) noexcept;
+
+template <typename T1, typename T2>
+void printTo(std::ostream& os, const std::pair<T1, T2>& p) noexcept;
+
+template <typename T>
+void printTo(std::ostream& os, const std::vector<T>& cont) noexcept;
+
+template <typename T, size_t COUNT>
+void printTo(std::ostream& os, const std::array<T, COUNT>& cont) noexcept;
+
+template <typename T>
+void printTo(std::ostream& os, const std::set<T>& cont) noexcept;
+
+template <typename T, class H>
+void printTo(std::ostream& os, const std::unordered_set<T, H>& cont) noexcept;
+
+template <typename K, typename V>
+void printTo(std::ostream& os, const std::map<K, V>& map) noexcept;
+
+template <typename K, typename V, class H>
+void printTo(std::ostream& os, const std::unordered_map<K, V, H>& map) noexcept;
+
+template <typename T, int Capacity>
+void printTo(std::ostream& os, const SmallVector<T, Capacity>& cont) noexcept;
+
+class Any;
+void printTo(std::ostream& os, const Any& any) noexcept;
+
+class AttributesMap;
+void printTo(std::ostream& os, const AttributesMap& attrs) noexcept;
+
+//
+// formatPrint
+//
+
+void formatPrint(std::ostream& os, const char* str) noexcept;
+
+template <typename T, typename... Args>
+void formatPrint(std::ostream& os, const char* str, const T& value, const Args&... args) noexcept;
+
+//
+// formatString
+//
+
+template <typename... Args>
+std::string formatString(const char* str, const Args&... args) noexcept;
+
+//
+// toString
+//
+
+template <typename T>
+std::string toString(const T& val) noexcept;
+
+//
+// Implementation
+//
+
+template <typename T>
+void printTo(std::ostream& os, const T& val) noexcept {
+    try {
+        os << val;
+    } catch (...) {
+        std::cerr << "[VPU] Unknown error while printing\n";
+        std::abort();
+    }
+}
+
+template <typename T1, typename T2>
+void printTo(std::ostream& os, const std::pair<T1, T2>& p) noexcept {
+    try {
+        os << "[" << std::endl;
+
+        os << "first=";
+        printTo(os, p.first);
+        os << std::endl;
+
+        os << "second=";
+        printTo(os, p.second);
+        os << std::endl;
+
+        os << "]";
+    } catch (...) {
+        std::cerr << "[VPU] Unknown error while printing\n";
+        std::abort();
+    }
+}
+
+template <class Cont>
+void printContainer(std::ostream& os, const Cont& cont) noexcept {
+    try {
+        os << "[";
+
+        size_t ind = 0;
+        for (const auto& val : cont) {
+            printTo(os, val);
+            if (ind + 1 < cont.size()) {
+                os << ", ";
+            }
+            if (ind > 8) {
+                os << "...";
+                break;
+            }
+            ++ind;
+        }
+
+        os << "]";
+    } catch (...) {
+        std::cerr << "[VPU] Unknown error while printing\n";
+        std::abort();
+    }
+}
+
+template <typename T>
+void printTo(std::ostream& os, const std::vector<T>& cont) noexcept {
+    printContainer(os, cont);
+}
+
+template <typename T, size_t COUNT>
+void printTo(std::ostream& os, const std::array<T, COUNT>& cont) noexcept {
+    printContainer(os, cont);
+}
+
+template <typename T>
+void printTo(std::ostream& os, const std::set<T>& cont) noexcept {
+    printContainer(os, cont);
+}
+
+template <typename T, class H>
+void printTo(std::ostream& os, const std::unordered_set<T, H>& cont) noexcept {
+    printContainer(os, cont);
+}
+
+template <class Map>
+void printMap(std::ostream& os, const Map& map) noexcept {
+    try {
+        os << "[" << std::endl;
+
+        size_t ind = 0;
+        for (const auto& p : map) {
+            printTo(os, p.first);
+            os << "=";
+            printTo(os, p.second);
+            os << std::endl;
+            if (ind > 16) {
+                os << "...";
+                break;
+            }
+            ++ind;
+        }
+
+        os << "]";
+    } catch (...) {
+        std::cerr << "[VPU] Unknown error while printing\n";
+        std::abort();
+    }
+}
+
+template <typename K, typename V>
+void printTo(std::ostream& os, const std::map<K, V>& map) noexcept {
+    printMap(os, map);
+}
+
+template <typename K, typename V, class H>
+void printTo(std::ostream& os, const std::unordered_map<K, V, H>& map) noexcept {
+    printMap(os, map);
+}
+
+template <typename T, int Capacity>
+void printTo(std::ostream& os, const SmallVector<T, Capacity>& cont) noexcept {
+    printContainer(os, cont);
+}
+
+template <typename T, typename... Args>
+void formatPrint(std::ostream& os, const char* str, const T& value, const Args&... args) noexcept {
+    try {
+        while (*str) {
+            if (*str == '%') {
+                if (*(str + 1) == '%') {
+                    ++str;
+                } else {
+                    printTo(os, value);
+                    formatPrint(os, str + 2, args...);
+                    return;
+                }
+            }
+
+            os << *str++;
+        }
+    } catch (...) {
+        std::cerr << "[VPU] Unknown error while printing\n";
+        std::abort();
+    }
+
+    std::cerr << "[VPU] Extra arguments provided to formatPrint\n";
+    std::abort();
+}
+
+template <typename T>
+std::string toString(const T& val) noexcept {
+    std::ostringstream os;
+    printTo(os, val);
+    return os.str();
+}
+
+template <typename... Args>
+std::string formatString(const char* str, const Args&... args) noexcept {
+    std::ostringstream os;
+    formatPrint(os, str, args...);
+    return os.str();
+}
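+
+// Usage sketch (illustrative only): every "%<char>" placeholder consumes the
+// next argument through printTo, so containers print in structured form;
+// "%%" is expected to print a literal '%'.
+//
+//     std::vector<int> dims{1, 3, 224, 224};
+//     auto s = formatString("dims=%s (%s%%)", dims, 75);
+//     // s == "dims=[1, 3, 224, 224] (75%)"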
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/logger.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/logger.hpp
new file mode 100644 (file)
index 0000000..74870f2
--- /dev/null
@@ -0,0 +1,129 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <iosfwd>
+#include <string>
+#include <utility>
+
+#include <vpu/utils/extra.hpp>
+#include <vpu/utils/enums.hpp>
+#include <vpu/utils/auto_scope.hpp>
+#include <vpu/utils/io.hpp>
+
+namespace vpu {
+
+//
+// OutputStream
+//
+
+class OutputStream {
+public:
+    using Ptr = std::shared_ptr<OutputStream>;
+
+    virtual ~OutputStream() = default;
+
+    virtual std::ostream& get() = 0;
+
+    virtual bool supportColors() const = 0;
+
+    virtual void lock() = 0;
+    virtual void unlock() = 0;
+};
+
+OutputStream::Ptr consoleOutput();
+OutputStream::Ptr fileOutput(const std::string& fileName);
+
+//
+// Logger
+//
+
+VPU_DECLARE_ENUM(LogLevel,
+    None,
+    Error,
+    Warning,
+    Info,
+    Debug
+)
+
+class Logger final {
+public:
+    using Ptr = std::shared_ptr<Logger>;
+
+    class Section final {
+    public:
+        explicit Section(const Logger::Ptr& log) : _log(log) {
+            IE_ASSERT(_log != nullptr);
+            ++_log->_ident;
+        }
+
+        ~Section() {
+            --_log->_ident;
+        }
+
+    private:
+        Logger::Ptr _log;
+    };
+
+public:
+    Logger(const std::string& name, LogLevel lvl, const OutputStream::Ptr& out) :
+            _name(name), _logLevel(lvl), _out(out) {
+        IE_ASSERT(_out != nullptr);
+    }
+
+    LogLevel level() const { return _logLevel; }
+
+    template <typename... Args>
+    void error(const char* format, const Args&... args) const noexcept {
+        addEntry(LogLevel::Error, format, args...);
+    }
+
+    template <typename... Args>
+    void warning(const char* format, const Args&... args) const noexcept {
+        addEntry(LogLevel::Warning, format, args...);
+    }
+
+    template <typename... Args>
+    void info(const char* format, const Args&... args) const noexcept {
+        addEntry(LogLevel::Info, format, args...);
+    }
+
+    template <typename... Args>
+    void debug(const char* format, const Args&... args) const noexcept {
+        addEntry(LogLevel::Debug, format, args...);
+    }
+
+private:
+    template <typename... Args>
+    void addEntry(LogLevel msgLevel, const char* format, const Args&... args) const noexcept {
+        if (static_cast<int>(msgLevel) > static_cast<int>(_logLevel)) {
+            return;
+        }
+
+        _out->lock();
+        AutoScope scope([this] { _out->unlock(); });
+
+        printHeader(msgLevel);
+        formatPrint(_out->get(), format, args...);
+        printFooter();
+    }
+
+    void printHeader(LogLevel msgLevel) const noexcept;
+    void printFooter() const noexcept;
+
+private:
+    std::string _name;
+    LogLevel _logLevel = LogLevel::None;
+    OutputStream::Ptr _out;
+
+    size_t _ident = 0;
+
+    friend class Section;
+};
+
+#define VPU_LOGGER_SECTION(log) vpu::Logger::Section VPU_COMBINE(logSec, __LINE__) (log)
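+
+// Usage sketch (illustrative only):
+//
+//     auto log = std::make_shared<Logger>("GraphTransformer", LogLevel::Info, consoleOutput());
+//     log->info("compiling network %s", "squeezenet1.1");
+//     VPU_LOGGER_SECTION(log);                 // indents messages in this scope
+//     log->debug("dropped: Debug is above the Info threshold");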
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/numeric.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/numeric.hpp
new file mode 100644 (file)
index 0000000..d63a327
--- /dev/null
@@ -0,0 +1,48 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <cmath>
+
+#include <type_traits>
+#include <limits>
+
+#include <details/ie_exception.hpp>
+
+namespace vpu {
+
+using fp16_t = short;
+
+template <typename T, typename std::enable_if<std::is_integral<T>::value, bool>::type = true>
+inline constexpr bool isPowerOfTwo(T val) {
+    return (val > 0) && ((val & (val - 1)) == 0);
+}
+
+template <size_t align, typename T, typename std::enable_if<std::is_integral<T>::value, bool>::type = true>
+inline constexpr T alignVal(T val) {
+    static_assert(isPowerOfTwo(align), "isPowerOfTwo(align)");
+    return (val + (align - 1)) & ~(align - 1);
+}
+
+template <typename T, typename std::enable_if<std::is_integral<T>::value, bool>::type = true>
+inline T alignVal(T val, T align) {
+    IE_ASSERT(isPowerOfTwo(align));
+    return (val + (align - 1)) & ~(align - 1);
+}
+
+template <typename T, typename std::enable_if<std::is_integral<T>::value, bool>::type = true>
+inline T divUp(T a, T b) {
+    IE_ASSERT(b > 0);
+    return (a + b - 1) / b;
+}
+
+inline bool isFloatEqual(float a, float b) {
+    return std::fabs(a - b) <= std::numeric_limits<float>::epsilon();
+}
+inline bool isDoubleEqual(double a, double b) {
+    return std::fabs(a - b) <= std::numeric_limits<double>::epsilon();
+}
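+
+// Usage sketch (illustrative only):
+//
+//     alignVal<16>(30);   // == 32 (alignment checked at compile time)
+//     alignVal(30, 16);   // == 32 (alignment checked at run time)
+//     divUp(10, 4);       // == 3, ceiling division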
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/optional.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/optional.hpp
new file mode 100644 (file)
index 0000000..8b63d21
--- /dev/null
@@ -0,0 +1,191 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <utility>
+#include <type_traits>
+
+#include <details/ie_exception.hpp>
+
+namespace vpu {
+
+template <typename T>
+class Optional final {
+public:
+    inline Optional() noexcept : _mem{}, _hasValue(false) {
+    }
+
+    inline ~Optional() {
+        reset();
+    }
+
+    inline Optional(const Optional& other) : _mem{}, _hasValue(false) {
+        if (other._hasValue) {
+            constructValue(other.getValueRef());
+            _hasValue = true;
+        }
+    }
+    inline Optional& operator=(const Optional& other) {
+        if (this != &other) {
+            if (other._hasValue) {
+                if (_hasValue) {
+                    getValueRef() = other.getValueRef();
+                } else {
+                    constructValue(other.getValueRef());
+                    _hasValue = true;
+                }
+            } else {
+                reset();
+            }
+        }
+        return *this;
+    }
+
+    inline Optional(Optional&& other) : _mem{}, _hasValue(false) {
+        if (other._hasValue) {
+            constructValue(other.getValueMoveRef());
+            _hasValue = true;
+        }
+    }
+    inline Optional& operator=(Optional&& other) {
+        if (this != &other) {
+            if (other._hasValue) {
+                if (_hasValue) {
+                    getValueRef() = other.getValueMoveRef();
+                } else {
+                    constructValue(other.getValueMoveRef());
+                    _hasValue = true;
+                }
+            } else {
+                reset();
+            }
+        }
+        return *this;
+    }
+
+    template <typename U>
+    inline Optional(const Optional<U>& other) : _mem{}, _hasValue(false) {
+        if (other._hasValue) {
+            constructValue(other.getValueRef());
+            _hasValue = true;
+        }
+    }
+    template <typename U>
+    inline Optional& operator=(const Optional<U>& other) {
+        // Cast to void* so the cross-type self-assignment check compiles
+        // when U differs from T.
+        if (static_cast<const void*>(this) != static_cast<const void*>(&other)) {
+            if (other._hasValue) {
+                if (_hasValue) {
+                    getValueRef() = other.getValueRef();
+                } else {
+                    constructValue(other.getValueRef());
+                    _hasValue = true;
+                }
+            } else {
+                reset();
+            }
+        }
+        return *this;
+    }
+
+    template <typename U>
+    inline Optional(Optional<U>&& other) : _mem{}, _hasValue(false) {
+        if (other._hasValue) {
+            constructValue(other.getValueMoveRef());
+            _hasValue = true;
+        }
+    }
+    template <typename U>
+    inline Optional& operator=(Optional<U>&& other) {
+        if (static_cast<const void*>(this) != static_cast<const void*>(&other)) {
+            if (other._hasValue) {
+                if (_hasValue) {
+                    getValueRef() = other.getValueMoveRef();
+                } else {
+                    constructValue(other.getValueMoveRef());
+                    _hasValue = true;
+                }
+            } else {
+                reset();
+            }
+        }
+        return *this;
+    }
+
+    template <typename U>
+    inline Optional(U&& value) : _mem{}, _hasValue(true) {  // NOLINT
+        constructValue(std::forward<U>(value));
+    }
+
+    template <typename U>
+    inline Optional& operator=(U&& value) {
+        if (_hasValue) {
+            getValueRef() = std::forward<U>(value);
+        } else {
+            constructValue(std::forward<U>(value));
+            _hasValue = true;
+        }
+        return *this;
+    }
+
+    inline void reset() noexcept {
+        if (_hasValue) {
+            destroyValue();
+            _hasValue = false;
+        }
+    }
+
+    inline bool hasValue() const noexcept {
+        return _hasValue;
+    }
+
+    inline const T& get() const {
+        IE_ASSERT(_hasValue);
+        return getValueRef();
+    }
+
+    template <typename U>
+    inline T getOrDefault(U&& def) const {
+        if (_hasValue) {
+            return getValueRef();
+        } else {
+            return std::forward<U>(def);
+        }
+    }
+
+private:
+    using Memory = typename std::aligned_storage<sizeof(T), alignof(T)>::type[1];
+
+    inline T& getValueRef() {
+        return *reinterpret_cast<T*>(_mem);
+    }
+    inline const T& getValueRef() const {
+        return *reinterpret_cast<const T*>(_mem);
+    }
+
+    inline T&& getValueMoveRef() {
+        return std::move(getValueRef());
+    }
+
+    template <typename U>
+    inline void constructValue(U&& value) {
+        new (_mem) T(std::forward<U>(value));
+    }
+
+    inline void destroyValue() {
+        reinterpret_cast<T*>(_mem)->~T();
+    }
+
+private:
+    // TODO: it would be better to initialize _mem here instead of doing it
+    // in each constructor, but that causes a segfault in gcc 4.8
+    Memory _mem;
+    bool _hasValue = false;
+
+    template <typename U>
+    friend class Optional;
+};
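+
+// Usage sketch (illustrative only): a pre-C++17 stand-in for std::optional
+// that keeps the value in aligned in-object storage.
+//
+//     Optional<int> batchSize;
+//     assert(!batchSize.hasValue());
+//     batchSize = 8;
+//     assert(batchSize.get() == 8);
+//     int b = batchSize.getOrDefault(1);   // 8 here; 1 if it were empty
+//     batchSize.reset();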
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/perf_report.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/perf_report.hpp
new file mode 100644 (file)
index 0000000..f166a5c
--- /dev/null
@@ -0,0 +1,41 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <string>
+#include <map>
+#include <vector>
+
+#include <ie_common.h>
+
+#include <vpu/utils/enums.hpp>
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+struct StageMetaInfo final {
+    ie::InferenceEngineProfileInfo::LayerStatus status = ie::InferenceEngineProfileInfo::LayerStatus::NOT_RUN;
+
+    std::string layerName;
+    std::string layerType;
+
+    std::string stageName;
+    std::string stageType;
+};
+
+VPU_DECLARE_ENUM(PerfReport,
+    PerLayer,
+    PerStage
+)
+
+std::map<std::string, ie::InferenceEngineProfileInfo> parsePerformanceReport(
+        const std::vector<StageMetaInfo>& stagesMeta,
+        const float* deviceTimings,
+        int deviceTimingsCount,
+        PerfReport perfReport,
+        bool printReceiveTensorTime);
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/range.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/range.hpp
new file mode 100644 (file)
index 0000000..4ca251c
--- /dev/null
@@ -0,0 +1,388 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <cassert>
+
+#include <type_traits>
+#include <iterator>
+#include <utility>
+#include <vector>
+#include <memory>
+#include <functional>
+#include <unordered_set>
+
+#include <vpu/utils/containers.hpp>
+#include <vpu/utils/handle.hpp>
+#include <vpu/utils/optional.hpp>
+
+namespace vpu {
+
+//
+// IterRange
+//
+
+namespace impl {
+
+template <class Iterator>
+class IterRange final {
+public:
+    using value_type = typename Iterator::value_type;
+
+    using iterator = Iterator;
+    using const_iterator = Iterator;
+
+    inline IterRange() = default;
+    inline IterRange(const IterRange&) = default;
+    inline IterRange& operator=(const IterRange&) = default;
+    inline IterRange(IterRange&&) = default;
+    inline IterRange& operator=(IterRange&&) = default;
+
+    template <class It1, class It2>
+    inline IterRange(It1&& b, It2&& e) : _begin(std::forward<It1>(b)), _end(std::forward<It2>(e)) {}
+
+    inline Iterator begin() const { return _begin; }
+    inline Iterator end() const { return _end; }
+
+    Iterator cbegin() const { return _begin; }
+    Iterator cend() const { return _end; }
+
+private:
+    Iterator _begin;
+    Iterator _end;
+};
+
+}  // namespace impl
+
+template <class Iterator>
+inline impl::IterRange<Iterator> iterRange(const Iterator& begin, const Iterator& end) {
+    return impl::IterRange<Iterator>(begin, end);
+}
+template <class Iterator>
+inline impl::IterRange<Iterator> iterRange(Iterator&& begin, Iterator&& end) {
+    return impl::IterRange<Iterator>(std::move(begin), std::move(end));
+}
+
+//
+// ContRange
+//
+
+namespace impl {
+
+template <class Cont>
+class ContRange final {
+public:
+    using value_type = typename Cont::value_type;
+
+    using iterator = typename Cont::iterator;
+    using const_iterator = typename Cont::const_iterator;
+
+    inline ContRange() = default;
+    inline ContRange(const ContRange&) = default;
+    inline ContRange& operator=(const ContRange&) = default;
+    inline ContRange(ContRange&&) = default;
+    inline ContRange& operator=(ContRange&&) = default;
+
+    inline explicit ContRange(const Cont& cont) : _cont(&cont) {}
+
+    inline const_iterator begin() const {
+        assert(_cont != nullptr);
+        return _cont->begin();
+    }
+    inline const_iterator end() const {
+        assert(_cont != nullptr);
+        return _cont->end();
+    }
+
+    inline const_iterator cbegin() const {
+        assert(_cont != nullptr);
+        return _cont->begin();
+    }
+    inline const_iterator cend() const {
+        assert(_cont != nullptr);
+        return _cont->end();
+    }
+
+private:
+    const Cont* _cont = nullptr;
+};
+
+}  // namespace impl
+
+template <class Cont>
+inline impl::ContRange<Cont> contRange(const Cont& cont) {
+    return impl::ContRange<Cont>(cont);
+}
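+
+// Usage sketch (illustration only; `src` is a hypothetical container):
+//
+//     std::vector<int> src{1, 2, 3};
+//     auto r1 = iterRange(src.begin(), src.end());  // range over an iterator pair
+//     auto r2 = contRange(src);                     // range over a whole container
+//     for (int v : r2) { /* iterates 1, 2, 3 */ }
+//
+// Both helpers only reference `src`, so the container must outlive the range.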
+
+//
+// MapRange
+//
+
+namespace impl {
+
+template <class BaseRange, class MapOp>
+class MapRange final {
+public:
+    class Iterator final {
+    public:
+        using base_iterator = typename BaseRange::const_iterator;
+        using base_iterator_value = decltype(*base_iterator());
+
+        using map_op_value = typename std::result_of<MapOp(base_iterator_value)>::type;
+
+        using value_type = typename std::decay<map_op_value>::type;
+        using pointer = value_type*;
+        using reference = value_type&;
+
+        using iterator_category = std::input_iterator_tag;
+        using difference_type = std::ptrdiff_t;
+
+        inline Iterator() = default;
+        inline Iterator(const Iterator&) = default;
+        inline Iterator& operator=(const Iterator&) = default;
+        inline Iterator(Iterator&&) = default;
+        inline Iterator& operator=(Iterator&&) = default;
+
+        template <class BI>
+        inline Iterator(BI&& cur, BI&& end, const MapOp& op) :
+                _cur(std::forward<BI>(cur)),
+                _end(std::forward<BI>(end)),
+                _op(&op) {
+        }
+
+        inline const value_type& operator*() const {
+            assert(_cur != _end);
+            _curValue = (*_op)(*_cur);
+            return _curValue.get();
+        }
+
+        inline Iterator& operator++() {
+            ++_cur;
+            return *this;
+        }
+
+        inline bool operator==(const Iterator& other) const {
+            return _cur == other._cur;
+        }
+        inline bool operator!=(const Iterator& other) const {
+            return _cur != other._cur;
+        }
+
+    private:
+        base_iterator _cur;
+        base_iterator _end;
+        const MapOp* _op = nullptr;
+
+        mutable Optional<value_type> _curValue;
+    };
+
+    using base_iterator = typename BaseRange::const_iterator;
+    using base_iterator_value = decltype(*base_iterator());
+
+    using map_op_value = typename std::result_of<MapOp(base_iterator_value)>::type;
+
+    using value_type = typename std::decay<map_op_value>::type;
+
+    using iterator = Iterator;
+    using const_iterator = Iterator;
+
+    inline MapRange() = default;
+    inline MapRange(const MapRange&) = default;
+    inline MapRange& operator=(const MapRange&) = default;
+    inline MapRange(MapRange&&) = default;
+    inline MapRange& operator=(MapRange&&) = default;
+
+    template <class _B, class _M>
+    inline MapRange(_B&& base, _M&& op) :
+            _base(std::forward<_B>(base)),
+            _op(std::forward<_M>(op)) {
+    }
+
+    inline Iterator begin() const { return Iterator(_base.begin(), _base.end(), _op); }
+    inline Iterator end() const { return Iterator(_base.end(), _base.end(), _op); }
+
+    inline Iterator cbegin() const { return Iterator(_base.begin(), _base.end(), _op); }
+    inline Iterator cend() const { return Iterator(_base.end(), _base.end(), _op); }
+
+private:
+    BaseRange _base;
+    MapOp _op;
+};
+
+}  // namespace impl
+
+template <class BaseRange, class MapOp>
+inline impl::MapRange<typename std::decay<BaseRange>::type, typename std::decay<MapOp>::type>
+        mapRange(BaseRange&& base, MapOp&& op) {
+    return impl::MapRange<typename std::decay<BaseRange>::type, typename std::decay<MapOp>::type>(
+        std::forward<BaseRange>(base),
+        std::forward<MapOp>(op));
+}
+template <class MapOp, class BaseRange>
+inline impl::MapRange<typename std::decay<BaseRange>::type, MapOp> mapRange(BaseRange&& base) {
+    return impl::MapRange<typename std::decay<BaseRange>::type, MapOp>(
+        std::forward<BaseRange>(base),
+        MapOp());
+}
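+
+// Usage sketch (illustration only; `src` and the lambda are hypothetical):
+//
+//     std::vector<int> src{1, 2, 3};
+//     auto squares = mapRange(contRange(src), [](int v) { return v * v; });
+//     for (int v : squares) { /* iterates 1, 4, 9 */ }
+//
+// The mapped value is computed lazily on each dereference; the range stores
+// copies of both the base range and the mapping functor.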
+
+//
+// FilterRange
+//
+
+namespace impl {
+
+template <class BaseRange, class FilterOp>
+class FilterRange final {
+public:
+    class Iterator final {
+    public:
+        using base_iterator = typename BaseRange::const_iterator;
+        using base_iterator_value = decltype(*base_iterator());
+
+        using value_type = typename std::decay<base_iterator_value>::type;
+        using pointer = value_type*;
+        using reference = value_type&;
+
+        using iterator_category = std::input_iterator_tag;
+        using difference_type = std::ptrdiff_t;
+
+        inline Iterator() = default;
+        inline Iterator(const Iterator&) = default;
+        inline Iterator& operator=(const Iterator&) = default;
+        inline Iterator(Iterator&&) = default;
+        inline Iterator& operator=(Iterator&&) = default;
+
+        template <class BI>
+        inline Iterator(BI&& cur, BI&& end, const FilterOp& op) :
+                _cur(std::forward<BI>(cur)),
+                _end(std::forward<BI>(end)),
+                _op(&op) {
+            advance();
+        }
+
+        inline const value_type& operator*() const {
+            assert(_cur != _end);
+            _curValue = *_cur;
+            return _curValue.get();
+        }
+
+        inline Iterator& operator++() {
+            ++_cur;
+            advance();
+            return *this;
+        }
+
+        inline bool operator==(const Iterator& other) const {
+            return _cur == other._cur;
+        }
+        inline bool operator!=(const Iterator& other) const {
+            return _cur != other._cur;
+        }
+
+    private:
+        inline void advance() {
+            while (_cur != _end) {
+                _curValue = *_cur;
+                if ((*_op)(_curValue.get())) {
+                    break;
+                }
+                ++_cur;
+            }
+        }
+
+    private:
+        base_iterator _cur;
+        base_iterator _end;
+        const FilterOp* _op = nullptr;
+
+        mutable Optional<value_type> _curValue;
+    };
+
+    using value_type = typename BaseRange::value_type;
+
+    using iterator = Iterator;
+    using const_iterator = Iterator;
+
+    inline FilterRange() = default;
+    inline FilterRange(const FilterRange&) = default;
+    inline FilterRange& operator=(const FilterRange&) = default;
+    inline FilterRange(FilterRange&&) = default;
+    inline FilterRange& operator=(FilterRange&&) = default;
+
+    template <class _B, class _F>
+    inline FilterRange(_B&& base, _F&& op) :
+            _base(std::forward<_B>(base)),
+            _op(std::forward<_F>(op)) {
+    }
+
+    inline Iterator begin() const { return Iterator(_base.begin(), _base.end(), _op); }
+    inline Iterator end() const { return Iterator(_base.end(), _base.end(), _op); }
+
+    inline Iterator cbegin() const { return Iterator(_base.begin(), _base.end(), _op); }
+    inline Iterator cend() const { return Iterator(_base.end(), _base.end(), _op); }
+
+private:
+    BaseRange _base;
+    FilterOp _op;
+};
+
+}  // namespace impl
+
+template <class BaseRange, class FilterOp>
+inline impl::FilterRange<typename std::decay<BaseRange>::type, typename std::decay<FilterOp>::type>
+        filterRange(BaseRange&& base, FilterOp&& op) {
+    return impl::FilterRange<typename std::decay<BaseRange>::type, typename std::decay<FilterOp>::type>(
+        std::forward<BaseRange>(base),
+        std::forward<FilterOp>(op));
+}
+template <class FilterOp, class BaseRange>
+inline impl::FilterRange<typename std::decay<BaseRange>::type, FilterOp> filterRange(BaseRange&& base) {
+    return impl::FilterRange<typename std::decay<BaseRange>::type, FilterOp>(
+        std::forward<BaseRange>(base),
+        FilterOp());
+}
+
+struct NonNull final {
+public:
+    template <class Ptr>
+    inline bool operator()(const Ptr& ptr) const {
+        return ptr != nullptr;
+    }
+};
+
+struct PtrToHandle final {
+    template <typename T>
+    inline Handle<T> operator()(const std::shared_ptr<T>& ptr) const {
+        return Handle<T>(ptr);
+    }
+};
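+
+// Usage sketch (illustration only; `ptrs` and `Foo` are hypothetical):
+//
+//     std::vector<std::shared_ptr<Foo>> ptrs;  // populated elsewhere
+//     auto nonNull = filterRange<NonNull>(contRange(ptrs));  // drop null entries
+//     auto handles = mapRange<PtrToHandle>(nonNull);         // shared_ptr -> Handle
+//
+// NonNull and PtrToHandle are stateless functors, so the single-argument
+// overloads that default-construct the operation can be used.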
+
+//
+// toVector
+//
+
+template <class Range>
+inline std::vector<typename std::decay<typename Range::value_type>::type> toVector(const Range& range, int capacity = 0) {
+    std::vector<typename std::decay<typename Range::value_type>::type> out;
+    if (capacity > 0) {
+        out.reserve(capacity);
+    }
+    for (const auto& item : range) {
+        out.emplace_back(item);
+    }
+    return out;
+}
+
+template <int Capacity, class Range>
+inline SmallVector<typename std::decay<typename Range::value_type>::type, Capacity> toSmallVector(const Range& range) {
+    SmallVector<typename std::decay<typename Range::value_type>::type, Capacity> out;
+    for (const auto& item : range) {
+        out.emplace_back(item);
+    }
+    return out;
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/simple_math.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/simple_math.hpp
new file mode 100644 (file)
index 0000000..01270e2
--- /dev/null
@@ -0,0 +1,50 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <set>
+#include <map>
+#include <vector>
+#include <functional>
+#include <string>
+#include <utility>
+
+//
+// Simple integer arithmetic for work-size calculations.
+// Supported operations: +, -, *, /, %, (, )
+// Unary - and + are not supported.
+// Variables are single characters and must not be an operator character, whitespace, or a digit 0-9.
+//
+
+namespace vpu {
+
+class SimpleMathExpression final {
+public:
+    void setVariables(const std::map<char, int>& vars) { _vars = vars; }
+
+    void parse(const std::string& expression);
+
+    int evaluate() const;
+
+private:
+    struct Token final {
+        enum TokenType {
+            Value,
+            Operator,
+        };
+
+        TokenType type;
+        int value;
+        char op;
+
+        explicit Token(TokenType t = Value, int v = 0, char o = 0) : type(t), value(v), op(o) {}
+    };
+
+private:
+    std::map<char, int> _vars;
+    std::vector<Token> _parsedTokens;
+};
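+
+// Usage sketch (illustration only; parse() and evaluate() are implemented in
+// the corresponding .cpp file and are assumed to follow the grammar above):
+//
+//     SimpleMathExpression e;
+//     e.setVariables({{'x', 10}, {'y', 3}});
+//     e.parse("(x+y)*2%7");
+//     int r = e.evaluate();  // ((10 + 3) * 2) % 7 == 5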
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/utils/string.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/utils/string.hpp
new file mode 100644 (file)
index 0000000..b4a7af9
--- /dev/null
@@ -0,0 +1,66 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <set>
+#include <unordered_set>
+#include <sstream>
+#include <utility>
+
+#include <details/caseless.hpp>
+
+#include <vpu/utils/containers.hpp>
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+namespace impl {
+
+inline void insertToContainer(std::vector<std::string>& cont, std::string&& val) {
+    cont.emplace_back(val);
+}
+
+template <int Capacity>
+void insertToContainer(SmallVector<std::string, Capacity>& cont, std::string&& val) {
+    cont.emplace_back(val);
+}
+
+inline void insertToContainer(std::set<std::string>& cont, std::string&& val) {
+    cont.emplace(val);
+}
+
+inline void insertToContainer(std::unordered_set<std::string>& cont, std::string&& val) {
+    cont.emplace(val);
+}
+
+inline void insertToContainer(ie::details::caseless_set<std::string>& cont, std::string&& val) {
+    cont.emplace(val);
+}
+
+}  // namespace impl
+
+template <class Cont>
+void splitStringList(const std::string& str, Cont& out, char delim) {
+    out.clear();
+
+    if (str.empty())
+        return;
+
+    std::istringstream istr(str);
+
+    std::string elem;
+    while (std::getline(istr, elem, delim)) {
+        if (elem.empty()) {
+            continue;
+        }
+
+        impl::insertToContainer(out, std::move(elem));
+    }
+}
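+
+// Usage sketch (illustration only):
+//
+//     std::vector<std::string> out;
+//     splitStringList("CPU,,MYRIAD", out, ',');  // out == {"CPU", "MYRIAD"}
+//
+// Empty elements between delimiters are skipped, and `out` is cleared first.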
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/allocator.cpp b/inference-engine/src/vpu/graph_transformer/src/allocator.cpp
new file mode 100644 (file)
index 0000000..8f07987
--- /dev/null
@@ -0,0 +1,652 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/allocator.hpp>
+
+#include <unordered_set>
+#include <algorithm>
+#include <limits>
+#include <set>
+
+#include <vpu/compile_env.hpp>
+#include <vpu/model/model.hpp>
+#include <vpu/utils/auto_scope.hpp>
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+//
+// UsedMemory
+//
+
+void printTo(std::ostream& os, const UsedMemory& usedMemory) {
+    os << "[" << std::endl;
+
+    os << "BSS=" << usedMemory.BSS << std::endl;
+    os << "CMX=" << usedMemory.CMX << std::endl;
+    os << "blob=" << usedMemory.blob << std::endl;
+    os << "input=" << usedMemory.input << std::endl;
+    os << "output=" << usedMemory.output << std::endl;
+
+    os << "]";
+}
+
+void printTo(DotLabel& lbl, const UsedMemory& usedMemory) {
+    DotLabel subLbl(lbl);
+    subLbl.appendPair("BSS", usedMemory.BSS);
+    subLbl.appendPair("CMX", usedMemory.CMX);
+    subLbl.appendPair("blob", usedMemory.blob);
+    subLbl.appendPair("input", usedMemory.input);
+    subLbl.appendPair("output", usedMemory.output);
+}
+
+//
+// Allocator
+//
+
+int calcAllocationSize(const Data& data) {
+    return alignVal(data->totalByteSize(), DATA_ALIGNMENT);
+}
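+
+// For example, assuming DATA_ALIGNMENT is 64 bytes, a 1000-byte buffer would
+// be padded to 1024 bytes (alignVal rounds up to the next multiple).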
+
+Allocator::Allocator(): _allocatorOfShaves(_cmxMemoryPool) {
+    const auto& env = CompileEnv::get();
+
+    _maxCmxSize = env.resources.numCMXSlices * CMX_SLICE_SIZE;
+
+    _memPools.emplace(MemoryType::DDR, &_ddrMemoryPool);
+    _memPools.emplace(MemoryType::CMX, &_cmxMemoryPool);
+}
+
+namespace {
+
+void updateChildDataAllocation(const Data& data, int offsetLimitation) {
+    for (const auto& edge : data->childDataEdges()) {
+        auto parent = edge->parent();
+        auto child = edge->child();
+
+        auto memoryOffset = parent->memoryOffset();
+
+        if (edge->mode() == SharedDataMode::ROI) {
+            auto parentStrides = parent->strides();
+            const auto& offset = edge->attrs().get<DimValues>("offset");
+
+            int byteOffset = 0;
+            for (const auto& p : offset) {
+                byteOffset += p.second * parentStrides[p.first];
+            }
+
+            memoryOffset += byteOffset;
+
+            IE_ASSERT(memoryOffset + child->lastElemOffset() <= offsetLimitation);
+        } else if (edge->mode() == SharedDataMode::Reshape) {
+            IE_ASSERT(parent->checkStrides(StridesRequirement::compact()));
+            IE_ASSERT(child->checkStrides(StridesRequirement::compact()));
+        } else {
+            IE_ASSERT(false) << "Unsupported enum value";
+        }
+
+        child->setAllocationInfo(parent->location(), memoryOffset);
+
+        updateChildDataAllocation(child, offsetLimitation);
+    }
+}
+
+}  // namespace
+
+bool Allocator::allocateData(const Data& data) {
+    //
+    // Get location requirements
+    //
+
+    auto memoryType = data->memReqs();
+
+    //
+    // Fake data: make sure no memory is allocated
+    //
+
+    if (data->usage() == DataUsage::Fake) {
+        if (_allocatedData.count(data) == 0) {
+            IE_ASSERT(data->parentDataEdge() == nullptr);
+
+            updateChildDataAllocation(data, 0);
+
+            _allocatedData.emplace(data);
+        }
+
+        return true;
+    }
+
+    //
+    // Input data
+    //
+
+    if (data->usage() == DataUsage::Input) {
+        if (_allocatedData.count(data) == 0) {
+            IE_ASSERT(data->parentDataEdge() == nullptr);
+            IE_ASSERT(data->checkStrides(StridesRequirement::compact()));
+
+            auto finalByteSize = alignVal(data->totalByteSize() * _modelBatchSize, DATA_ALIGNMENT);
+
+            data->setIOInfo(DataLocation::Input, _inputMemOffset);
+            _inputMemOffset += finalByteSize;
+
+            updateChildDataAllocation(data, DDR_MAX_SIZE);
+
+            _allocatedData.emplace(data);
+        }
+
+        return memoryType == MemoryType::DDR;
+    }
+
+    //
+    // Output data
+    //
+
+    if (data->usage() == DataUsage::Output) {
+        if (_allocatedData.count(data) == 0) {
+            IE_ASSERT(data->parentDataEdge() == nullptr);
+            IE_ASSERT(data->checkStrides(StridesRequirement::compact()));
+
+            int finalByteSize = 0;
+            if (data->attrs().getOrDefault<bool>("unbatched", false)) {
+                finalByteSize = data->totalByteSize();
+            } else {
+                finalByteSize = data->totalByteSize() * _modelBatchSize;
+            }
+            finalByteSize = alignVal(finalByteSize, DATA_ALIGNMENT);
+
+            data->setIOInfo(DataLocation::Output, _outputMemOffset);
+            _outputMemOffset += finalByteSize;
+
+            updateChildDataAllocation(data, DDR_MAX_SIZE);
+
+            _allocatedData.emplace(data);
+        }
+
+        return memoryType == MemoryType::DDR;
+    }
+
+    //
+    // Const data
+    //
+
+    if (data->usage() == DataUsage::Const) {
+        if (_allocatedData.count(data) == 0) {
+            IE_ASSERT(data->parentDataEdge() == nullptr);
+            IE_ASSERT(data->checkStrides(StridesRequirement::compact()));
+            IE_ASSERT(data->content() != nullptr);
+
+            auto finalByteSize = calcAllocationSize(data);
+
+            data->setAllocationInfo(DataLocation::Blob, _blobMemOffset);
+            _blobMemOffset += finalByteSize;
+
+            updateChildDataAllocation(data, DDR_MAX_SIZE);
+
+            _allocatedData.emplace(data);
+        }
+
+        return memoryType == MemoryType::DDR;
+    }
+
+    //
+    // Intermediate data must have producer and consumer(s)
+    //
+
+    if (data->usage() == DataUsage::Intermediate) {
+        IE_ASSERT(data->producerEdge() != nullptr);
+        IE_ASSERT(data->numConsumers() > 0);
+    }
+
+    //
+    // Allocate parent data if any
+    //
+
+    if (auto parentEdge = data->parentDataEdge()) {
+        auto parent = parentEdge->parent();
+
+        auto parentMemType = parent->memReqs();
+        IE_ASSERT(parentMemType == memoryType);
+
+        // Parent will update all children.
+        return allocateData(parent);
+    }
+
+    IE_ASSERT(data->parentDataEdge() == nullptr);
+
+    //
+    // Check if the data is already allocated
+    //
+
+    if (_allocatedIntermData.count(data) != 0) {
+        auto it = _memChunksPerData.find(data);
+        IE_ASSERT(it != _memChunksPerData.end());
+
+        auto chunk = it->second;
+        IE_ASSERT(chunk != nullptr);
+
+        return chunk->memType == memoryType;
+    }
+
+    //
+    // Calculate final buffer size
+    //
+
+    auto finalByteSize = calcAllocationSize(data);
+
+    //
+    // Allocate buffer in requested location
+    //
+
+    int inUse = 0;
+    if (data->usage() == DataUsage::Temp) {
+        inUse = 1;
+    } else {
+        loopOverData(data, [&inUse](const Data& subData) {
+            inUse += subData->numConsumers();
+            return DataLoopStatus::NextChild;
+        });
+    }
+    IE_ASSERT(inUse >= 1);
+
+    auto chunk = allocateMem(memoryType, finalByteSize, inUse);
+
+    if (chunk == nullptr) {
+        return false;
+    }
+
+    //
+    // Update data allocation info
+    //
+
+    data->setAllocationInfo(chunk->memType == MemoryType::CMX ? DataLocation::CMX : DataLocation::BSS, chunk->pointer);
+
+    auto offsetLimitation = (data->location() == DataLocation::CMX) ? _maxCmxSize : DDR_MAX_SIZE;
+    updateChildDataAllocation(data, offsetLimitation);
+
+    _memChunksPerData.emplace(data, chunk);
+    _allocatedIntermData.emplace(data);
+
+    return chunk->memType == memoryType;
+}
+
+void Allocator::freeData(const Data& data, DeallocationMode mode) {
+    //
+    // Release the chunk
+    //
+
+    auto topParent = data->getTopParentData();
+
+    if (topParent->usage() == DataUsage::Intermediate ||
+        topParent->usage() == DataUsage::Temp) {
+        IE_ASSERT(_allocatedIntermData.count(topParent) > 0);
+
+        auto it = _memChunksPerData.find(topParent);
+        IE_ASSERT(it != _memChunksPerData.end());
+
+        auto chunk = it->second;
+        IE_ASSERT(chunk != nullptr);
+        IE_ASSERT(chunk->inUse > 0);
+
+        switch (mode) {
+        case DeallocationMode::JustFree: {
+            --chunk->inUse;
+
+            if (chunk->inUse == 0) {
+                freeMem(chunk);
+
+                _memChunksPerData.erase(topParent);
+                _allocatedIntermData.erase(topParent);
+            }
+
+            break;
+        }
+
+        case DeallocationMode::MoveFromCMX: {
+            IE_ASSERT(chunk->memType == MemoryType::CMX);
+
+            auto curChunkSz = chunk->size;
+            auto inUse = chunk->inUse;
+
+            freeMem(chunk);
+
+            auto ddrChunk = allocateMem(MemoryType::DDR, curChunkSz, inUse);
+            IE_ASSERT(ddrChunk != nullptr);
+
+            _memChunksPerData[data] = ddrChunk;
+
+            data->setAllocationInfo(DataLocation::BSS, ddrChunk->pointer);
+            updateChildDataAllocation(data, DDR_MAX_SIZE);
+
+            break;
+        }
+
+        default:
+            VPU_THROW_EXCEPTION << "Unsupported mode: " << mode;
+        }
+    }
+}
+
+void Allocator::selfCheck() {
+    _allocatorOfShaves.selfCheck();
+
+    for (const auto& p : _memPools) {
+        if (!p.second->freePool.empty() || p.second->curMemOffset > 0) {
+            VPU_THROW_EXCEPTION << "Internal error in " << p.first << " allocation";
+        }
+    }
+}
+
+UsedMemory Allocator::usedMemory() const {
+    UsedMemory stats;
+
+    stats.BSS = _ddrMemoryPool.memUsed;
+    stats.CMX = _cmxMemoryPool.memUsed;
+    stats.blob = _blobMemOffset;
+    stats.input = _inputMemOffset;
+    stats.output = _outputMemOffset;
+
+    return stats;
+}
+
+void Allocator::extractDatas(MemoryType memType, const DataSet& from, DataVector& out) const {
+    for (const auto& data : from) {
+        if (data->usage() != DataUsage::Intermediate)
+            continue;
+
+        auto it = _memChunksPerData.find(data);
+        IE_ASSERT(it != _memChunksPerData.end());
+
+        auto chunk = it->second;
+        IE_ASSERT(chunk != nullptr);
+        IE_ASSERT(chunk->inUse > 0);
+
+        if (chunk->memType == memType) {
+            out.emplace_back(data);
+        }
+    }
+}
+
+DataVector Allocator::getAllocatedDatas(MemoryType memType) const {
+    DataVector out;
+
+    if (memType == MemoryType::CMX) {
+        out.reserve(_allocatedIntermData.size());
+        extractDatas(memType, _allocatedIntermData, out);
+    } else {
+        out.reserve(_allocatedData.size() + _allocatedIntermData.size());
+        extractDatas(memType, _allocatedData, out);
+        extractDatas(memType, _allocatedIntermData, out);
+    }
+
+    return out;
+}
+
+allocator::MemChunk* Allocator::allocateMem(MemoryType memType, int size, int inUse) {
+    auto& memPool = _memPools.at(memType);
+
+    //
+    // Try to reuse already allocated memory
+    //
+
+    if (auto chunk = checkMemPool(*memPool, memType, size, inUse)) {
+        memPool->memUsed = std::max(memPool->memUsed, chunk->offset + chunk->size);
+        return chunk;
+    }
+
+    //
+    // Check free space
+    //
+
+    int freeSpace = 0;
+    if (memType == MemoryType::CMX) {
+        auto shavesCMX = _allocatorOfShaves.getLockedSHAVEs() * CMX_SLICE_SIZE;
+
+        IE_ASSERT(memPool->curMemOffset + shavesCMX <= _maxCmxSize);
+
+        freeSpace = _maxCmxSize - (memPool->curMemOffset + shavesCMX);
+    } else {
+        IE_ASSERT(memPool->curMemOffset <= DDR_MAX_SIZE);
+
+        freeSpace = DDR_MAX_SIZE - memPool->curMemOffset;
+    }
+
+    if (size > freeSpace) {
+        return nullptr;
+    }
+
+    //
+    // Allocate new chunk
+    //
+
+    int pointer = 0;
+    if (memType == MemoryType::CMX) {
+        IE_ASSERT(memPool->curMemOffset + size <= _maxCmxSize);
+        pointer = _maxCmxSize - (memPool->curMemOffset + size);
+    } else {
+        pointer = memPool->curMemOffset;
+    }
+
+    auto chunk = addNewChunk(*memPool, memType, memPool->curMemOffset, pointer, size, inUse);
+    IE_ASSERT(chunk != nullptr);
+
+    memPool->curMemOffset += size;
+
+    memPool->memUsed = std::max(memPool->memUsed, chunk->offset + chunk->size);
+
+    return chunk;
+}
+
+void Allocator::freeMem(allocator::MemChunk* chunk) {
+    IE_ASSERT(chunk != nullptr);
+
+    auto& memPool = _memPools.at(chunk->memType);
+
+    allocator::FreeMemory newMem;
+    newMem.offset = chunk->offset;
+    newMem.size = chunk->size;
+
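+    // Coalesce the freed block with adjacent free blocks until no neighbor
+    // remains. Illustration: freeing [64..128) while the pool already holds
+    // [128..192) yields a single free block [64..192); if the merged block
+    // ends at curMemOffset, the pool tail is shrunk instead of storing it.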
+    while (true) {
+        bool found = false;
+
+        for (auto memPoolIt = memPool->freePool.begin(); memPoolIt != memPool->freePool.end(); ++memPoolIt) {
+            IE_ASSERT(newMem.offset != memPoolIt->offset);
+
+            if (newMem.offset + newMem.size == memPoolIt->offset) {
+                //
+                // [newMem][*memPoolIt] case
+                // extend newMem to cover *memPoolIt and remove it from the pool
+                //
+
+                newMem.size += memPoolIt->size;
+
+                memPool->freePool.erase(memPoolIt);
+
+                found = true;
+                break;
+            } else if (memPoolIt->offset + memPoolIt->size == newMem.offset) {
+                //
+                // [*memPoolIt][newMem] case
+                // extend newMem backward to cover *memPoolIt and remove it from the pool
+                //
+
+                newMem.offset = memPoolIt->offset;
+                newMem.size += memPoolIt->size;
+
+                memPool->freePool.erase(memPoolIt);
+
+                found = true;
+                break;
+            }
+        }
+
+        if (!found) {
+            if (newMem.offset + newMem.size == memPool->curMemOffset) {
+                memPool->curMemOffset = newMem.offset;
+            } else {
+                memPool->freePool.emplace_back(newMem);
+            }
+
+            break;
+        }
+    }
+
+    IE_ASSERT(chunk->_posInList != memPool->allocatedChunks.end());
+    memPool->allocatedChunks.erase(chunk->_posInList);
+}
+
+allocator::MemChunk* Allocator::addNewChunk(allocator::MemoryPool& memPool, MemoryType memType, int offset, int pointer, int size, int inUse) {
+    allocator::MemChunk newChunkValues;
+    newChunkValues.memType = memType;
+    newChunkValues.pointer = pointer;
+    newChunkValues.offset = offset;
+    newChunkValues.size = size;
+    newChunkValues.inUse = inUse;
+    auto it = memPool.allocatedChunks.emplace(memPool.allocatedChunks.end(), newChunkValues);
+
+    auto newChunk = &memPool.allocatedChunks.back();
+    newChunk->_posInList = it;
+
+    return newChunk;
+}
+
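+// Best-fit reuse: scan the free pool for the smallest block that still fits
+// `size`, carve the new chunk from the tail of that block, and drop the block
+// from the pool once it is fully consumed.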
+allocator::MemChunk* Allocator::checkMemPool(allocator::MemoryPool& memPool, MemoryType memType, int size, int inUse) {
+    auto minMemSizeToUse = std::numeric_limits<size_t>::max();
+    auto minMemIt = memPool.freePool.end();
+
+    for (auto memPoolIt = memPool.freePool.begin(); memPoolIt != memPool.freePool.end(); ++memPoolIt) {
+        if (memPoolIt->size >= size) {
+            if (memPoolIt->size < minMemSizeToUse) {
+                minMemSizeToUse = memPoolIt->size;
+                minMemIt = memPoolIt;
+            }
+        }
+    }
+
+    if (minMemIt == memPool.freePool.end()) {
+        return nullptr;
+    }
+
+    auto offset = minMemIt->offset + minMemIt->size - size;
+
+    int pointer = 0;
+    if (memType == MemoryType::DDR) {
+        pointer = offset;
+    } else {
+        IE_ASSERT(offset + size <= _maxCmxSize);
+        pointer = _maxCmxSize - offset - size;
+    }
+
+    auto chunk = addNewChunk(memPool, memType, offset, pointer, size, inUse);
+
+    minMemIt->size -= size;
+
+    if (minMemIt->size == 0) {
+        memPool.freePool.erase(minMemIt);
+    }
+
+    return chunk;
+}
+
+void Allocator::reset() {
+    const auto& env = CompileEnv::get();
+
+    _maxCmxSize = env.resources.numCMXSlices * CMX_SLICE_SIZE;
+    _allocatorOfShaves.reset();
+
+    for (auto& pool : _memPools) {
+        pool.second->clear();
+    }
+
+    _allocatedIntermData.clear();
+
+    _memChunksPerData.clear();
+}
+
+AllocationResult Allocator::preprocess(const ModelPtr& model) {
+    reset();
+
+    if (_needToAllocNonIntermData) {
+        _allocatedData.clear();
+        _allocatedData.reserve(model->numDatas());
+
+        _blobMemOffset = 0;
+        _inputMemOffset = 0;
+        _outputMemOffset = 0;
+
+        for (const auto& data : model->datas()) {
+            data->clearAllocation();
+        }
+
+        for (const auto& data : model->datas()) {
+            if (data->usage() != DataUsage::Intermediate &&
+                data->usage() != DataUsage::Temp) {
+                if (!allocateData(data)) {
+                    AllocationResult result;
+                    result.status = AllocationStatus::DATA_FAILED;
+                    result.failedStage = data->producer();
+                    return result;
+                }
+            }
+        }
+    }
+
+    _needToAllocNonIntermData = false;
+
+    return AllocationResult();
+}
+
+bool Allocator::removeCMXCandidates(const vpu::Data& data) {
+    auto it = _candidatesForCMX.find(data);
+
+    if (it != _candidatesForCMX.end()) {
+        IE_ASSERT(data->parentDataEdge() == nullptr);
+
+        if (_allocatedIntermData.count(data) != 0) {
+            if (auto producerEdge = data->producerEdge()) {
+                if (producerEdge->portInd() == 0 &&
+                    producerEdge->producer()->category() == StageCategory::HW) {
+                    return true;
+                }
+            }
+
+            freeData(data, DeallocationMode::MoveFromCMX);
+        }
+
+        loopOverData(data, [](const Data& subData) {
+            subData->setMemReqs(MemoryType::DDR);
+            return DataLoopStatus::NextChild;
+        });
+
+        _candidatesForCMX.erase(it);
+
+        return true;
+    } else {
+        auto cmxDatas = getAllocatedDatas(MemoryType::CMX);
+
+        for (const auto& cmxData : cmxDatas) {
+            IE_ASSERT(cmxData->parentDataEdge() == nullptr);
+
+            it = _candidatesForCMX.find(cmxData);
+
+            if (it != _candidatesForCMX.end()) {
+                freeData(cmxData, DeallocationMode::MoveFromCMX);
+
+                loopOverData(cmxData, [](const Data& subData) {
+                    subData->setMemReqs(MemoryType::DDR);
+                    return DataLoopStatus::NextChild;
+                });
+
+                _candidatesForCMX.erase(it);
+
+                // TODO: remove the first CMX candidate or remove all CMX candidates?
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/allocator_shaves.cpp b/inference-engine/src/vpu/graph_transformer/src/allocator_shaves.cpp
new file mode 100644 (file)
index 0000000..d6c85a8
--- /dev/null
@@ -0,0 +1,118 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/allocator_shaves.hpp>
+
+#include <unordered_set>
+#include <algorithm>
+#include <limits>
+#include <set>
+
+#include <vpu/compile_env.hpp>
+#include <vpu/utils/auto_scope.hpp>
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+namespace {
+
+// TODO: investigate the value
+const int SHAVES_LIMITATION_FOR_HW = 2;
+
+}  // namespace
+
+AllocatorForShaves::AllocatorForShaves(allocator::MemoryPool& cmxMemoryPool) : _cmxMemoryPool(cmxMemoryPool) {
+}
+
+bool AllocatorForShaves::allocateSHAVEs(
+        const Stage& stage,
+        StageSHAVEsRequirements reqs) {
+    const auto& env = CompileEnv::get();
+
+    //
+    // Check that we don't allocate twice
+    //
+
+    if (_lockedSHAVEs != 0) {
+        VPU_THROW_EXCEPTION << "Can't allocate SHAVEs: already allocated";
+    }
+
+    //
+    // Check stage requirements
+    //
+
+    if (reqs == StageSHAVEsRequirements::NotNeeded) {
+        // Stage doesn't need SHAVEs.
+        return true;
+    }
+
+    //
+    // Check the amount of free SHAVEs
+    //
+
+    auto usedCMXslices = (_cmxMemoryPool.curMemOffset + CMX_SLICE_SIZE - 1) / CMX_SLICE_SIZE;
+    IE_ASSERT(usedCMXslices <= env.resources.numCMXSlices);
+
+    const auto numAvailableSHAVEs = std::min(env.resources.numCMXSlices - usedCMXslices, env.resources.numSHAVEs);
+    if (numAvailableSHAVEs == 0) {
+        return false;
+    }
+
+    int necessarySHAVEsNum = numAvailableSHAVEs;
+    if (reqs == StageSHAVEsRequirements::NeedMax) {
+        if (numAvailableSHAVEs < env.resources.numSHAVEs) {
+            return false;
+        }
+    } else if (reqs == StageSHAVEsRequirements::OnlyOne) {
+        necessarySHAVEsNum = 1;
+    } else if (reqs == StageSHAVEsRequirements::TwoOrOne) {
+        necessarySHAVEsNum = std::min(numAvailableSHAVEs, 2);
+    } else if (reqs == StageSHAVEsRequirements::CanBeLimited) {
+        bool needToLimit = false;
+        if (stage->category() == StageCategory::HW) {
+            needToLimit = true;
+        }
+        for (const auto& prevStage : stage->prevStages()) {
+            if (prevStage->category() == StageCategory::HW) {
+                needToLimit = true;
+                break;
+            }
+        }
+        for (const auto& nextStage : stage->nextStages()) {
+            if (nextStage->category() == StageCategory::HW) {
+                needToLimit = true;
+                break;
+            }
+        }
+
+        if (needToLimit) {
+            necessarySHAVEsNum = std::min(numAvailableSHAVEs, SHAVES_LIMITATION_FOR_HW);
+        }
+    }
+
+    //
+    // Lock SHAVEs
+    //
+
+    _lockedSHAVEs = necessarySHAVEsNum;
+
+    stage->setNumSHAVEs(_lockedSHAVEs);
+
+    return true;
+}
+
+void AllocatorForShaves::freeSHAVEs() {
+    _lockedSHAVEs = 0;
+}
+
+void AllocatorForShaves::reset() {
+    _lockedSHAVEs = 0;
+}
+
+void AllocatorForShaves::selfCheck() {
+    if (_lockedSHAVEs > 0) {
+        VPU_THROW_EXCEPTION << "Internal error in SHAVEs allocation";
+    }
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/backend/backend.cpp b/inference-engine/src/vpu/graph_transformer/src/backend/backend.cpp
new file mode 100644 (file)
index 0000000..1db9325
--- /dev/null
@@ -0,0 +1,112 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/backend/backend.hpp>
+
+#include <cassert>
+#include <cctype>
+#include <cstdlib>
+#include <iomanip>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <vpu/compile_env.hpp>
+#include <vpu/utils/file_system.hpp>
+#include <vpu/utils/io.hpp>
+
+namespace vpu {
+
+void BackEnd::extractDataInfo(
+        const Model::Ptr& model,
+        DataInfo& inputInfo,
+        DataInfo& outputInfo) {
+    for (const auto& data : model->datas()) {
+        if (DataUsage::Input == data->usage()) {
+            IE_ASSERT(inputInfo.offset.count(data->name()) == 0);
+
+            auto ioBufferOffset = data->attrs().get<int>("ioBufferOffset");
+            IE_ASSERT(ioBufferOffset + data->totalByteSize() <= inputInfo.totalSize);
+
+            inputInfo.offset[data->name()] = ioBufferOffset;
+        } else if (DataUsage::Output == data->usage()) {
+            IE_ASSERT(outputInfo.offset.count(data->name()) == 0);
+
+            auto ioBufferOffset = data->attrs().get<int>("ioBufferOffset");
+            IE_ASSERT(ioBufferOffset + data->totalByteSize() <= outputInfo.totalSize);
+
+            outputInfo.offset[data->name()] = ioBufferOffset;
+        }
+    }
+}
+
+CompiledGraph::Ptr BackEnd::build(
+        const Model::Ptr& model,
+        const std::vector<ie::CNNLayerPtr>& allLayers) {
+    auto compiledGraph = std::make_shared<CompiledGraph>();
+
+    compiledGraph->networkName = model->name();
+    compiledGraph->networkBatch = model->batchSize();
+
+    auto usedMemory = model->attrs().get<UsedMemory>("usedMemory");
+    compiledGraph->inputBufSize = usedMemory.input;
+    compiledGraph->outputBufSize = usedMemory.output;
+
+    compiledGraph->inputInfo.totalSize  = usedMemory.input;
+    compiledGraph->outputInfo.totalSize = usedMemory.output;
+
+    extractDataInfo(model, compiledGraph->inputInfo, compiledGraph->outputInfo);
+
+    serialize(model, compiledGraph->blob, compiledGraph->blobHeader, compiledGraph->numActiveStages);
+    getMetaData(model, allLayers, compiledGraph->stagesMeta);
+
+    return compiledGraph;
+}
+
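+// Debug helper: in non-NDEBUG builds the model is dumped as a GraphViz .dot
+// file, controlled by the following environment variables (see below):
+//   IE_VPU_DUMP_INTERNAL_GRAPH_DIRECTORY - directory for per-model dumps;
+//   IE_VPU_DUMP_INTERNAL_GRAPH_FILE_NAME - explicit output file name;
+//   IE_VPU_DUMP_ALL_PASSES               - non-zero to also dump per-pass
+//                                          snapshots (the `postfix` argument).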
+void BackEnd::dumpModel(
+        const Model::Ptr& model,
+        const std::string& postfix) {
+#ifdef NDEBUG
+    (void)model;
+    (void)postfix;
+#else
+    std::string fileName;
+
+    if (auto envVar = std::getenv("IE_VPU_DUMP_INTERNAL_GRAPH_DIRECTORY")) {
+        auto modelName = model->name();
+
+        // Replace "bad" characters
+        for (auto& ch : modelName) {
+            if (!std::isalnum(ch)) {
+                ch = '_';
+            }
+        }
+
+        std::ostringstream ostr;
+        ostr << envVar << "/" << "vpu_graph_" << std::setw(2) << std::setfill('0') << model->attrs().get<int>("index") << "_" << modelName;
+
+        fileName = ostr.str();
+    } else if (auto envVar = std::getenv("IE_VPU_DUMP_INTERNAL_GRAPH_FILE_NAME")) {
+        fileName = envVar;
+    }
+
+    if (fileName.empty()) {
+        return;
+    }
+
+    if (!postfix.empty()) {
+        if (auto envVar = std::getenv("IE_VPU_DUMP_ALL_PASSES")) {
+            if (std::stoi(envVar) == 0) {
+                return;
+            }
+
+            fileName = formatString("%s_%s", fileNameNoExt(fileName), postfix);
+        } else {
+            return;
+        }
+    }
+
+    auto dotFileName = formatString("%s.dot", fileNameNoExt(fileName));
+    dumpModelToDot(model, dotFileName);
+#endif
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/backend/dump_to_dot.cpp b/inference-engine/src/vpu/graph_transformer/src/backend/dump_to_dot.cpp
new file mode 100644 (file)
index 0000000..005e2cb
--- /dev/null
@@ -0,0 +1,384 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifndef NDEBUG
+
+#include <vpu/backend/backend.hpp>
+
+#include <climits>
+#include <cstring>
+
+#include <string>
+#include <memory>
+#include <list>
+#include <vector>
+#include <array>
+#include <unordered_set>
+#include <set>
+#include <unordered_map>
+#include <fstream>
+#include <utility>
+#include <algorithm>
+#include <map>
+#include <streambuf>
+#include <tuple>
+#include <sstream>
+#include <iomanip>
+#include <atomic>
+
+#include <precision_utils.h>
+#include <details/caseless.hpp>
+#include <graph_tools.hpp>
+#include <description_buffer.hpp>
+#include <xml_parse_utils.h>
+
+#include <vpu/parsed_config.hpp>
+#include <vpu/utils/auto_scope.hpp>
+#include <vpu/utils/dot_io.hpp>
+#include <vpu/utils/file_system.hpp>
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+namespace {
+
+std::string dataDotName(const Data& data) {
+    std::ostringstream ostr;
+    ostr << "data_" << static_cast<const void*>(data.get());
+    return ostr.str();
+}
+
+std::string stageDotName(const Stage& stage) {
+    std::ostringstream ostr;
+    ostr << "stage_" << static_cast<const void*>(stage.get());
+    return ostr.str();
+}
+
+void dumpStageToDot(DotSerializer& out, const Stage& stage, int stageExecIdx) {
+    std::string stageColor = "gold";
+    if (stageExecIdx < 0) {
+        stageColor = "azure";
+    }
+
+    out.append("%s [", stageDotName(stage));
+    {
+        VPU_DOT_IDENT(out);
+
+        out.append("shape=ellipse");
+        out.append("style=filled");
+        out.append("fillcolor=%s", stageColor);
+
+        std::ostringstream caption;
+        caption << "[" << stageExecIdx << " / " << stage->index() << "] Stage " << stage->name();
+
+        DotLabel lbl(caption.str(), out);
+        lbl.appendPair("type", stage->type());
+        if (stage->origLayer() != nullptr) {
+            lbl.appendPair("origLayer", stage->origLayer());
+        }
+        lbl.appendPair("numSHAVEs", stage->numSHAVEs());
+        if (!stage->attrs().empty()) {
+            lbl.appendPair("extraAttrs", stage->attrs());
+        }
+    }
+    out.append("];");
+}
+
+}  // namespace
+
+void BackEnd::dumpModelToDot(
+        const Model::Ptr& model,
+        const std::string& fileName) {
+    VPU_PROFILE(dumpModelToDot);
+
+    std::ofstream file(fileName);
+    if (!file.is_open()) {
+        VPU_THROW_EXCEPTION << "Failed to open DOT file " << fileName;
+    }
+
+    DotSerializer out(file);
+
+    out.append("digraph ie_vpu_graph {");
+    {
+        VPU_DOT_IDENT(out);
+
+        out.append("labelloc=top;");
+        out.append("labeljust=left;");
+
+        {
+            DotLabel lbl("Graph " + model->name(), out);
+            lbl.appendPair("batchSize", model->batchSize());
+            if (!model->attrs().empty()) {
+                lbl.appendPair("extraAttrs", model->attrs());
+            }
+        }
+
+        //
+        // Dump datas
+        //
+
+        for (const auto& data : model->datas()) {
+            std::string dataColor = "white";
+            if (data->usage() == DataUsage::Input) {
+                dataColor = "green";
+            } else if (data->usage() == DataUsage::Output) {
+                dataColor = "deepskyblue";
+            } else if (data->usage() == DataUsage::Const) {
+                dataColor = "aquamarine";
+            } else if (data->usage() == DataUsage::Temp) {
+                dataColor = "cyan";
+            } else if (data->usage() == DataUsage::Intermediate) {
+                if (data->location() == DataLocation::BSS) {
+                    dataColor = "cyan";
+                } else if (data->location() == DataLocation::CMX) {
+                    dataColor = "magenta";
+                } else if (data->location() == DataLocation::Blob) {
+                    dataColor = "aquamarine";
+                } else if (data->location() == DataLocation::Input) {
+                    dataColor = "green";
+                } else if (data->location() == DataLocation::Output) {
+                    dataColor = "deepskyblue";
+                }
+            }
+
+            out.append("%s [", dataDotName(data));
+            {
+                VPU_DOT_IDENT(out);
+
+                out.append("shape=box");
+                out.append("style=filled");
+                out.append("fillcolor=%s", dataColor);
+
+                DotLabel lbl("Data " + data->name(), out);
+                lbl.appendPair("usage", data->usage());
+                lbl.appendPair("desc", data->desc());
+                lbl.appendPair("requiredStrides", data->requiredStrides());
+                lbl.appendPair("strides", data->strides());
+                if (data->origData() != nullptr) {
+                    lbl.appendPair("origData", data->origData());
+                }
+                if (data->content() != nullptr) {
+                    if (data->desc().type() == DataType::U8) {
+                        auto contentPtr = data->content()->get<uint8_t>();
+                        auto count = data->desc().totalDimSize();
+
+                        std::vector<int> temp(
+                            contentPtr,
+                            contentPtr + std::min(count, 8));
+
+                        lbl.appendPair("content", temp);
+                    } else if (data->desc().type() == DataType::FP16) {
+                        auto contentPtr = data->content()->get<fp16_t>();
+                        auto count = data->desc().totalDimSize();
+
+                        std::vector<float> temp(std::min(count, 8));
+                        ie::PrecisionUtils::f16tof32Arrays(temp.data(), contentPtr, temp.size());
+
+                        lbl.appendPair("content", temp);
+                    }
+                }
+                lbl.appendPair("memReqs", data->memReqs());
+                lbl.appendPair("location", data->location());
+                lbl.appendPair("memoryOffset", data->memoryOffset());
+                if (!data->attrs().empty()) {
+                    lbl.appendPair("extraAttrs", data->attrs());
+                }
+            }
+            out.append("];");
+        }
+
+        //
+        // Dump stages
+        //
+
+        int stageExecIdx = 0;
+        for (const auto& stage : model->getStages()) {
+            if (stage->category() == StageCategory::Special) {
+                dumpStageToDot(out, stage, -1);
+            } else {
+                dumpStageToDot(out, stage, stageExecIdx);
+            }
+
+            for (const auto& injectedStageEdge : stage->injectedStageEdges()) {
+                dumpStageToDot(out, injectedStageEdge->child(), stageExecIdx);
+            }
+
+            if (stage->category() != StageCategory::Special) {
+                ++stageExecIdx;
+            }
+        }
+
+        //
+        // Dump Stage <-> Data edges
+        //
+
+        for (const auto& stage : model->getStages()) {
+            for (const auto& inEdge : stage->inputEdges()) {
+                out.append("%s -> %s [", dataDotName(inEdge->input()), stageDotName(stage));
+                {
+                    VPU_DOT_IDENT(out);
+
+                    if (inEdge->childEdge() != nullptr) {
+                        out.append("style=dotted");
+                    }
+
+                    DotLabel lbl("StageInput", out);
+                    lbl.appendPair("portInd", inEdge->portInd());
+                    if (!inEdge->attrs().empty()) {
+                        lbl.appendPair("extraAttrs", inEdge->attrs());
+                    }
+                }
+                out.append("];");
+            }
+
+            for (const auto& outEdge : stage->outputEdges()) {
+                out.append("%s -> %s [", stageDotName(stage), dataDotName(outEdge->output()));
+                {
+                    VPU_DOT_IDENT(out);
+
+                    if (outEdge->childEdge() != nullptr) {
+                        out.append("style=dotted");
+                    }
+
+                    DotLabel lbl("StageOutput", out);
+                    lbl.appendPair("portInd", outEdge->portInd());
+                    if (!outEdge->attrs().empty()) {
+                        lbl.appendPair("extraAttrs", outEdge->attrs());
+                    }
+                }
+                out.append("];");
+            }
+
+            for (const auto& tempBufferEdge : stage->tempBufferEdges()) {
+                out.append("%s -> %s [", dataDotName(tempBufferEdge->tempBuffer()), stageDotName(stage));
+                {
+                    VPU_DOT_IDENT(out);
+
+                    if (tempBufferEdge->childEdge() != nullptr) {
+                        out.append("style=dotted");
+                    }
+
+                    DotLabel lbl("Temp buffer", out);
+                    lbl.appendPair("portInd", tempBufferEdge->portInd());
+                    if (!tempBufferEdge->attrs().empty()) {
+                        lbl.appendPair("extraAttrs", tempBufferEdge->attrs());
+                    }
+                }
+                out.append("];");
+            }
+
+            for (const auto& injectedStageEdge : stage->injectedStageEdges()) {
+                auto injectedStage = injectedStageEdge->child();
+
+                for (const auto& inEdge : injectedStage->inputEdges()) {
+                    out.append("%s -> %s [", dataDotName(inEdge->input()), stageDotName(injectedStage));
+                    {
+                        VPU_DOT_IDENT(out);
+
+                        DotLabel lbl("StageInput", out);
+                        lbl.appendPair("portInd", inEdge->portInd());
+                        if (!inEdge->attrs().empty()) {
+                            lbl.appendPair("extraAttrs", inEdge->attrs());
+                        }
+                    }
+                    out.append("];");
+                }
+
+                for (const auto& outEdge : injectedStage->outputEdges()) {
+                    out.append("%s -> %s [", stageDotName(injectedStage), dataDotName(outEdge->output()));
+                    {
+                        VPU_DOT_IDENT(out);
+
+                        DotLabel lbl("StageOutput", out);
+                        lbl.appendPair("portInd", outEdge->portInd());
+                        if (!outEdge->attrs().empty()) {
+                            lbl.appendPair("extraAttrs", outEdge->attrs());
+                        }
+                    }
+                    out.append("];");
+                }
+
+                for (const auto& tempBufferEdge : injectedStage->tempBufferEdges()) {
+                    out.append("%s -> %s [", dataDotName(tempBufferEdge->tempBuffer()), stageDotName(injectedStage));
+                    {
+                        VPU_DOT_IDENT(out);
+
+                        DotLabel lbl("Temp buffer", out);
+                        lbl.appendPair("portInd", tempBufferEdge->portInd());
+                        if (!tempBufferEdge->attrs().empty()) {
+                            lbl.appendPair("extraAttrs", tempBufferEdge->attrs());
+                        }
+                    }
+                    out.append("];");
+                }
+            }
+        }
+
+        //
+        // Dump Data<->Data edges
+        //
+
+        for (const auto& data : model->datas()) {
+            if (auto edge = data->parentDataEdge()) {
+                out.append("%s -> %s [", dataDotName(edge->child()), dataDotName(edge->parent()));
+                {
+                    VPU_DOT_IDENT(out);
+
+                    out.append("style=dotted");
+
+                    DotLabel lbl("SharedAllocation", out);
+                    lbl.appendPair("mode", edge->mode());
+                    lbl.appendPair("order", edge->order());
+                    if (!edge->attrs().empty()) {
+                        lbl.appendPair("extraAttrs", edge->attrs());
+                    }
+                }
+                out.append("];");
+            }
+        }
+
+        //
+        // Dump Stage<->Stage edges
+        //
+
+        for (const auto& stage : model->getStages()) {
+            for (const auto& injectedStageEdge : stage->injectedStageEdges()) {
+                out.append("%s -> %s [", stageDotName(stage), stageDotName(injectedStageEdge->child()));
+                {
+                    VPU_DOT_IDENT(out);
+
+                    out.append("style=dotted");
+
+                    DotLabel lbl("Injected Stage", out);
+                    lbl.appendPair("portInd", injectedStageEdge->portInd());
+                    if (!injectedStageEdge->attrs().empty()) {
+                        lbl.appendPair("extraAttrs", injectedStageEdge->attrs());
+                    }
+                }
+                out.append("];");
+            }
+
+            if (stage->numInjectedStages() > 0) {
+                out.append("{");
+                {
+                    VPU_DOT_IDENT(out);
+
+                    out.append("rank=same;");
+
+                    out.append("%s", stageDotName(stage));
+
+                    for (const auto& injectedStageEdge : stage->injectedStageEdges()) {
+                        out.append(", %s", stageDotName(injectedStageEdge->child()));
+                    }
+                }
+                out.append("}");
+            }
+        }
+    }
+    out.append("}");
+}
+
+}  // namespace vpu
+
+#endif
diff --git a/inference-engine/src/vpu/graph_transformer/src/backend/get_meta_data.cpp b/inference-engine/src/vpu/graph_transformer/src/backend/get_meta_data.cpp
new file mode 100644 (file)
index 0000000..ea34695
--- /dev/null
@@ -0,0 +1,151 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/backend/backend.hpp>
+
+#include <climits>
+#include <cstring>
+
+#include <string>
+#include <memory>
+#include <list>
+#include <vector>
+#include <array>
+#include <unordered_set>
+#include <set>
+#include <unordered_map>
+#include <fstream>
+#include <utility>
+#include <algorithm>
+#include <map>
+#include <streambuf>
+#include <tuple>
+#include <sstream>
+#include <iomanip>
+#include <atomic>
+
+#include <precision_utils.h>
+#include <details/caseless.hpp>
+#include <graph_tools.hpp>
+#include <description_buffer.hpp>
+#include <xml_parse_utils.h>
+
+#include <vpu/parsed_config.hpp>
+#include <vpu/utils/auto_scope.hpp>
+#include <vpu/utils/dot_io.hpp>
+#include <vpu/utils/file_system.hpp>
+
+namespace vpu {
+
+void BackEnd::getMetaData(
+        const Model::Ptr& model,
+        const std::vector<ie::CNNLayerPtr>& allLayers,
+        std::vector<StageMetaInfo>& metaData) {
+    VPU_PROFILE(getMetaData);
+
+    metaData.clear();
+    metaData.reserve(3 * model->numStages() / 2 + 1);
+
+    std::unordered_set<ie::CNNLayerPtr> visitedLayers;
+
+    auto getStageMeta = [&visitedLayers](const Stage& stage) -> StageMetaInfo {
+        StageMetaInfo meta;
+
+        meta.stageName = stage->name();
+        meta.stageType = toString(stage->type());
+
+        if (stage->numInjectedStages() > 0) {
+            meta.stageName += " + injected[";
+            meta.stageType += " + injected[";
+
+            int ind = 0;
+            for (const auto& injectedStageEdge : stage->injectedStageEdges()) {
+                if (ind != 0) {
+                    meta.stageName += ", ";
+                    meta.stageType += ", ";
+                }
+
+                meta.stageName += injectedStageEdge->child()->name();
+                meta.stageType += toString(injectedStageEdge->child()->type());
+
+                ++ind;
+            }
+
+            meta.stageName += "]";
+            meta.stageType += "]";
+        }
+
+        if (stage->origLayer() == nullptr) {
+            meta.layerName = "<Extra>";
+            meta.layerType = "<Extra>";
+        } else {
+            meta.layerName = stage->origLayer()->name;
+            meta.layerType = stage->origLayer()->type;
+            visitedLayers.insert(stage->origLayer());
+        }
+
+        return meta;
+    };
+
+    //
+    // Add real stages
+    //
+
+    for (const auto& stage : model->getStages()) {
+        if (stage->category() == StageCategory::Special) {
+            continue;
+        }
+
+        auto meta = getStageMeta(stage);
+        meta.status = ie::InferenceEngineProfileInfo::EXECUTED;
+        metaData.emplace_back(std::move(meta));
+    }
+
+    //
+    // Receive-Tensor time
+    //
+
+    // TODO: support a config option to disable timings and skip this meta entry when the user does not need it
+    StageMetaInfo receiveTensorMeta;
+    receiveTensorMeta.stageName = "<Receive-Tensor>";
+    receiveTensorMeta.stageType = "<Receive-Tensor>";
+    receiveTensorMeta.layerName = "<Receive-Tensor>";
+    receiveTensorMeta.layerType = "<Receive-Tensor>";
+    receiveTensorMeta.status = ie::InferenceEngineProfileInfo::EXECUTED;
+    metaData.emplace_back(std::move(receiveTensorMeta));
+
+    //
+    // Add special stages
+    //
+
+    for (const auto& stage : model->getStages()) {
+        if (stage->category() != StageCategory::Special) {
+            continue;
+        }
+
+        auto meta = getStageMeta(stage);
+        meta.status = ie::InferenceEngineProfileInfo::OPTIMIZED_OUT;
+        metaData.emplace_back(std::move(meta));
+    }
+
+    //
+    // Add optimized layers
+    //
+
+    for (const auto& layer : allLayers) {
+        if (visitedLayers.count(layer) != 0) {
+            continue;
+        }
+
+        StageMetaInfo meta;
+        meta.stageName = "<none>";
+        meta.stageType = "<none>";
+        meta.layerName = layer->name;
+        meta.layerType = layer->type;
+        meta.status = ie::InferenceEngineProfileInfo::LayerStatus::OPTIMIZED_OUT;
+        metaData.emplace_back(std::move(meta));
+    }
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/backend/serialize.cpp b/inference-engine/src/vpu/graph_transformer/src/backend/serialize.cpp
new file mode 100644 (file)
index 0000000..b676c14
--- /dev/null
@@ -0,0 +1,227 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/backend/backend.hpp>
+
+#include <climits>
+#include <cstring>
+
+#include <string>
+#include <memory>
+#include <list>
+#include <vector>
+#include <array>
+#include <unordered_set>
+#include <set>
+#include <unordered_map>
+#include <fstream>
+#include <utility>
+#include <algorithm>
+#include <map>
+#include <streambuf>
+#include <tuple>
+#include <sstream>
+#include <iomanip>
+#include <atomic>
+
+#include <precision_utils.h>
+#include <details/caseless.hpp>
+#include <graph_tools.hpp>
+#include <description_buffer.hpp>
+#include <xml_parse_utils.h>
+
+#include <vpu/parsed_config.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/backend/blob_format.hpp>
+#include <vpu/utils/auto_scope.hpp>
+#include <vpu/utils/dot_io.hpp>
+#include <vpu/utils/file_system.hpp>
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+void BackEnd::serialize(
+        const Model::Ptr& model,
+        std::vector<char>& blob,
+        std::pair<char*, size_t>& blobHeader,
+        int& numActiveStages) {
+    VPU_PROFILE(serialize);
+
+    const auto& env = CompileEnv::get();
+
+    auto batchSize = model->batchSize();
+    auto usedMemory = model->attrs().get<UsedMemory>("usedMemory");
+
+    //
+    // Remove special stages from the stages list
+    //
+
+    bool hasHwStage = false;
+    bool hasShaveStage = false;
+    bool hasDmaStage = false;
+
+    StageVector execStages;
+    execStages.reserve(model->numStages());
+
+    for (const auto& stage : model->getStages()) {
+        if (stage->category() == StageCategory::Special) {
+            continue;
+        }
+
+        if (stage->category() == StageCategory::HW) {
+            hasHwStage = true;
+        } else if (stage->category() == StageCategory::SHAVE) {
+            hasShaveStage = true;
+        } else if (stage->category() == StageCategory::DMA) {
+            hasDmaStage = true;
+        }
+
+        execStages.emplace_back(stage);
+    }
+
+    numActiveStages = execStages.size();
+
+    //
+    // I/O info sections
+    //
+
+    int numInputs = 0;
+    BlobSerializer inputInfoSerializer;
+    for (const auto& data : model->datas()) {
+        if (data->usage() != DataUsage::Input) {
+            continue;
+        }
+
+        IE_ASSERT(data->producerEdge() == nullptr);
+        IE_ASSERT(data->parentDataEdge() == nullptr);
+        IE_ASSERT(data->numConsumers() != 0);
+
+        IE_ASSERT(!data->attrs().has("ioIdx"));
+        data->attrs().set("ioIdx", numInputs);
+
+        data->serializeIOInfo(inputInfoSerializer);
+
+        ++numInputs;
+    }
+
+    int numOutputs = 0;
+    BlobSerializer outputInfoSerializer;
+    for (const auto& data : model->datas()) {
+        if (data->usage() != DataUsage::Output) {
+            continue;
+        }
+
+        IE_ASSERT(data->producerEdge() != nullptr);
+        IE_ASSERT(data->parentDataEdge() == nullptr);
+
+        IE_ASSERT(!data->attrs().has("ioIdx"));
+        data->attrs().set("ioIdx", numOutputs);
+
+        data->serializeIOInfo(outputInfoSerializer);
+
+        ++numOutputs;
+    }
+
+    //
+    // Stages section
+    //
+
+    BlobSerializer stagesSerializer;
+    for (const auto& stage : execStages) {
+        stage->serialize(stagesSerializer);
+    }
+
+    //
+    // Elf header
+    //
+
+    ElfN_Ehdr elfHdr = {};
+    elfHdr.e_ident[0] = 0x7f;
+    elfHdr.e_ident[1] = 'e';
+    elfHdr.e_ident[2] = 'l';
+    elfHdr.e_ident[3] = 'f';
+    for (int i = 4; i < 16; i++) {
+        elfHdr.e_ident[i] = 0;
+    }
+    elfHdr.e_type = 1;
+    elfHdr.e_machine = 2;
+    elfHdr.e_version = 2;
+    elfHdr.e_entry = 0;
+    elfHdr.e_phoff = 0;
+    elfHdr.e_shoff = 0;
+    elfHdr.e_ehsize = 8 * sizeof(elfHdr);
+
+    //
+    // Blob header
+    //
+
+    auto hdrSize = alignVal<int>(sizeof(ElfN_Ehdr) + sizeof(mv_blob_header), 64);
+    auto inputInfoSecSize = alignVal(inputInfoSerializer.size(), 64);
+    auto outputInfoSecSize = alignVal(outputInfoSerializer.size(), 64);
+    auto stagesSecSize = alignVal(stagesSerializer.size(), 64);
+    auto constDataSecSize = alignVal(usedMemory.blob, 64);
+
+    mv_blob_header blobHdr = {};
+    blobHdr.magic_number = BLOB_MAGIC_NUMBER;
+    blobHdr.file_size = checked_cast<uint32_t>(hdrSize + inputInfoSecSize + outputInfoSecSize + stagesSecSize + constDataSecSize);
+    blobHdr.blob_ver_major = BLOB_VERSION_MAJOR;
+    blobHdr.blob_ver_minor = BLOB_VERSION_MINOR;
+    blobHdr.inputs_count = checked_cast<uint32_t>(numInputs);
+    blobHdr.outputs_count = checked_cast<uint32_t>(numOutputs);
+    blobHdr.stages_count = checked_cast<uint32_t>(execStages.size());
+    blobHdr.inputs_size = checked_cast<uint32_t>(usedMemory.input);
+    blobHdr.outputs_size = checked_cast<uint32_t>(usedMemory.output);
+    blobHdr.batch_size = checked_cast<uint32_t>(batchSize);
+    blobHdr.bss_mem_size = checked_cast<uint32_t>(usedMemory.BSS);
+    blobHdr.number_of_cmx_slices = checked_cast<uint32_t>(env.resources.numCMXSlices);
+    blobHdr.number_of_shaves = checked_cast<uint32_t>(env.resources.numSHAVEs);
+    blobHdr.has_hw_stage = checked_cast<uint32_t>(hasHwStage);
+    blobHdr.has_shave_stage = checked_cast<uint32_t>(hasShaveStage);
+    blobHdr.has_dma_stage = checked_cast<uint32_t>(hasDmaStage);
+    blobHdr.input_info_section_offset = checked_cast<uint32_t>(hdrSize);
+    blobHdr.output_info_section_offset = checked_cast<uint32_t>(blobHdr.input_info_section_offset + inputInfoSecSize);
+    blobHdr.stage_section_offset = checked_cast<uint32_t>(blobHdr.output_info_section_offset + outputInfoSecSize);
+    blobHdr.const_data_section_offset = checked_cast<uint32_t>(blobHdr.stage_section_offset + stagesSecSize);
+
+    //
+    // Generate fathom blob
+    //
+
+    blob.clear();
+    blob.resize(blobHdr.file_size, 0);
+
+    std::copy_n(&elfHdr, 1, reinterpret_cast<ElfN_Ehdr*>(blob.data()));
+    std::copy_n(&blobHdr, 1, reinterpret_cast<mv_blob_header*>(blob.data() + sizeof(elfHdr)));
+    std::copy_n(inputInfoSerializer.data(), inputInfoSerializer.size(), blob.data() + blobHdr.input_info_section_offset);
+    std::copy_n(outputInfoSerializer.data(), outputInfoSerializer.size(), blob.data() + blobHdr.output_info_section_offset);
+    std::copy_n(stagesSerializer.data(), stagesSerializer.size(), blob.data() + blobHdr.stage_section_offset);
+
+    for (const auto& data : model->datas()) {
+        if (data->usage() != DataUsage::Const) {
+            continue;
+        }
+
+        IE_ASSERT(data->producerEdge() == nullptr);
+        IE_ASSERT(data->parentDataEdge() == nullptr);
+        IE_ASSERT(data->numConsumers() != 0);
+        IE_ASSERT(data->location() == DataLocation::Blob);
+
+        auto content = data->content();
+        IE_ASSERT(content != nullptr);
+
+        std::copy_n(content->get<uint8_t>(), data->totalByteSize(), blob.data() + blobHdr.const_data_section_offset + data->memoryOffset());
+    }
+
+    //
+    // Blob header: the beginning of the blob, containing the ELF header and the blob header
+    //
+
+    blobHeader.first = blob.data();
+    blobHeader.second = sizeof(ElfN_Ehdr) + sizeof(mv_blob_header);
+
+    env.log->info("blobSize=%d", blob.size());
+}
+
+}  // namespace vpu
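
For orientation, the resulting blob layout is: ELF header, blob header, then four 64-byte-aligned sections (input info, output info, stages, const data). A minimal standalone sketch of the offset arithmetic, with `alignUp` as a local stand-in for `alignVal` and purely illustrative section sizes:

    #include <cstdio>

    // Local stand-in for vpu::alignVal: round v up to a multiple of align.
    static int alignUp(int v, int align) { return (v + align - 1) / align * align; }

    int main() {
        // Illustrative sizes only; the real values come from the serializers above.
        const int elfAndBlobHdr = 52 + 80;  // sizeof(ElfN_Ehdr) + sizeof(mv_blob_header), hypothetical
        const int inputInfo = 100, outputInfo = 60, stages = 4000, constData = 12345;

        int hdrSize        = alignUp(elfAndBlobHdr, 64);
        int inputInfoSize  = alignUp(inputInfo, 64);
        int outputInfoSize = alignUp(outputInfo, 64);
        int stagesSize     = alignUp(stages, 64);
        int constDataSize  = alignUp(constData, 64);

        int inputOff  = hdrSize;                    // input_info_section_offset
        int outputOff = inputOff + inputInfoSize;   // output_info_section_offset
        int stageOff  = outputOff + outputInfoSize; // stage_section_offset
        int constOff  = stageOff + stagesSize;      // const_data_section_offset

        std::printf("sections at %d %d %d %d, file size %d\n",
                    inputOff, outputOff, stageOff, constOff, constOff + constDataSize);
        return 0;
    }
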
diff --git a/inference-engine/src/vpu/graph_transformer/src/blob_reader.cpp b/inference-engine/src/vpu/graph_transformer/src/blob_reader.cpp
new file mode 100644 (file)
index 0000000..bd9bbbf
--- /dev/null
@@ -0,0 +1,196 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/blob_reader.hpp>
+
+#include <sstream>
+#include <memory>
+#include <vector>
+#include <string>
+
+#include <ie_input_info.hpp>
+
+#include <vpu/graph_transformer.hpp>
+#include <vpu/backend/blob_format.hpp>
+#include <vpu/model/data.hpp>
+
+namespace vpu {
+
+namespace {
+
+template <typename T>
+T readFromBlob(const std::vector<char>& blob, uint32_t& offset) {
+    IE_ASSERT(offset + sizeof(T) <= blob.size());
+
+    auto srcPtr = blob.data() + offset;
+    offset += sizeof(T);
+
+    return *reinterpret_cast<const T*>(srcPtr);
+}
+
+ie::Precision vpuDataTypeToIE(DataType dataType) {
+    auto iePrecision = ie::Precision::UNSPECIFIED;
+
+    switch (dataType) {
+    case DataType::U8:
+        iePrecision = ie::Precision::U8;
+        break;
+    case DataType::FP16:
+        iePrecision = ie::Precision::FP16;
+        break;
+    case DataType::FP32:
+        iePrecision = ie::Precision::FP32;
+        break;
+    default:
+        VPU_THROW_EXCEPTION << "BlobReader error: unsupported dataType " << dataType;
+    }
+
+    return iePrecision;
+}
+
+ie::Layout vpuDimsOrderToIE(DimsOrder dimsOrder) {
+    auto ieLayout = ie::Layout::ANY;
+
+    if (DimsOrder::C == dimsOrder) {
+        ieLayout = ie::Layout::C;
+    } else if (DimsOrder::NC == dimsOrder) {
+        ieLayout = ie::Layout::NC;
+    } else if (DimsOrder::CHW == dimsOrder) {
+        ieLayout = ie::Layout::CHW;
+    } else if (DimsOrder::NCHW == dimsOrder) {
+        ieLayout = ie::Layout::NCHW;
+    } else if (DimsOrder::NHWC == dimsOrder) {
+        ieLayout = ie::Layout::NHWC;
+    } else {
+        VPU_THROW_EXCEPTION << "BlobReader error: unsupported dimsOrder " << toString(dimsOrder);
+    }
+
+    return ieLayout;
+}
+
+ie::SizeVector vpuDimsToIE(const DimValues& dimValues) {
+    auto order = DimsOrder::fromNumDims(dimValues.size());
+    auto perm = order.toPermutation();
+
+    ie::SizeVector ieDims(perm.size());
+    for (size_t i = 0; i < perm.size(); ++i) {
+        ieDims[ieDims.size() - 1 - i] = dimValues[perm[i]];
+    }
+
+    return ieDims;
+}
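+
+// Worked example (illustrative): for a 4-D blob, DimsOrder::fromNumDims(4)
+// gives NCHW, whose toPermutation() lists the dims minor-to-major, e.g.
+// {W, H, C, N}. The loop above writes them back-to-front, so the resulting
+// SizeVector is {N, C, H, W} - the usual major-to-minor order IE expects.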
+
+}  // namespace
+
+void BlobReader::parse(const std::vector<char>& blob) {
+    if (blob.empty() || blob.size() < sizeof(ElfN_Ehdr) + sizeof(mv_blob_header)) {
+        VPU_THROW_EXCEPTION << "BlobReader error: Blob is empty";
+    }
+
+    _pBlob = blob.data();
+
+    _blobHeader = *reinterpret_cast<const mv_blob_header*>(blob.data() + sizeof(ElfN_Ehdr));
+    if (_blobHeader.magic_number != BLOB_MAGIC_NUMBER) {
+        VPU_THROW_EXCEPTION << "BlobReader error: The magic number imported blob doesn't match graph_transformer";
+    }
+    if (_blobHeader.blob_ver_major != BLOB_VERSION_MAJOR || _blobHeader.blob_ver_minor != BLOB_VERSION_MINOR) {
+        VPU_THROW_EXCEPTION << "BlobReader error: The version of imported blob doesn't match graph_transformer";
+    }
+
+    _inputInfo.totalSize = _blobHeader.inputs_size;
+    _outputInfo.totalSize = _blobHeader.outputs_size;
+
+    auto inputInfoSecOffset = _blobHeader.input_info_section_offset;
+    for (uint32_t i = 0; i < _blobHeader.inputs_count; i++) {
+        auto ioIdx = readFromBlob<uint32_t>(blob, inputInfoSecOffset);
+        IE_ASSERT(ioIdx == i);
+
+        auto ioBufferOffset = readFromBlob<int32_t>(blob, inputInfoSecOffset);
+
+        auto nameLength = readFromBlob<uint32_t>(blob, inputInfoSecOffset);
+        std::string inputName(nameLength, 0);
+        for (auto& c : inputName) {
+            c = readFromBlob<char>(blob, inputInfoSecOffset);
+        }
+
+        // Trim the name at the first null terminator
+        inputName = inputName.c_str();
+
+        auto dataType = static_cast<DataType>(readFromBlob<uint32_t>(blob, inputInfoSecOffset));
+        auto orderCode = readFromBlob<uint32_t>(blob, inputInfoSecOffset);
+
+        auto numDims = readFromBlob<uint32_t>(blob, inputInfoSecOffset);
+
+        auto dimsOrder = DimsOrder::fromCode(orderCode);
+        auto perm = dimsOrder.toPermutation();
+        IE_ASSERT(perm.size() == numDims);
+
+        DimValues vpuDims;
+        for (size_t d = 0; d < perm.size(); ++d) {
+            vpuDims.set(perm[d], readFromBlob<uint32_t>(blob, inputInfoSecOffset));
+        }
+
+        // Skip strides
+        inputInfoSecOffset += perm.size() * sizeof(uint32_t);
+
+        auto iePrecision = vpuDataTypeToIE(dataType);
+        auto ieLayout    = vpuDimsOrderToIE(dimsOrder);
+        auto ieDims = vpuDimsToIE(vpuDims);
+
+        ie::TensorDesc ieDesc(iePrecision, ieDims, ieLayout);
+        ie::Data inputData(inputName, ieDesc);
+
+        ie::InputInfo input;
+        input.setInputData(std::make_shared<ie::Data>(inputData));
+
+        _networkInputs[input.name()]    = std::make_shared<ie::InputInfo>(input);
+        _inputInfo.offset[input.name()] = ioBufferOffset;
+    }
+
+    auto outputInfoSecOffset = _blobHeader.output_info_section_offset;
+    for (size_t i = 0; i < _blobHeader.outputs_count; i++) {
+        auto ioIdx = readFromBlob<uint32_t>(blob, outputInfoSecOffset);
+        IE_ASSERT(ioIdx == i);
+
+        auto ioBufferOffset = readFromBlob<int32_t>(blob, outputInfoSecOffset);
+
+        auto nameLength = readFromBlob<uint32_t>(blob, outputInfoSecOffset);
+        std::string outputName(nameLength, 0);
+        for (auto& c : outputName) {
+            c = readFromBlob<char>(blob, outputInfoSecOffset);
+        }
+
+        // Trim the name at the first null terminator
+        outputName = outputName.c_str();
+
+        auto dataType = static_cast<DataType>(readFromBlob<uint32_t>(blob, outputInfoSecOffset));
+        auto orderCode = readFromBlob<uint32_t>(blob, outputInfoSecOffset);
+
+        auto numDims = readFromBlob<uint32_t>(blob, outputInfoSecOffset);
+
+        auto dimsOrder = DimsOrder::fromCode(orderCode);
+        auto perm = dimsOrder.toPermutation();
+        IE_ASSERT(perm.size() == numDims);
+
+        DimValues vpuDims;
+        for (size_t d = 0; d < perm.size(); ++d) {
+            vpuDims.set(perm[d], readFromBlob<uint32_t>(blob, outputInfoSecOffset));
+        }
+
+        // Skip strides
+        outputInfoSecOffset += perm.size() * sizeof(uint32_t);
+
+        auto iePrecision = vpuDataTypeToIE(dataType);
+        auto ieLayout    = vpuDimsOrderToIE(dimsOrder);
+        auto ieDims = vpuDimsToIE(vpuDims);
+
+        ie::TensorDesc ieDesc(iePrecision, ieDims, ieLayout);
+        ie::Data outputData(outputName, ieDesc);
+
+        _networkOutputs[outputData.name]    = std::make_shared<ie::Data>(outputData);
+        _outputInfo.offset[outputData.name] = ioBufferOffset;
+    }
+}
+
+}  // namespace vpu
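
Each I/O record parsed above has a fixed on-disk shape: index, buffer offset, length-prefixed name, data type, order code, dim count, dims, strides. A compact standalone reader over the same layout (hedged: field widths are inferred from the parsing code above, not from a written spec):

    #include <cstdint>
    #include <cstring>
    #include <stdexcept>
    #include <string>
    #include <vector>

    // Same pattern as readFromBlob above: bounds-checked typed read that advances the offset.
    template <typename T>
    static T readAt(const std::vector<char>& blob, uint32_t& off) {
        if (off + sizeof(T) > blob.size()) throw std::runtime_error("blob overrun");
        T v;
        std::memcpy(&v, blob.data() + off, sizeof(T));  // memcpy avoids unaligned access
        off += sizeof(T);
        return v;
    }

    // One I/O info record, as laid out by serializeIOInfo (inferred from the parser above).
    struct IoRecord {
        uint32_t ioIdx;
        int32_t bufferOffset;
        std::string name;
        uint32_t dataType, orderCode;
        std::vector<uint32_t> dims, strides;
    };

    static IoRecord readIoRecord(const std::vector<char>& blob, uint32_t& off) {
        IoRecord r;
        r.ioIdx = readAt<uint32_t>(blob, off);
        r.bufferOffset = readAt<int32_t>(blob, off);
        auto nameLen = readAt<uint32_t>(blob, off);
        r.name.resize(nameLen);
        for (auto& c : r.name) c = readAt<char>(blob, off);
        r.name.resize(std::strlen(r.name.c_str()));  // trim at the first NUL, as BlobReader::parse does
        r.dataType = readAt<uint32_t>(blob, off);
        r.orderCode = readAt<uint32_t>(blob, off);
        auto numDims = readAt<uint32_t>(blob, off);
        for (uint32_t d = 0; d < numDims; ++d) r.dims.push_back(readAt<uint32_t>(blob, off));
        for (uint32_t d = 0; d < numDims; ++d) r.strides.push_back(readAt<uint32_t>(blob, off));
        return r;
    }
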
diff --git a/inference-engine/src/vpu/graph_transformer/src/custom_layer.cpp b/inference-engine/src/vpu/graph_transformer/src/custom_layer.cpp
new file mode 100644 (file)
index 0000000..12d294a
--- /dev/null
@@ -0,0 +1,594 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/custom_layer.hpp>
+
+#include <climits>
+
+#include <map>
+#include <fstream>
+#include <streambuf>
+#include <tuple>
+#include <utility>
+#include <memory>
+#include <string>
+#include <vector>
+
+#ifdef __linux__
+# include <dlfcn.h>
+#endif
+
+#ifdef _WIN32
+# include <windows.h>
+#endif
+
+#include <description_buffer.hpp>
+#include <xml_parse_utils.h>
+#include <details/caseless.hpp>
+
+#include <vpu/utils/simple_math.hpp>
+#include <vpu/utils/extra.hpp>
+
+namespace vpu {
+
+namespace {
+
+VPU_PACKED(Elf32Ehdr {
+    uint8_t  offs1[28];
+    uint32_t ePhoff;        // Program header offset
+    uint32_t eShoff;        // Section header offset
+    uint8_t  offs2[12];
+    uint16_t eShnum;        // Number of sections
+    uint16_t offs3;
+};)
+
+VPU_PACKED(Elf32Section {
+    uint32_t shName;
+    uint32_t shType;
+    uint32_t shFlags;
+    uint32_t shAddr;
+    uint32_t shOffset;
+    uint32_t shSize;
+    uint32_t shLink;
+    uint32_t shInfo;
+    uint32_t shAddralign;
+    uint32_t shEntsize;
+};)
+
+VPU_PACKED(Elf32Phdr {
+    uint32_t pType;       // Identifies program segment type
+    uint32_t pOffset;     // Segment file offset
+    uint32_t pVaddr;      // Segment virtual address
+    uint32_t pPaddr;      // Segment physical address
+    uint32_t pFilesz;     // Segment size in file
+    uint32_t pMemsz;      // Segment size in memory
+    uint32_t pFlags;      // Flags position from ELF standard spec
+    uint32_t pAlign;      // Segment alignment, file & memory
+};)
+
+VPU_PACKED(Elf32Sym {
+    uint32_t stName;
+    uint32_t stValue;
+    uint32_t stSize;
+    uint8_t  stInfo;
+    uint8_t  stOther;
+    uint16_t stShndx;
+};)
+
+VPU_PACKED(KernelHdr {
+    uint32_t address;       // Kernel address
+    uint32_t flags;         // Should be 0 for now
+    uint32_t sectionSize;   // Section size, offset to the next kernel
+    uint32_t argOffset;     // offset to arguments
+    uint32_t stackSize;     // Size of the stack required for kernel
+};)
+
+VPU_PACKED(KernelArgHdr {
+    uint32_t stringOffset;
+    uint32_t addressSpace;
+    uint32_t typeOffset;
+    uint32_t size;
+    uint32_t laneSize;
+};)
+
+std::pair<const Elf32Section*, const Elf32Section*> findSymbolTable(
+        const char* ELFData) {
+    const uint32_t SYMTAB = 2;  // Link editing symbol table
+    const uint32_t STRTAB = 3;  // A string table
+
+    IE_ASSERT(ELFData != nullptr);
+
+    auto ehdr = reinterpret_cast<const Elf32Ehdr*>(ELFData);
+    auto shdr = reinterpret_cast<const Elf32Section*>(ELFData + ehdr->eShoff);
+
+    const Elf32Section* strShdr = nullptr;
+    const Elf32Section* symShdr = nullptr;
+    for (size_t i = 0; i < ehdr->eShnum; i++) {
+        if (shdr[i].shType == STRTAB && strShdr == nullptr) {
+            strShdr = &shdr[i];
+        } else if (shdr[i].shType == SYMTAB && symShdr == nullptr) {
+            symShdr = &shdr[i];
+        }
+
+        if (symShdr != nullptr && strShdr != nullptr)
+            break;
+    }
+    IE_ASSERT(symShdr != nullptr && strShdr != nullptr);
+
+    return std::make_pair(strShdr, symShdr);
+}
+
+uint32_t getKernelEntry(const char* ELFData, const std::string& kernelName) {
+    ie::details::CaselessEq<std::string> cmp;
+
+    IE_ASSERT(ELFData != nullptr);
+
+    auto ehdr = reinterpret_cast<const Elf32Ehdr*>(ELFData);
+    auto phdr = reinterpret_cast<const Elf32Phdr*>(ELFData + ehdr->ePhoff);
+
+    const Elf32Section* strShdr = nullptr;
+    const Elf32Section* symShdr = nullptr;
+    std::tie(strShdr, symShdr) = findSymbolTable(ELFData);
+    IE_ASSERT(symShdr != nullptr && strShdr != nullptr);
+
+    auto numSymEntries = symShdr->shSize / symShdr->shEntsize;
+    auto sym = reinterpret_cast<const Elf32Sym*>(ELFData + symShdr->shOffset);
+    auto firstStr = ELFData + strShdr->shOffset;
+
+    for (size_t i = 0; i < numSymEntries; i++) {
+        if (cmp(firstStr + sym[i].stName, kernelName)) {
+            return sym[i].stValue - phdr->pVaddr;
+        }
+    }
+
+    VPU_THROW_EXCEPTION << "Cannot find kernel entry point for custom kernel " << kernelName;
+}
+
+std::vector<std::string> deduceKernelParameters(
+        const char* ELFData,
+        uint32_t kernelAddress) {
+    ie::details::CaselessEq<std::string> cmp;
+
+    IE_ASSERT(ELFData != nullptr);
+
+    auto ehdr = reinterpret_cast<const Elf32Ehdr*>(ELFData);
+    auto phdr = reinterpret_cast<const Elf32Phdr*>(ELFData + ehdr->ePhoff);
+    auto shdr = reinterpret_cast<const Elf32Section*>(ELFData + ehdr->eShoff);
+
+    const Elf32Section* strShdr = nullptr;
+    const Elf32Section* symShdr = nullptr;
+    std::tie(strShdr, symShdr) = findSymbolTable(ELFData);
+    IE_ASSERT(symShdr != nullptr && strShdr != nullptr);
+
+    auto numSymEntries = symShdr->shSize / symShdr->shEntsize;
+    auto sym = reinterpret_cast<const Elf32Sym*>(ELFData + symShdr->shOffset);
+    auto firstStr = ELFData + strShdr->shOffset;
+
+    const char* kernelArgStrings = nullptr;
+    for (size_t i = 0; i < numSymEntries; i++) {
+        if (cmp(firstStr + sym[i].stName, "opencl.kernelArgs.strings")) {
+            kernelArgStrings = ELFData + shdr[sym[i].stShndx].shOffset;
+            break;
+        }
+    }
+    IE_ASSERT(kernelArgStrings != nullptr);
+
+    std::vector<std::string> parameters;
+    for (size_t i = 0; i < numSymEntries; i++) {
+        if (cmp(firstStr + sym[i].stName, "opencl.kernelArgs.info")) {
+            auto ptr = ELFData + shdr[sym[i].stShndx].shOffset;
+            auto numKernels = *reinterpret_cast<const int*>(ptr);
+
+            auto metaOffset = sizeof(int);
+            for (int k = 0; k < numKernels; k++) {
+                auto kHdr = reinterpret_cast<const KernelHdr*>(ptr + metaOffset);
+
+                if (kHdr->address - phdr->pVaddr == kernelAddress) {
+                    auto aHdr = reinterpret_cast<const KernelArgHdr*>(
+                        reinterpret_cast<const char*>(&(kHdr->argOffset)) + sizeof(kHdr->argOffset) + kHdr->argOffset);
+
+                    auto numArgs = reinterpret_cast<const int*>(kHdr + 1)[(kHdr->flags == 1) ? 2 : 0];
+                    for (int n = 0; n < numArgs; n++, aHdr++) {
+                        parameters.push_back(kernelArgStrings + aHdr->stringOffset);
+                    }
+
+                    break;
+                }
+
+                metaOffset += kHdr->sectionSize;
+            }
+        }
+    }
+
+    return parameters;
+}
+
+std::pair<uint32_t, uint32_t> deduceVectorized(
+        const char* ELFData,
+        uint32_t kernelAddress) {
+    ie::details::CaselessEq<std::string> cmp;
+
+    IE_ASSERT(ELFData != nullptr);
+
+    auto ehdr = reinterpret_cast<const Elf32Ehdr*>(ELFData);
+    auto phdr = reinterpret_cast<const Elf32Phdr*>(ELFData + ehdr->ePhoff);
+    auto shdr = reinterpret_cast<const Elf32Section*>(ELFData + ehdr->eShoff);
+
+    const Elf32Section* strShdr = nullptr;
+    const Elf32Section* symShdr = nullptr;
+    std::tie(strShdr, symShdr) = findSymbolTable(ELFData);
+    IE_ASSERT(symShdr != nullptr && strShdr != nullptr);
+
+    auto numSymEntries = symShdr->shSize / symShdr->shEntsize;
+    auto sym = reinterpret_cast<const Elf32Sym*>(ELFData + symShdr->shOffset);
+    auto firstStr = ELFData + strShdr->shOffset;
+
+    const char* kernelArgStrings = nullptr;
+    for (size_t i = 0; i < numSymEntries; i++) {
+        if (cmp(firstStr + sym[i].stName, "opencl.kernelArgs.strings")) {
+            kernelArgStrings = ELFData + shdr[sym[i].stShndx].shOffset;
+            break;
+        }
+    }
+    IE_ASSERT(kernelArgStrings != nullptr);
+
+    for (size_t i = 0; i < numSymEntries; i++) {
+        if (cmp(firstStr + sym[i].stName, "opencl.kernelArgs.info")) {
+            auto ptr = ELFData + shdr[sym[i].stShndx].shOffset;
+            auto numKernels = *reinterpret_cast<const int*>(ptr);
+
+            auto metaOffset = sizeof(int);
+            for (int k = 0; k < numKernels; k++) {
+                auto kHdr = reinterpret_cast<const KernelHdr*>(ptr + metaOffset);
+
+                if (kHdr->address - phdr->pVaddr == kernelAddress && kHdr->flags == 1) {
+                    auto vecInfo = reinterpret_cast<const uint32_t*>(kHdr + 1);
+                    return std::make_pair(vecInfo[1], vecInfo[0] - phdr->pVaddr);
+                }
+
+                metaOffset += kHdr->sectionSize;
+            }
+        }
+    }
+
+    return std::make_pair(0, 0);
+}
+
+}  // namespace
+
+ie::details::caseless_map<std::string, CustomLayer::Ptr> CustomLayer::loadFromFile(
+        const std::string& configFile,
+        bool canBeMissed) {
+    ie::details::caseless_map<std::string, CustomLayer::Ptr> out;
+
+    pugi::xml_document xmlDoc;
+    pugi::xml_parse_result res = xmlDoc.load_file(configFile.c_str());
+
+    if (res.status != pugi::status_ok) {
+        if (canBeMissed) {
+            // The config file is allowed to be missing (the global config, for example).
+            return out;
+        } else {
+            VPU_THROW_EXCEPTION
+                << "Failed to load custom layer configuration file " << configFile
+                << " : " << res.description()
+                << " at offset " << res.offset;
+        }
+    }
+
+#ifdef _WIN32
+    char path[MAX_PATH];
+    auto abs_path_ptr = _fullpath(path, configFile.c_str(), MAX_PATH);
+#elif __linux__
+    char path[PATH_MAX];
+    auto abs_path_ptr = realpath(configFile.c_str(), path);
+#endif
+
+    if (abs_path_ptr == nullptr) {
+        VPU_THROW_EXCEPTION
+            << "Failed to load custom layer configuration file " << configFile
+            << " : can't get canonicalized absolute path";
+    }
+
+    std::string abs_file_name(path);
+
+    // Try extracting directory from config path.
+    auto dir_split_pos = abs_file_name.find_last_of("/\\");
+    auto colon_pos = abs_file_name.find_first_of(":");
+    auto first_slash_pos = abs_file_name.find_first_of("/");
+
+    // If path is absolute.
+    std::string dir_path;
+    if (dir_split_pos != std::string::npos && (colon_pos != std::string::npos || first_slash_pos == 0)) {
+        dir_path = abs_file_name.substr(0, dir_split_pos);
+    } else {
+        VPU_THROW_EXCEPTION
+            << "Failed to load custom layer configuration file " << configFile
+            << " : path is not valid";
+    }
+
+    for (auto r = xmlDoc.document_element(); r; r = r.next_sibling()) {
+        CustomLayer::Ptr layer(new CustomLayer(dir_path));
+
+        layer->loadSingleLayer(r);
+
+        out[layer->_layerName] = layer;
+    }
+
+    return out;
+}
+
+int CustomLayer::kernelAddress(int idx) const {
+    for (const auto& x : _kernelAddress) {
+        if ((x.first % idx) == 0) {
+            return x.second;
+        }
+    }
+
+    auto it = _kernelAddress.find(1);
+    IE_ASSERT(it != _kernelAddress.end());
+
+    return it->second;
+}
+
+void CustomLayer::loadSingleLayer(const pugi::xml_node& node) {
+    ie::details::CaselessEq<std::string> cmp;
+
+    std::string nodeName(node.name());
+    if (!cmp(nodeName, "CustomLayer")) {
+        VPU_THROW_EXCEPTION << "Wrong custom layer XML : Node is not CustomLayer, but " << nodeName;
+    }
+
+    auto nodeType = XMLParseUtils::GetStrAttr(node, "type", "");
+    if (!cmp(nodeType, "MVCL")) {
+        VPU_THROW_EXCEPTION << "Wrong custom layer XML : Type is not MVCL, but " << nodeType;
+    }
+
+    auto version = XMLParseUtils::GetIntAttr(node, "version", -1);
+    IE_ASSERT(version == 1);
+
+    _layerName = XMLParseUtils::GetStrAttr(node, "name", "");
+    if (_layerName.empty()) {
+        VPU_THROW_EXCEPTION << "Missing Layer name in CustomLayer";
+    }
+
+    processKernelNode(node.child("Kernel"));
+
+    processParametersNode(node.child("Parameters"));
+
+    processWorkSizesNode(node.child("WorkSizes"));
+}
+
+void CustomLayer::processKernelNode(const pugi::xml_node& node) {
+    ie::details::CaselessEq<std::string> cmp;
+
+    std::string nodeName(node.name());
+    if (!cmp(nodeName, "Kernel")) {
+        VPU_THROW_EXCEPTION << "Wrong node, expected Kernel found " << nodeName;
+    }
+
+    if (!_kernelBinary.empty()) {
+        VPU_THROW_EXCEPTION << "Multiple definition of Kernel";
+    }
+
+    _kernelEntry = XMLParseUtils::GetStrAttr(node, "entry", "");
+    if (_kernelEntry.empty()) {
+        VPU_THROW_EXCEPTION << "No Kernel entry in custom layer";
+    }
+
+    _kernelBinary.clear();
+    for (auto sourceNode = node.child("Source"); !sourceNode.empty(); sourceNode = sourceNode.next_sibling("Source")) {
+        auto fileName = _configDir + "/" + XMLParseUtils::GetStrAttr(sourceNode, "filename", "");
+
+        std::ifstream inputFile(fileName, std::ios::binary);
+        if (!inputFile.is_open()) {
+            VPU_THROW_EXCEPTION << "Couldn't open kernel file " << fileName;
+        }
+
+        std::ostringstream contentStream;
+        contentStream << inputFile.rdbuf();
+        _kernelBinary.append(contentStream.str());
+    }
+
+    _kernelAddress[1] = getKernelEntry(&_kernelBinary[0], _kernelEntry);
+    _parameters = deduceKernelParameters(&_kernelBinary[0], _kernelAddress[1]);
+
+    auto vecInfo = deduceVectorized(&_kernelBinary[0], _kernelAddress[1]);
+    if (vecInfo.first != 0) {
+        _kernelAddress[vecInfo.first] = vecInfo.second;
+    }
+}
+
+void CustomLayer::processParametersNode(const pugi::xml_node& node) {
+    ie::details::CaselessEq<std::string> cmp;
+
+    std::string nodeName(node.name());
+    if (!cmp(nodeName, "Parameters")) {
+        VPU_THROW_EXCEPTION << "Wrong node, expected Parameters found " << nodeName;
+    }
+
+    for (auto tensorNode = node.child("Tensor"); !tensorNode.empty(); tensorNode = tensorNode.next_sibling("Tensor")) {
+        KernelParam kp;
+
+        auto typeStr = XMLParseUtils::GetStrAttr(tensorNode, "type");
+        if (cmp(typeStr, "input")) {
+            kp.type = CustomParamType::Input;
+        } else if (cmp(typeStr, "output")) {
+            kp.type = CustomParamType::Output;
+        } else {
+            VPU_THROW_EXCEPTION << "Tensor node has an invalid type " << typeStr;
+        }
+
+        kp.format = formatFromString(XMLParseUtils::GetStrAttr(tensorNode, "format", "BFYX"));
+        if (kp.format == CustomDataFormat::None) {
+            VPU_THROW_EXCEPTION << "Tensor node has an invalid format " << kp.format;
+        }
+
+        kp.argName = XMLParseUtils::GetStrAttr(tensorNode, "arg-name");
+        if (kp.argName.empty()) {
+            VPU_THROW_EXCEPTION << "Tensor node has no arg-name";
+        }
+
+        kp.portIndex = XMLParseUtils::GetIntAttr(tensorNode, "port-index", -1);
+        if (kp.portIndex == -1) {
+            VPU_THROW_EXCEPTION << "Tensor node has no port-index";
+        }
+
+        kp.irSource.clear();
+
+        _kernelParams.emplace_back(std::move(kp));
+    }
+
+    for (auto dataNode = node.child("Data"); !dataNode.empty(); dataNode = dataNode.next_sibling("Data")) {
+        KernelParam kp;
+
+        kp.type = CustomParamType::Data;
+        kp.format = CustomDataFormat::Any;
+
+        kp.argName = XMLParseUtils::GetStrAttr(dataNode, "arg-name");
+        if (kp.argName.empty()) {
+            VPU_THROW_EXCEPTION << "Data node has no arg-name";
+        }
+
+        kp.portIndex = -1;
+
+        kp.irSource = XMLParseUtils::GetStrAttr(dataNode, "source", "");
+        if (kp.irSource.empty()) {
+            VPU_THROW_EXCEPTION << "Data node has no source";
+        }
+
+        _kernelParams.emplace_back(std::move(kp));
+    }
+
+    for (auto scalarNode = node.child("Scalar"); !scalarNode.empty(); scalarNode = scalarNode.next_sibling("Scalar")) {
+        KernelParam kp;
+
+        std::string typeStr = XMLParseUtils::GetStrAttr(scalarNode, "type");
+        if (cmp(typeStr, "int")) {
+            kp.type = CustomParamType::Int;
+        } else if (cmp(typeStr, "float")) {
+            kp.type = CustomParamType::Float;
+        } else {
+            VPU_THROW_EXCEPTION << "Scalar node has an invalid type " << typeStr;
+        }
+
+        kp.format = CustomDataFormat::Any;
+
+        kp.argName = XMLParseUtils::GetStrAttr(scalarNode, "arg-name");
+        if (kp.argName.empty()) {
+            VPU_THROW_EXCEPTION << "Scalar node has no arg-name";
+        }
+
+        kp.portIndex = -1;
+
+        kp.irSource = XMLParseUtils::GetStrAttr(scalarNode, "source", "");
+        if (kp.irSource.empty()) {
+            VPU_THROW_EXCEPTION << "Scalar node has no source";
+        }
+
+        _kernelParams.emplace_back(std::move(kp));
+    }
+}
+
+void CustomLayer::processWorkSizesNode(const pugi::xml_node & node) {
+    ie::details::CaselessEq<std::string> cmp;
+
+    std::string nodeName(node.name());
+    if (!cmp(node.name(), "WorkSizes")) {
+        VPU_THROW_EXCEPTION << "Wrong node, expected WorkSizes found " << nodeName;
+    }
+
+    _wgDimInputIdx = -1;
+    std::string dim_src_string(node.attribute("dim").as_string(""));
+    if (!dim_src_string.empty() && !cmp(dim_src_string, "output")) {
+        // Try to locate index separator.
+        auto pos = dim_src_string.find_first_of(',');
+        auto flag = dim_src_string.substr(0, pos);
+        if (!cmp(flag, "input")) {
+            VPU_THROW_EXCEPTION << "Invalid WG dim source " << flag;
+        }
+
+        int input_idx = 0;
+        if (pos != std::string::npos) {
+            // User explicitly set input index in config.
+            auto input_idx_string = dim_src_string.substr(pos + 1, std::string::npos);
+            input_idx = std::stoi(input_idx_string);
+        }
+        if (input_idx < 0) {
+            VPU_THROW_EXCEPTION << "Invalid input tensor index " << input_idx;
+        }
+
+        _wgDimInputIdx = input_idx;
+    }
+
+    std::string gws(node.attribute("global").as_string(""));
+    while (!gws.empty()) {
+        auto pos = gws.find_first_of(',');
+        auto rule = gws.substr(0, pos);
+        if (!isLegalSizeRule(rule)) {
+            VPU_THROW_EXCEPTION << "Invalid WorkSize " << rule;
+        }
+
+        _globalSizeRules.emplace_back(std::move(rule));
+
+        if (pos == std::string::npos) {
+            gws.clear();
+        } else {
+            gws = gws.substr(pos + 1, std::string::npos);
+        }
+    }
+
+    std::string lws(node.attribute("local").as_string(""));
+    while (!lws.empty()) {
+        auto pos = lws.find_first_of(',');
+        auto rule = lws.substr(0, pos);
+        if (!isLegalSizeRule(rule)) {
+            VPU_THROW_EXCEPTION << "Invalid WorkSize " << rule;
+        }
+
+        _localSizeRules.emplace_back(std::move(rule));
+
+        if (pos == std::string::npos) {
+            lws.clear();
+        } else {
+            lws = lws.substr(pos + 1, std::string::npos);
+        }
+    }
+}
+
+bool CustomLayer::isLegalSizeRule(const std::string& rule) {
+    SimpleMathExpression expr;
+
+    expr.setVariables({
+        { 'b', 1 }, { 'B', 1 },
+        { 'f', 1 }, { 'F', 1 },
+        { 'y', 1 }, { 'Y', 1 },
+        { 'x', 1 }, { 'X', 1 },
+    });
+
+    try {
+        expr.parse(rule);
+    } catch (...) {
+        return false;
+    }
+
+    return true;
+}
+
+CustomDataFormat CustomLayer::formatFromString(const std::string & str) {
+    static const ie::details::caseless_map<std::string, CustomDataFormat> FormatNameToType = {
+        { "BFYX" , CustomDataFormat::BFYX },
+        { "BYXF" , CustomDataFormat::BYXF },
+        { "ANY"  , CustomDataFormat::Any },
+    };
+
+    auto it = FormatNameToType.find(str);
+    if (it != FormatNameToType.end()) {
+        return it->second;
+    }
+
+    return CustomDataFormat::None;
+}
+
+}  // namespace vpu
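
To make the accepted schema concrete, here is a minimal single-layer config that this parser would take, embedded in a short standalone C++ snippet. The kernel file name, entry point, parameter bindings, and the `negative_slope` source are hypothetical; element and attribute names follow the parsing code above:

    #include <fstream>
    #include <string>

    int main() {
        // Hypothetical config matching loadSingleLayer's expectations: a
        // CustomLayer root (type MVCL, version 1) with Kernel, Parameters
        // and WorkSizes children.
        const std::string config = R"(
    <CustomLayer name="ReLU" type="MVCL" version="1">
        <Kernel entry="reluKernel">
            <Source filename="relu.elf"/>
        </Kernel>
        <Parameters>
            <Tensor type="input"  port-index="0" format="BFYX" arg-name="src"/>
            <Tensor type="output" port-index="0" format="BFYX" arg-name="dst"/>
            <Scalar type="float"  source="negative_slope" arg-name="slope"/>
        </Parameters>
        <WorkSizes dim="input,0" global="X,Y,F" local="1,1,1"/>
    </CustomLayer>
    )";

        std::ofstream("custom_relu.xml") << config;

        // vpu::CustomLayer::loadFromFile("custom_relu.xml") would accept this,
        // provided relu.elf exists next to the config so that the kernel entry
        // point and argument names can be read from its ELF symbol table.
        return 0;
    }
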
diff --git a/inference-engine/src/vpu/graph_transformer/src/frontend/detect_network_batch.cpp b/inference-engine/src/vpu/graph_transformer/src/frontend/detect_network_batch.cpp
new file mode 100644 (file)
index 0000000..c374aff
--- /dev/null
@@ -0,0 +1,140 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+#include <string>
+#include <set>
+
+#include <details/caseless.hpp>
+#include <cpp/ie_cnn_network.h>
+#include <graph_tools.hpp>
+
+#include <vpu/compile_env.hpp>
+
+namespace vpu {
+
+ie::CNNNetwork FrontEnd::detectNetworkBatch(
+        const ie::ICNNNetwork& origNetwork,
+        const Model::Ptr& model) {
+    VPU_PROFILE(detectNetworkBatch);
+
+    const auto& env = CompileEnv::get();
+
+    ie::details::CaselessEq<std::string> cmp;
+
+    auto batchSize = origNetwork.getBatchSize();
+
+    if (batchSize == 1 || !env.config.detectBatch) {
+        // Keep original network.
+        return ie::CNNNetwork(const_cast<ie::ICNNNetwork*>(&origNetwork));
+    }
+
+    model->setBatchSize(batchSize);
+
+    //
+    // Create a copy of the original network to reshape it.
+    //
+
+    ie::CNNNetwork reshapedNetwork(ie::CNNNetCopy(origNetwork));
+
+    auto inputsInfo = reshapedNetwork.getInputsInfo();
+    auto outputsInfo = reshapedNetwork.getOutputsInfo();
+
+    //
+    // Collect input shapes and remove batch from them.
+    //
+
+    ie::ICNNNetwork::InputShapes inputShapes;
+
+    for (const auto& p : inputsInfo) {
+        auto info = p.second;
+        IE_ASSERT(info != nullptr);
+
+        auto ieData = info->getInputData();
+        IE_ASSERT(ieData != nullptr);
+
+        inputShapes[ieData->name] = ieData->getTensorDesc().getDims();
+        switch (ieData->getLayout()) {
+            case ie::Layout::NCHW:
+            case ie::Layout::NHWC:
+            case ie::Layout::NC:
+                inputShapes[ieData->name][0] = 1;
+                break;
+            case ie::Layout::CN:
+                inputShapes[ieData->name][1] = 1;
+                break;
+            default:
+                VPU_THROW_EXCEPTION << "Unexpected input layout : " << ieData->getLayout();
+        }
+    }
+
+    //
+    // Special case for DetectionOutput.
+    //
+
+    for (const auto& layer : reshapedNetwork) {
+        if (!cmp(layer->type, "DetectionOutput"))
+            continue;
+
+        if (layer->outData.empty()) {
+            VPU_THROW_EXCEPTION << "Unsupported layer configuration for " << layer->name;
+        }
+
+        // 1. DetectionOutput must be the last layer in the network
+        if (!layer->outData.front()->getInputTo().empty()) {
+            VPU_THROW_EXCEPTION << "Unsupported configuration : layer "<< layer->name << " is not a network output";
+        }
+
+        // 2. Multiple network outputs are not supported either
+        if (outputsInfo.size() != 1) {
+            VPU_THROW_EXCEPTION << "Unsupported configuration : layer "<< layer->name << " must be the only output of the network";
+        }
+
+        model->attrs().set<bool>("withDetectionOutput", true);
+    }
+
+    //
+    // Gather output shapes before reshaping.
+    //
+
+    ie::ICNNNetwork::InputShapes outputShapes;
+
+    for (const auto& pair : outputsInfo) {
+        auto ieData = pair.second;
+        IE_ASSERT(ieData != nullptr);
+
+        outputShapes[pair.first] = ieData->getDims();
+    }
+
+    //
+    // Reshape the network.
+    //
+
+    reshapedNetwork.reshape(inputShapes);
+
+    //
+    // Check which outputs don't change their shape.
+    //
+
+    outputsInfo = reshapedNetwork.getOutputsInfo();
+
+    for (const auto& pair : outputsInfo) {
+        auto ieData = pair.second;
+        IE_ASSERT(ieData != nullptr);
+
+        auto origShape = outputShapes[pair.first];
+        auto newShape = ieData->getDims();
+
+        if (origShape == newShape) {
+            _unbatchedOutputs.insert(ieData);
+        }
+    }
+
+    return reshapedNetwork;
+}
+
+}  // namespace vpu
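
The batch-detection flow above boils down to: copy the network, force batch 1 on every input shape, reshape, and remember the outputs whose shape did not change. A condensed sketch of just the reshape step, with layout handling simplified (the real code treats CN inputs, whose batch sits at index 1, separately):

    #include <cpp/ie_cnn_network.h>

    namespace ie = InferenceEngine;

    // Sketch: force batch == 1 on all inputs and let IE shape inference
    // propagate the change through the network.
    void reshapeToUnitBatch(ie::CNNNetwork& network) {
        ie::ICNNNetwork::InputShapes shapes;
        for (const auto& p : network.getInputsInfo()) {
            auto dims = p.second->getInputData()->getTensorDesc().getDims();
            if (!dims.empty()) {
                dims[0] = 1;  // assumes batch-first layouts such as NCHW / NHWC / NC
            }
            shapes[p.first] = dims;
        }
        network.reshape(shapes);
    }
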
diff --git a/inference-engine/src/vpu/graph_transformer/src/frontend/frontend.cpp b/inference-engine/src/vpu/graph_transformer/src/frontend/frontend.cpp
new file mode 100644 (file)
index 0000000..14ff159
--- /dev/null
@@ -0,0 +1,392 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <atomic>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <set>
+
+#include <vpu/compile_env.hpp>
+
+namespace vpu {
+
+namespace {
+
+typedef void (FrontEnd::*parser_t)(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs);
+
+ie::details::caseless_map<std::string, parser_t> g_parsers = {
+    {"Convolution",        &FrontEnd::parseConvolution},
+    {"Pooling",            &FrontEnd::parsePooling},
+    {"ReLU",               &FrontEnd::parseReLU},
+    {"Clamp",              &FrontEnd::parseClamp},
+    {"FullyConnected",     &FrontEnd::parseFullyConnected},
+    {"SoftMax",            &FrontEnd::parseSoftMax},
+    {"GRN",                &FrontEnd::parseGRN},
+    {"MVN",                &FrontEnd::parseMVN},
+    {"Norm",               &FrontEnd::parseNorm},
+    {"Concat",             &FrontEnd::parseConcat},
+    {"Eltwise",            &FrontEnd::parseEltwise},
+    {"Split",              &FrontEnd::parseSplit},
+    {"Sigmoid",            &FrontEnd::parseSigmoid},
+    {"TanH",               &FrontEnd::parseTanH},
+    {"PReLU",              &FrontEnd::parsePReLU},
+    {"Bias",               &FrontEnd::parseBias},
+    // Caffe Slice is transformed to Split by IE
+    {"Slice",              &FrontEnd::parseSplit},
+    {"BatchNormalization", &FrontEnd::parseBatchNorm},
+    {"ScaleShift",         &FrontEnd::parseScale},
+    {"Deconvolution",      &FrontEnd::parseDeconvolution},
+    {"Power",              &FrontEnd::parsePower},
+    {"Copy",               &FrontEnd::parseCopy},
+    {"Reshape",            &FrontEnd::parseReshape},
+    {"ELU",                &FrontEnd::parseELU},
+    // Flatten is represented as Reshape in VPU model
+    {"Flatten",            &FrontEnd::parseReshape},
+    {"Crop",               &FrontEnd::parseCrop},
+    {"Tile",               &FrontEnd::parseTile},
+    {"Normalize",          &FrontEnd::parseNormalize},
+    {"PriorBox",           &FrontEnd::parsePriorBox},
+    {"PriorBoxClustered",  &FrontEnd::parsePriorBoxClustered},
+    {"Permute",            &FrontEnd::parsePermute},
+    {"DetectionOutput",    &FrontEnd::parseDetectionOutput},
+    {"RegionYolo",         &FrontEnd::parseRegionYolo},
+    {"ReorgYolo",          &FrontEnd::parseReorgYolo},
+    {"CTCGreedyDecoder",   &FrontEnd::parseCTCDecoder},
+    {"Proposal",           &FrontEnd::parseProposal},
+    {"ROIPooling",         &FrontEnd::parseROIPooling},
+    {"PSROIPooling",       &FrontEnd::parsePSROIPooling},
+    {"Interp",             &FrontEnd::parseInterp},
+    {"Custom",             &FrontEnd::parseCustom},
+    {"MTCNN",              &FrontEnd::parseMTCNN},
+    {"LSTMCell",           &FrontEnd::parseLSTMCell},
+    {"Pad",                &FrontEnd::parsePad},
+    {"Resample",           &FrontEnd::parseResample},
+    {"ArgMax",             &FrontEnd::parseArgMax},
+    {"LSTMSequence",       &FrontEnd::parseRNN},
+};
+
+std::atomic<int> g_counter(0);
+
+}  // namespace
+
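+// PriorBox / PriorBoxClustered outputs depend only on layer parameters and
+// input shapes, so they can be computed at compile time. An Input data object
+// whose only consumers (possibly behind a convert stage) are PriorBox layers
+// is therefore unused at runtime and is removed here, together with the
+// convert stage if present.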
+void FrontEnd::eliminatePriorBoxData(const Model::Ptr& model) {
+    VPU_PROFILE(eliminatePriorBoxData);
+
+    auto isConvertStage = [](StageType stage) {
+        return stage == StageType::Convert_u8f16  ||
+               stage == StageType::Convert_f32f16 ||
+               stage == StageType::Convert_f16f32;
+    };
+
+    auto isPriorBox = [](const std::string& type) {
+        return ie::details::CaselessEq<std::string>()(type, "PriorBox") ||
+               ie::details::CaselessEq<std::string>()(type, "PriorBoxClustered");
+    };
+
+    for (const auto& data : model->datas()) {
+        if (data->usage() == DataUsage::Input) {
+            auto consumers_num = data->numConsumers();
+            bool unused = (0 == consumers_num);
+
+            // If the data has a single consumer, it could still be just a data conversion stage
+            if (consumers_num == 1) {
+                auto stage = data->singleConsumer();
+                if (isConvertStage(stage->type())) {
+                    IE_ASSERT(stage->numOutputs() == 1);
+
+                    auto output = stage->output(0);
+                    if (output->numConsumers() == 0) {
+                        unused = true;
+                    }
+                }
+            }
+
+            if (unused) {
+                auto origData = data->origData();
+                IE_ASSERT(origData != nullptr);
+                IE_ASSERT(!origData->getInputTo().empty());
+
+                bool priorBox = true;
+                for (const auto& consumer_it : origData->getInputTo()) {
+                    auto consumer = consumer_it.second;
+                    priorBox &= isPriorBox(consumer->type);
+                }
+
+                if (priorBox) {
+                    if (1 == consumers_num) {
+                        model->removeStage(data->singleConsumer());
+                    }
+                    model->removeUnusedData(data);
+                }
+            }
+        }
+    }
+}
+
+Model::Ptr FrontEnd::buildInitialModel(const ie::ICNNNetwork& network) {
+    const auto& env = CompileEnv::get();
+
+    auto model = runCommonPasses(network, LayersOrder::DFS);
+
+    for (const auto& layer : _ieNetworkParser.orderedLayers) {
+        IE_ASSERT(layer != nullptr);
+
+        env.log->debug("try to parse layer %s", layer->name);
+
+        DataVector inputs, outputs;
+        getInputAndOutputData(model, layer, inputs, outputs);
+
+        if (env.netConfig.skipAllLayers() ||
+            env.netConfig.skipLayerType(layer->type)) {
+            _stageBuilder->addNoneStage(model, layer->name, layer, inputs, outputs);
+            continue;
+        }
+
+        auto it =
+                (_customLayers.count(layer->type) > 0) ?
+                    g_parsers.find("Custom") :
+                    g_parsers.find(layer->type);
+        if (it == g_parsers.end()) {
+            if (env.config.ignoreUnknownLayers) {
+                _stageBuilder->addNoneStage(model, layer->name, layer, inputs, outputs);
+                continue;
+            } else {
+                VPU_THROW_EXCEPTION
+                        << "Cannot convert layer \""
+                        << layer->name
+                        << "\" due to unsupported layer type \""
+                        << layer->type
+                        << "\"";
+            }
+        }
+
+        auto parser = it->second;
+        IE_ASSERT(parser != nullptr);
+
+        (this->*parser)(model, layer, inputs, outputs);
+    }
+
+    eliminatePriorBoxData(model);
+
+    model->cleanUpDatas();
+
+    return model;
+}
+
+std::set<std::string> FrontEnd::checkSupportedLayers(const ie::ICNNNetwork& network) {
+    const auto& env = CompileEnv::get();
+
+    auto model = runCommonPasses(network, LayersOrder::BFS);
+
+    std::set<std::string> layerNames;
+
+    for (const auto& layer : _ieNetworkParser.orderedLayers) {
+        IE_ASSERT(layer != nullptr);
+
+        env.log->debug("Try to parse layer %s", layer->name);
+
+        DataVector inputs, outputs;
+        getInputAndOutputData(model, layer, inputs, outputs);
+
+        auto it =
+                (_customLayers.count(layer->type) > 0) ?
+                    g_parsers.find("Custom") :
+                    g_parsers.find(layer->type);
+        if (it != g_parsers.end()) {
+            try {
+                // If the parser ran without throwing an exception, the layer is supported.
+                auto parser = it->second;
+                IE_ASSERT(parser != nullptr);
+
+                (this->*parser)(model, layer, inputs, outputs);
+
+                layerNames.insert(layer->name);
+            } catch (const ie::details::InferenceEngineException&) {
+                // Nothing to do
+                continue;
+            }
+        }
+    }
+
+    return layerNames;
+}
+
+Model::Ptr FrontEnd::runCommonPasses(
+        const ie::ICNNNetwork& network,
+        LayersOrder order) {
+    const auto& env = CompileEnv::get();
+
+    //
+    // Load Custom layers
+    //
+
+    if (!env.config.customLayers.empty()) {
+        if (env.platform == Platform::MYRIAD_2) {
+            VPU_THROW_EXCEPTION
+                    << "Custom layers are not supported for Myriad 2 platforms";
+        }
+
+        _customLayers = CustomLayer::loadFromFile(env.config.customLayers);
+    }
+
+    //
+    // Clear Front-end state
+    //
+
+    _ieNetworkParser.clear();
+    _unbatchedOutputs.clear();
+    _ieToVpuMap.clear();
+
+    //
+    // Create new VPU model
+    //
+
+    auto model = std::make_shared<Model>(network.getName());
+
+    if (!env.config.ignoreIRStatistic) {
+        InferenceEngine::ICNNNetworkStats* stats = nullptr;
+        if (InferenceEngine::StatusCode::OK == network.getStats(&stats, nullptr) && !stats->isEmpty()) {
+            model->setNodesStats(stats->getNodesStats());
+        }
+    }
+
+    model->attrs().set<int>("index", g_counter.fetch_add(1));
+    model->attrs().set<Resources>("resources", env.resources);
+
+    //
+    // Detect network batch
+    //
+
+    auto reshapedNetwork = detectNetworkBatch(network, model);
+
+    //
+    // Remove constant layers from network
+    //
+
+    RemoveConstLayers(reshapedNetwork);
+
+    //
+    // Get IE layers in topological order
+    //
+
+    if (order == LayersOrder::DFS) {
+        _ieNetworkParser.parseNetworkDFS(reshapedNetwork);
+    } else {
+        _ieNetworkParser.parseNetworkBFS(reshapedNetwork);
+    }
+
+    //
+    // Parse network inputs/outputs/const datas
+    //
+
+    parseInputAndOutputData(model);
+
+    //
+    // Add data type convert stages
+    //
+
+    addDataTypeConvertStages(model);
+
+    //
+    // Add pre-process stages
+    //
+
+    addPreProcessStages(model);
+
+    return model;
+}
+
+Data FrontEnd::getVpuData(const ie::DataPtr& ieData) {
+    IE_ASSERT(ieData != nullptr);
+
+    auto it = _ieToVpuMap.find(ieData);
+    if (it == _ieToVpuMap.end()) {
+        return nullptr;
+    }
+
+    return it->second;
+}
+
+void FrontEnd::bindData(const Data& data, const ie::DataPtr& ieData) {
+    _ieToVpuMap[ieData] = data;
+    data->setOrigData(ieData);
+}
+
+void FrontEnd::getInputAndOutputData(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        DataVector& inputs,
+        DataVector& outputs) {
+    IE_ASSERT(layer != nullptr);
+
+    inputs.resize(layer->insData.size());
+    for (size_t i = 0; i < layer->insData.size(); ++i) {
+        auto layerInput = layer->insData[i].lock();
+        IE_ASSERT(layerInput != nullptr);
+
+        inputs[i] = getVpuData(layerInput);
+        IE_ASSERT(inputs[i] != nullptr);
+        IE_ASSERT(inputs[i]->desc().type() == DataType::FP16);
+    }
+
+    outputs.resize(layer->outData.size());
+    for (size_t i = 0; i < layer->outData.size(); ++i) {
+        auto layerOutput = layer->outData[i];
+        IE_ASSERT(layerOutput != nullptr);
+
+        if (auto data = getVpuData(layerOutput)) {
+            IE_ASSERT(data->desc().type() == DataType::FP16);
+            outputs[i] = data;
+        } else {
+            DataDesc dataDesc(layerOutput->getTensorDesc());
+            dataDesc.setType(DataType::FP16);
+
+            outputs[i] = model->addNewData(
+                layerOutput->name,
+                dataDesc);
+
+            bindData(outputs[i], layerOutput);
+        }
+    }
+}
+
+std::tuple<Data, Data> FrontEnd::getWeightsAndBiases(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer) {
+    auto baseLayer = std::dynamic_pointer_cast<ie::WeightableLayer>(layer);
+    IE_ASSERT(baseLayer != nullptr);
+
+    auto origWeights = baseLayer->_weights;
+    if (origWeights == nullptr) {
+        THROW_IE_EXCEPTION << "weights are empty for layer: " << layer->name;
+    }
+
+    auto weights = model->addConstData(
+        layer->name + "@weights",
+        DataDesc({origWeights->size()}),
+        ieBlobContent(origWeights));
+
+    auto origBiases = baseLayer->_biases;
+
+    Data biases;
+    if (origBiases == nullptr) {
+        biases = model->addFakeData();
+    } else {
+        biases = model->addConstData(
+            layer->name + "@biases",
+            DataDesc({origBiases->size()}),
+            ieBlobContent(origBiases));
+    }
+
+    return std::make_tuple(weights, biases);
+}
+
+}  // namespace vpu
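
The parser dispatch above hinges on a table of pointer-to-member functions keyed by layer type. The pattern in isolation, as a self-contained sketch with stand-in types (the real table uses a caseless map and the FrontEnd parser signature):

    #include <iostream>
    #include <map>
    #include <string>

    // Stand-in for FrontEnd: a class whose per-type handlers share one signature.
    class Dispatcher {
    public:
        using Handler = void (Dispatcher::*)(const std::string& layerName);

        void parseConvolution(const std::string& n) { std::cout << "conv: " << n << "\n"; }
        void parsePooling(const std::string& n)     { std::cout << "pool: " << n << "\n"; }

        bool dispatch(const std::string& type, const std::string& name) {
            static const std::map<std::string, Handler> table = {
                {"Convolution", &Dispatcher::parseConvolution},
                {"Pooling",     &Dispatcher::parsePooling},
            };
            auto it = table.find(type);
            if (it == table.end()) {
                return false;  // unknown type; FrontEnd throws or adds a None stage
            }
            (this->*(it->second))(name);  // same call syntax as (this->*parser)(...) above
            return true;
        }
    };

    int main() {
        Dispatcher d;
        d.dispatch("Convolution", "conv1");
        return d.dispatch("Unknown", "x") ? 1 : 0;
    }
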
diff --git a/inference-engine/src/vpu/graph_transformer/src/frontend/in_out_convert.cpp b/inference-engine/src/vpu/graph_transformer/src/frontend/in_out_convert.cpp
new file mode 100644 (file)
index 0000000..88e656e
--- /dev/null
@@ -0,0 +1,350 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <memory>
+#include <string>
+#include <set>
+
+#include <vpu/compile_env.hpp>
+
+namespace vpu {
+
+namespace {
+
+class ConvertStage final : public StageNode {
+protected:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ConvertStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto inputScale = inputScales.at(input);
+
+        DataMap<float> out;
+
+        if (_type == StageType::Convert_f16f32) {
+            IE_ASSERT(output->usage() == DataUsage::Output);
+            IE_ASSERT(step == ScalePropagationStep::Propagate);
+
+            out[input] = 1.0f;
+            out[output] = 1.0f;
+        } else {
+            IE_ASSERT(input->usage() == DataUsage::Input);
+
+            out[output] = inputScale;
+
+            if (step == ScalePropagationStep::ScaleInput) {
+                attrs().get<float>("scale") *= inputScale;
+                attrs().get<float>("bias") *= inputScale;
+            }
+        }
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        if (_type == StageType::Convert_f16f32) {
+            IE_ASSERT(output->usage() == DataUsage::Output);
+
+            auto outDimsOrder = output->desc().dimsOrder();
+
+            // HCW is not supported
+            IE_ASSERT(outDimsOrder.dimInd(Dim::C) != 1);
+
+            out[input] = outDimsOrder;
+        } else {
+            IE_ASSERT(input->usage() == DataUsage::Input);
+
+            auto inDimsOrder = input->desc().dimsOrder();
+
+            // HCW is not supported
+            IE_ASSERT(inDimsOrder.dimInd(Dim::C) != 1);
+
+            out[output] = inDimsOrder;
+        }
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto inDimsOrder = input->desc().dimsOrder();
+
+        StridesRequirement reqs;
+
+        if (input->desc().dim(Dim::N, 1) > 1) {
+            // To merge batch into previous dimension.
+            reqs.add(inDimsOrder.dimInd(Dim::N), DimStride::Compact);
+        }
+
+        DataMap<StridesRequirement> out;
+
+        if (_type == StageType::Convert_f16f32) {
+            IE_ASSERT(output->usage() == DataUsage::Output);
+
+            out[input] = reqs;
+            out[output] = StridesRequirement::compact();
+        } else {
+            IE_ASSERT(input->usage() == DataUsage::Input);
+
+            out[input] = StridesRequirement::compact();
+            out[output] = reqs;
+        }
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        // Convert will support batch by merging it with previous dimension.
+        return DataMap<BatchSupport>();
+    }
+
+    StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override {
+        // TODO: more SHAVEs lead to a hang on the public MTCNN network with U8 input
+        return StageSHAVEsRequirements::TwoOrOne;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto scale = attrs().get<float>("scale");
+        auto bias = attrs().get<float>("bias");
+        auto convertFromDetOutput = attrs().getOrDefault<bool>("convertFromDetOutput", false);
+        auto haveBatch = attrs().getOrDefault<bool>("haveBatch", true);
+
+        serializer.append(static_cast<float>(scale));
+        serializer.append(static_cast<float>(bias));
+        serializer.append(static_cast<int32_t>(convertFromDetOutput));
+        serializer.append(static_cast<int32_t>(haveBatch));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        if (input->desc().dimsOrder() == DimsOrder::NC) {
+            input->serializeOldBuffer(
+                handle_from_this(),
+                serializer,
+                DimsOrder::HWC,
+                {
+                    {Dim::W, {Dim::N}},
+                    {Dim::C, {Dim::C}}
+                });
+
+            output->serializeOldBuffer(
+                handle_from_this(),
+                serializer,
+                DimsOrder::HWC,
+                {
+                    {Dim::W, {Dim::N}},
+                    {Dim::C, {Dim::C}}
+                });
+        } else if (input->desc().dim(Dim::N, 1) > 1) {
+            auto perm = input->desc().dimsOrder().toPermutation();
+            IE_ASSERT(perm.size() == 4);
+
+            auto batchDimInd = input->desc().dimsOrder().dimInd(Dim::N);
+            IE_ASSERT(batchDimInd == perm.size() - 1);
+
+            input->serializeOldBuffer(
+                handle_from_this(),
+                serializer,
+                DimsOrder::HWC,
+                {
+                    {Dim::H, {perm[2], perm[3]}},
+                    {Dim::W, {perm[1]}},
+                    {Dim::C, {perm[0]}}
+                });
+
+            output->serializeOldBuffer(
+                handle_from_this(),
+                serializer,
+                DimsOrder::HWC,
+                {
+                    {Dim::H, {perm[2], perm[3]}},
+                    {Dim::W, {perm[1]}},
+                    {Dim::C, {perm[0]}}
+                });
+        } else {
+            input->serializeOldBuffer(handle_from_this(), serializer);
+
+            output->serializeOldBuffer(handle_from_this(), serializer);
+        }
+    }
+};
+
+}  // namespace
+
+Stage StageBuilder::createConvertStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const Data& input,
+        const Data& output,
+        StageType type,
+        float scale,
+        float bias) {
+    auto stage = model->addNewStage<ConvertStage>(
+        name,
+        type,
+        nullptr,
+        {input},
+        {output});
+
+    stage->attrs().set("scale", scale);
+    stage->attrs().set("bias", bias);
+
+    return stage;
+}
+
+void FrontEnd::addDataTypeConvertStages(const Model::Ptr& model) {
+    VPU_PROFILE(addDataTypeConvertStages);
+
+    const auto& env = CompileEnv::get();
+
+    if (env.config.inputScale != 1.f) {
+        env.log->warning("[VPU] GraphTransformer : INPUT_NORM option is deprecated");
+    }
+
+    if (env.config.inputBias != 0.f) {
+        env.log->warning("[VPU] GraphTransformer : INPUT_BIAS option is deprecated");
+    }
+
+    for (const auto& input : model->datas()) {
+        if (input->usage() != DataUsage::Input)
+            continue;
+
+        if (input->desc().type() != DataType::FP16) {
+            env.log->debug("convert input %s to FP16", input->name());
+
+            auto fp16Desc = input->desc();
+            fp16Desc.setType(DataType::FP16);
+
+            auto inputFP16 = model->duplicateData(
+                input,
+                "@FP16",
+                fp16Desc);
+
+            input->attrs().set<Data>("fp16_copy", inputFP16);
+
+            bindData(inputFP16, input->origData());
+
+            auto stageType = StageType::None;
+            switch (input->desc().type()) {
+            case DataType::U8:
+                stageType = StageType::Convert_u8f16;
+                break;
+            case DataType::FP32:
+                stageType = StageType::Convert_f32f16;
+                break;
+            default:
+                VPU_THROW_EXCEPTION << "Unsupported input data type : " << input->desc().type();
+            }
+
+            _stageBuilder->createConvertStage(
+                model,
+                inputFP16->name(),
+                input,
+                inputFP16,
+                stageType,
+                env.config.inputScale,
+                env.config.inputBias);
+        } else if (env.config.inputScale != 1.0f || env.config.inputBias != 0.0f) {
+            std::ostringstream postfixOstr;
+            if (env.config.inputScale != 1.0f) {
+                postfixOstr << "@SCALE=" << std::to_string(env.config.inputScale);
+            }
+            if (env.config.inputBias != 0.0f) {
+                postfixOstr << "@BIAS=" << std::to_string(env.config.inputBias);
+            }
+
+            auto postfix = postfixOstr.str();
+
+            auto scaledInput = model->duplicateData(
+                input,
+                postfix);
+
+            bindData(scaledInput, input->origData());
+
+            _stageBuilder->addPowerStage(
+                model,
+                scaledInput->name(),
+                nullptr,
+                env.config.inputScale,
+                1.0f,
+                env.config.inputBias,
+                input,
+                scaledInput);
+        }
+    }
+
+    for (const auto& output : model->datas()) {
+        if (output->usage() != DataUsage::Output)
+            continue;
+
+        if (output->desc().type() != DataType::FP16) {
+            env.log->debug("convert output %s from FP16", output->name());
+
+            IE_ASSERT(output->desc().type() == DataType::FP32);
+
+            auto fp16Desc = output->desc();
+            fp16Desc.setType(DataType::FP16);
+
+            auto outputFP16 = model->duplicateData(
+                output,
+                "@FP16",
+                fp16Desc);
+
+            output->attrs().set<Data>("fp16_copy", outputFP16);
+
+            bindData(outputFP16, output->origData());
+
+            auto stage = _stageBuilder->createConvertStage(
+                model,
+                outputFP16->name(),
+                outputFP16,
+                output,
+                StageType::Convert_f16f32);
+
+            auto withDetectionOutput = model->attrs().getOrDefault<bool>("withDetectionOutput", false);
+            stage->attrs().set<bool>("convertFromDetOutput", withDetectionOutput);
+
+            auto haveBatch = _unbatchedOutputs.count(output->origData()) == 0;
+            stage->attrs().set<bool>("haveBatch", haveBatch);
+        }
+    }
+}
+
+}  // namespace vpu
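
Editor's note: the input-side policy above boils down to a small precision-to-stage mapping. A minimal, self-contained sketch of that mapping (the enums are stand-ins for the vpu types, not a public API):

    #include <stdexcept>

    enum class DataType { U8, FP16, FP32 };
    enum class StageType { Convert_u8f16, Convert_f32f16, Convert_f16f32 };

    // Mirrors the switch in FrontEnd::addDataTypeConvertStages: U8 and FP32
    // inputs get a conversion stage to FP16, anything else is rejected.
    StageType inputConvertStage(DataType t) {
        switch (t) {
        case DataType::U8:   return StageType::Convert_u8f16;
        case DataType::FP32: return StageType::Convert_f32f16;
        default: throw std::runtime_error("Unsupported input data type");
        }
    }

On the output side there is no choice to make: non-FP16 outputs are asserted to be FP32 and always get a Convert_f16f32 stage.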
diff --git a/inference-engine/src/vpu/graph_transformer/src/frontend/parse_data.cpp b/inference-engine/src/vpu/graph_transformer/src/frontend/parse_data.cpp
new file mode 100644 (file)
index 0000000..ae9b86f
--- /dev/null
@@ -0,0 +1,144 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <memory>
+#include <algorithm>
+#include <set>
+
+#include <vpu/compile_env.hpp>
+
+namespace vpu {
+
+void FrontEnd::parseInputAndOutputData(const Model::Ptr& model) {
+    VPU_PROFILE(parseInputAndOutputData);
+
+    const auto& env = CompileEnv::get();
+
+    //
+    // Parse network inputs
+    //
+
+    for (const auto& inputInfo : _ieNetworkParser.networkInputs) {
+        auto netInput = inputInfo.second;
+        IE_ASSERT(netInput != nullptr);
+
+        auto ieData = netInput->getInputData();
+        IE_ASSERT(ieData != nullptr);
+
+        DataDesc vpuDesc(ieData->getTensorDesc());
+        if (vpuDesc.numDims() >= 3) {
+            if (env.config.hwOptimization || env.config.forceLayout == ComputeLayout::NCHW) {
+                vpuDesc.moveDim(Dim::C, 2);
+            } else {
+                vpuDesc.moveDim(Dim::C, 0);
+            }
+        }
+
+        auto vpuData = model->addInputData(ieData->getName(), vpuDesc);
+        bindData(vpuData, ieData);
+    }
+
+    model->attrs().set<int>("numInputs", _ieNetworkParser.networkInputs.size());
+
+    //
+    // Parse network outputs
+    //
+
+    for (const auto& outputInfo : _ieNetworkParser.networkOutputs) {
+        auto ieData = outputInfo.second;
+        IE_ASSERT(ieData != nullptr);
+
+        DataDesc vpuDesc(ieData->getTensorDesc());
+        if (vpuDesc.numDims() >= 3) {
+            if (env.config.hwOptimization || env.config.forceLayout == ComputeLayout::NCHW) {
+                vpuDesc.moveDim(Dim::C, 2);
+            } else {
+                vpuDesc.moveDim(Dim::C, 0);
+            }
+        }
+
+        auto vpuData = model->addOutputData(ieData->getName(), vpuDesc);
+        bindData(vpuData, ieData);
+
+        if (_unbatchedOutputs.count(ieData) > 0) {
+            vpuData->attrs().set<bool>("unbatched", true);
+        }
+    }
+
+    model->attrs().set<int>("numOutputs", _ieNetworkParser.networkOutputs.size());
+
+    //
+    // Parse constant data
+    //
+
+    for (const auto& constInfo : _ieNetworkParser.constDatas) {
+        auto ieData = constInfo.first;
+        IE_ASSERT(ieData != nullptr);
+
+        auto ieBlob = constInfo.second;
+        IE_ASSERT(ieBlob != nullptr);
+
+        DataDesc vpuDesc(ieData->getTensorDesc());
+        vpuDesc.setType(DataType::FP16);
+
+        auto vpuData = model->addConstData(
+            ieData->getName(),
+            vpuDesc,
+            ieBlobContent(ieBlob));
+
+        // User might ask to return the output from Const layer.
+        if (auto vpuOutData = getVpuData(ieData)) {
+            IE_ASSERT(vpuOutData->usage() == DataUsage::Output);
+
+            _stageBuilder->addCopyStage(
+                model,
+                formatString("%s@return-const", vpuData->name()),
+                nullptr,
+                vpuData,
+                vpuOutData);
+        }
+
+        bindData(vpuData, ieData);
+    }
+
+    //
+    // Add Copy stages after network outputs if they are in the middle of the graph
+    //
+
+    for (const auto& outputInfo : _ieNetworkParser.networkOutputs) {
+        auto ieData = outputInfo.second;
+        IE_ASSERT(ieData != nullptr);
+
+        auto vpuData = getVpuData(ieData);
+        IE_ASSERT(vpuData != nullptr);
+
+        // It might be Const.
+        if (vpuData->usage() != DataUsage::Output)
+            continue;
+
+        // Convert stage will be added.
+        if (vpuData->desc().type() != DataType::FP16)
+            continue;
+
+        if (!ieData->getInputTo().empty()) {
+            auto vpuTempData = model->duplicateData(
+                vpuData,
+                "@intermediate",
+                vpuData->desc());
+
+            _stageBuilder->addCopyStage(
+                model,
+                formatString("%s@copy-to-output", vpuData->name()),
+                nullptr,
+                vpuTempData,
+                vpuData);
+
+            bindData(vpuTempData, ieData);
+        }
+    }
+}
+
+}  // namespace vpu
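
Editor's note: the layout rule applied identically to inputs and outputs above can be stated in one line. A sketch, assuming (as the DataDesc::moveDim calls suggest) that index 2 is a planar, NCHW-like placement of the channel dim and index 0 an interleaved, NHWC-like one:

    // Target index for Dim::C in parseInputAndOutputData: planar for
    // HW-optimized or forced-NCHW compilation, interleaved otherwise;
    // tensors with fewer than 3 dims are left untouched (-1 here).
    int channelDimIndex(int numDims, bool hwOptimization, bool forceNCHW) {
        if (numDims < 3) return -1;
        return (hwOptimization || forceNCHW) ? 2 : 0;
    }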
diff --git a/inference-engine/src/vpu/graph_transformer/src/frontend/parse_network.cpp b/inference-engine/src/vpu/graph_transformer/src/frontend/parse_network.cpp
new file mode 100644 (file)
index 0000000..d36e21b
--- /dev/null
@@ -0,0 +1,413 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <set>
+#include <list>
+#include <unordered_set>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <atomic>
+#include <algorithm>
+#include <utility>
+
+#include <cpp/ie_cnn_network.h>
+#include <graph_tools.hpp>
+#include <details/caseless.hpp>
+
+#include <vpu/compile_env.hpp>
+
+namespace vpu {
+
+namespace {
+
+void runDFS(
+        const ie::CNNLayerPtr& layer,
+        std::vector<ie::CNNLayerPtr>& out,
+        std::unordered_map<ie::CNNLayerPtr, bool>& visitedMap) {
+    visitedMap[layer] = false;
+
+    std::vector<ie::CNNLayerPtr> nextLayers;
+    for (const auto& output : layer->outData) {
+        IE_ASSERT(output != nullptr);
+
+        for (const auto& consumer : output->getInputTo()) {
+            auto nextLayer = consumer.second;
+            IE_ASSERT(nextLayer != nullptr);
+
+            nextLayers.emplace_back(nextLayer);
+        }
+    }
+
+    std::sort(nextLayers.begin(), nextLayers.end(),
+              [](const ie::CNNLayerPtr& left, const ie::CNNLayerPtr& right) {
+        ie::details::CaselessLess<std::string> cmp;
+        return cmp(left->name, right->name);
+    });
+
+    for (const auto& nextLayer : nextLayers) {
+        auto it = visitedMap.find(nextLayer);
+
+        if (it != visitedMap.end()) {
+            auto visited = it->second;
+
+            if (!visited) {
+                VPU_THROW_EXCEPTION << "The graph has a loop";
+            }
+
+            continue;
+        }
+
+        runDFS(nextLayer, out, visitedMap);
+    }
+
+    visitedMap[layer] = true;
+
+    out.emplace_back(layer);
+}
+
+}  // namespace
+
+void IeNetworkParser::clear() {
+    networkInputs.clear();
+    networkOutputs.clear();
+    constDatas.clear();
+    orderedLayers.clear();
+}
+
+void IeNetworkParser::checkNetwork(const ie::CNNNetwork& network) {
+    const auto& env = CompileEnv::get();
+
+    auto networkPrecision = network.getPrecision();
+    if (networkPrecision != ie::Precision::FP16) {
+        if (networkPrecision != ie::Precision::FP32 || !env.config.allowFP32Models) {
+            VPU_THROW_EXCEPTION << "Unsupported network precision : " << networkPrecision;
+        }
+    }
+
+    networkInputs = network.getInputsInfo();
+    networkOutputs = network.getOutputsInfo();
+
+    if (networkInputs.empty()) {
+        VPU_THROW_EXCEPTION << "No inputs detected in network " << network.getName();
+    }
+    if (networkOutputs.empty()) {
+        VPU_THROW_EXCEPTION << "No outputs detected in network " << network.getName();
+    }
+
+    for (const auto& netInput : networkInputs) {
+        auto inputInfo = netInput.second;
+        IE_ASSERT(inputInfo != nullptr);
+
+        auto inputPrecision = inputInfo->getInputPrecision();
+
+        if (inputPrecision != ie::Precision::U8 &&
+            inputPrecision != ie::Precision::FP16 &&
+            inputPrecision != ie::Precision::FP32) {
+            THROW_IE_EXCEPTION << "[PARAMETER_MISMATCH] Unsupported input precision: " << inputPrecision.name() << "!";
+        }
+    }
+
+    for (const auto& netOutput : networkOutputs) {
+        auto outputData = netOutput.second;
+        IE_ASSERT(outputData != nullptr);
+
+        auto outputPrecision = outputData->getPrecision();
+
+        if (outputPrecision != ie::Precision::FP16 &&
+            outputPrecision != ie::Precision::FP32) {
+            THROW_IE_EXCEPTION << "[PARAMETER_MISMATCH] Unsupported output precision: " << outputPrecision.name() << "!";
+        }
+    }
+}
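+
+// Editor's note, summarizing the checks above:
+//   network precision: FP16 (FP32 only when allowFP32Models is set)
+//   input precisions:  U8, FP16, FP32
+//   output precisions: FP16, FP32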
+
+void IeNetworkParser::parseNetworkDFS(const ie::CNNNetwork& network) {
+    VPU_PROFILE(parseNetworkDFS);
+
+    const auto& env = CompileEnv::get();
+
+    ie::details::CaselessEq<std::string> cmp;
+
+    env.log->debug("parse network %s", network.getName());
+
+    //
+    // Check network inputs and outputs.
+    //
+
+    checkNetwork(network);
+
+    //
+    // Collect all network input data.
+    //
+
+    std::unordered_set<ie::DataPtr> allInputDatas;
+
+    for (const auto& netInput : networkInputs) {
+        auto inputInfo = netInput.second;
+        IE_ASSERT(inputInfo != nullptr);
+
+        auto inputData = inputInfo->getInputData();
+        IE_ASSERT(inputData != nullptr);
+
+        allInputDatas.insert(inputData);
+    }
+
+    //
+    // Collect all network const data.
+    //
+
+    for (const auto& layer : ie::CNNNetGetAllInputLayers(network)) {
+        IE_ASSERT(layer != nullptr);
+
+        if (!cmp(layer->type, "Const"))
+            continue;
+
+        if (layer->outData.size() != 1) {
+            VPU_THROW_EXCEPTION
+                    << "Const layer " << layer->name
+                    << " has unsupported number of outputs "
+                    << layer->outData.size();
+        }
+
+        if (layer->blobs.size() != 1) {
+            VPU_THROW_EXCEPTION
+                    << "Const layer " << layer->name
+                    << " has unsupported number of blobs "
+                    << layer->blobs.size();
+        }
+
+        auto constData = layer->outData[0];
+        IE_ASSERT(constData != nullptr);
+
+        auto constBlob = layer->blobs.begin()->second;
+        IE_ASSERT(constBlob != nullptr);
+
+        constDatas[constData] = constBlob;
+
+        allInputDatas.insert(constData);
+    }
+
+    //
+    // Collect initial layers.
+    //
+
+    std::unordered_set<ie::CNNLayerPtr> visitedInitialLayers;
+    std::vector<ie::CNNLayerPtr> initialLayers;
+
+    for (const auto& inputData : allInputDatas) {
+        for (const auto& consumer : inputData->getInputTo()) {
+            auto initialLayer = consumer.second;
+            IE_ASSERT(initialLayer != nullptr);
+
+            if (visitedInitialLayers.count(initialLayer) > 0)
+                continue;
+
+            bool allInputsAvailable = true;
+            for (const auto& in : initialLayer->insData) {
+                auto input = in.lock();
+                IE_ASSERT(input != nullptr);
+
+                if (allInputDatas.count(input) == 0) {
+                    allInputsAvailable = false;
+                    break;
+                }
+            }
+
+            if (allInputsAvailable) {
+                visitedInitialLayers.insert(initialLayer);
+                initialLayers.emplace_back(std::move(initialLayer));
+            }
+        }
+    }
+
+    IE_ASSERT(!initialLayers.empty());
+
+    //
+    // Run recursive DFS algorithm.
+    //
+
+    std::sort(initialLayers.begin(), initialLayers.end(),
+              [](const ie::CNNLayerPtr& left, const ie::CNNLayerPtr& right) {
+        ie::details::CaselessLess<std::string> cmp;
+        return cmp(left->name, right->name);
+    });
+
+    std::unordered_map<ie::CNNLayerPtr, bool> visitedMap;
+    for (const auto& layer : initialLayers) {
+        runDFS(layer, orderedLayers, visitedMap);
+    }
+
+    //
+    // Reverse the result.
+    //
+
+    std::reverse(orderedLayers.begin(), orderedLayers.end());
+}
+
+void IeNetworkParser::parseNetworkBFS(const ie::CNNNetwork& network) {
+    VPU_PROFILE(parseNetworkBFS);
+
+    const auto& env = CompileEnv::get();
+
+    ie::details::CaselessEq<std::string> cmp;
+
+    env.log->debug("parse network %s", network.getName());
+
+    //
+    // Check network inputs and outputs.
+    //
+
+    checkNetwork(network);
+
+    //
+    // Collect input datas.
+    //
+
+    std::unordered_set<ie::DataPtr> availableData;
+
+    for (const auto& netInput : networkInputs) {
+        auto inputInfo = netInput.second;
+        IE_ASSERT(inputInfo != nullptr);
+
+        auto inputData = inputInfo->getInputData();
+        IE_ASSERT(inputData != nullptr);
+
+        availableData.insert(inputData);
+    }
+
+    //
+    // Collect all network const data.
+    //
+
+    for (const auto& layer : ie::CNNNetGetAllInputLayers(network)) {
+        IE_ASSERT(layer != nullptr);
+
+        if (!cmp(layer->type, "Const"))
+            continue;
+
+        if (layer->outData.size() != 1) {
+            VPU_THROW_EXCEPTION
+                    << "Const layer " << layer->name
+                    << " has unsupported number of outputs "
+                    << layer->outData.size();
+        }
+
+        if (layer->blobs.size() != 1) {
+            VPU_THROW_EXCEPTION
+                    << "Const layer " << layer->name
+                    << " has unsupported number of blobs "
+                    << layer->blobs.size();
+        }
+
+        auto constData = layer->outData[0];
+        IE_ASSERT(constData != nullptr);
+
+        auto constBlob = layer->blobs.begin()->second;
+        IE_ASSERT(constBlob != nullptr);
+
+        constDatas[constData] = constBlob;
+
+        availableData.insert(constData);
+    }
+
+    //
+    // Collect initial layers.
+    //
+
+    std::unordered_set<ie::CNNLayerPtr> visitedInitialLayers;
+    std::list<ie::CNNLayerPtr> layersToHandle;
+
+    for (const auto& inputData : availableData) {
+        for (const auto& consumer : inputData->getInputTo()) {
+            auto initialLayer = consumer.second;
+            IE_ASSERT(initialLayer != nullptr);
+
+            if (visitedInitialLayers.count(initialLayer) > 0)
+                continue;
+
+            bool allInputsAvailable = true;
+            for (const auto& in : initialLayer->insData) {
+                auto input = in.lock();
+                IE_ASSERT(input != nullptr);
+
+                if (availableData.count(input) == 0) {
+                    allInputsAvailable = false;
+                    break;
+                }
+            }
+
+            if (allInputsAvailable) {
+                visitedInitialLayers.insert(initialLayer);
+                layersToHandle.emplace_back(std::move(initialLayer));
+            }
+        }
+    }
+
+    IE_ASSERT(!layersToHandle.empty());
+
+    //
+    // Traversing the topology (BFS).
+    //
+
+    std::unordered_set<ie::CNNLayerPtr> parsedLayers;
+
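+    // loopTracker counts layers that were re-queued in a row because some
+    // input was not yet available; it is reset whenever new data becomes
+    // available. If every queued layer has been deferred once with no
+    // progress, the remaining inputs can never be computed and the loop
+    // throws instead of spinning forever.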
+    size_t loopTracker = 0;
+
+    while (!layersToHandle.empty()) {
+        auto layer = layersToHandle.front();
+
+        if (layersToHandle.size() == loopTracker) {
+            VPU_THROW_EXCEPTION
+                    << "Inputs for layer " << layer->name
+                    << "(and " << loopTracker - 1 << " more layers) can not be computed";
+        }
+
+        layersToHandle.pop_front();
+
+        bool allInputsAvailable = true;
+        for (const auto& in : layer->insData) {
+            auto inData = in.lock();
+            IE_ASSERT(inData != nullptr);
+
+            if (availableData.find(inData) == availableData.end()) {
+                allInputsAvailable = false;
+                break;
+            }
+        }
+
+        if (!allInputsAvailable) {
+            layersToHandle.emplace_back(std::move(layer));
+            loopTracker++;
+            continue;
+        }
+
+        if (parsedLayers.find(layer) == parsedLayers.end()) {
+            orderedLayers.emplace_back(layer);
+            parsedLayers.insert(layer);
+        }
+
+        // Add children to the list to verify.
+        for (const auto& out : layer->outData) {
+            IE_ASSERT(out != nullptr);
+            availableData.insert(out);
+
+            // New data added -> have to reset loop tracking.
+            loopTracker = 0;
+
+            for (const auto& layerInfo : out->getInputTo()) {
+                auto consumer = layerInfo.second;
+                IE_ASSERT(consumer != nullptr);
+
+                auto it = std::find(layersToHandle.begin(), layersToHandle.end(), consumer);
+                if (it == layersToHandle.end()) {
+                    layersToHandle.emplace_back(std::move(consumer));
+                }
+            }
+        }
+    }
+}
+
+}  // namespace vpu
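
Editor's note: parseNetworkDFS is a post-order depth-first topological sort with cycle detection via a tri-state visited map (absent = unvisited, false = in progress, true = done). A minimal standalone sketch of the same pattern, with Node standing in for ie::CNNLayerPtr:

    #include <algorithm>
    #include <stdexcept>
    #include <unordered_map>
    #include <vector>

    struct Node { std::vector<Node*> next; };

    void dfs(Node* n, std::vector<Node*>& out, std::unordered_map<Node*, bool>& visited) {
        visited[n] = false;                    // in progress
        for (auto* child : n->next) {
            auto it = visited.find(child);
            if (it != visited.end()) {
                if (!it->second) throw std::runtime_error("The graph has a loop");
                continue;                      // already finished
            }
            dfs(child, out, visited);
        }
        visited[n] = true;                     // done
        out.push_back(n);                      // post-order
    }

    std::vector<Node*> topologicalOrder(const std::vector<Node*>& initialLayers) {
        std::vector<Node*> out;
        std::unordered_map<Node*, bool> visited;
        for (auto* n : initialLayers) {
            if (visited.count(n) == 0) dfs(n, out, visited);
        }
        std::reverse(out.begin(), out.end());  // consumers after producers
        return out;
    }

parseNetworkBFS reaches the same ordering iteratively, deferring layers whose inputs are not yet available instead of recursing.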
diff --git a/inference-engine/src/vpu/graph_transformer/src/frontend/pre_process.cpp b/inference-engine/src/vpu/graph_transformer/src/frontend/pre_process.cpp
new file mode 100644 (file)
index 0000000..42cde67
--- /dev/null
@@ -0,0 +1,194 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+#include <string>
+
+#include <details/caseless.hpp>
+#include <cpp/ie_cnn_network.h>
+#include <precision_utils.h>
+#include <ie_parallel.hpp>
+
+#include <vpu/sw/utility.hpp>
+#include <vpu/utils/ie_helpers.hpp>
+#include <vpu/compile_env.hpp>
+
+namespace vpu {
+
+namespace {
+
+class MeanImageContent final : public CalculatedDataContent {
+public:
+    explicit MeanImageContent(const ie::PreProcessInfo& info) : _info(info) {}
+
+protected:
+    size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>&) const override {
+        auto countElem = _desc.dim(Dim::W) * _desc.dim(Dim::H) * _desc.dim(Dim::C);
+
+        if (_desc.dimsOrder() == DimsOrder::NHWC || _desc.dimsOrder() == DimsOrder::HWC) {
+            countElem *= 2;
+        }
+
+        return countElem * sizeof(fp16_t);
+    }
+
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>&, void* tempBuf) const override {
+        VPU_PROFILE(MeanImageContent);
+
+        auto numOfChannel = _info.getNumberOfChannels();
+
+        auto imagePixels = _desc.dim(Dim::W) * _desc.dim(Dim::H);
+        auto countElem = _desc.dim(Dim::W) * _desc.dim(Dim::H) * _desc.dim(Dim::C);
+
+        auto dstPtr = static_cast<fp16_t*>(tempBuf);
+        auto dstPtr2 = dstPtr;
+
+        if (_desc.dimsOrder() == DimsOrder::NHWC || _desc.dimsOrder() == DimsOrder::HWC) {
+            dstPtr2 += countElem;
+        }
+
+        ie::parallel_for(numOfChannel, [=](int i) {
+            auto meanDataBlob = _info[i]->meanData;
+
+            ie::PrecisionUtils::f32tof16Arrays(
+                dstPtr2 + i * imagePixels,
+                meanDataBlob->buffer().as<const float*>(),
+                imagePixels,
+                -1.0f);
+        });
+
+        if (_desc.dimsOrder() == DimsOrder::NHWC || _desc.dimsOrder() == DimsOrder::HWC) {
+            kchw_to_hwck(dstPtr2, dstPtr, _desc);
+        }
+    }
+
+private:
+    ie::PreProcessInfo _info;
+};
+
+class MeanValueContent final : public CalculatedDataContent {
+public:
+    explicit MeanValueContent(const ie::PreProcessInfo& info) : _info(info) {}
+
+protected:
+    size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>&) const override {
+        return _info.getNumberOfChannels() * sizeof(fp16_t);
+    }
+
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>&, void* tempBuf) const override {
+        VPU_PROFILE(MeanValueContent);
+
+        IE_ASSERT(_desc.totalDimSize() == _info.getNumberOfChannels());
+
+        auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+        ie::parallel_for(_info.getNumberOfChannels(), [dstPtr, this](int i) {
+            dstPtr[i] = ie::PrecisionUtils::f32tof16(-_info[i]->meanValue);
+        });
+    }
+
+private:
+    ie::PreProcessInfo _info;
+};
+
+}  // namespace
+
+void FrontEnd::addPreProcessStages(const Model::Ptr& model) {
+    VPU_PROFILE(addPreProcessStages);
+
+    const auto& env = CompileEnv::get();
+
+    for (const auto& inputInfo : _ieNetworkParser.networkInputs) {
+        auto netInput = inputInfo.second;
+        IE_ASSERT(netInput != nullptr);
+
+        auto ieData = netInput->getInputData();
+        IE_ASSERT(ieData != nullptr);
+
+        const auto& preProcess = netInput->getPreProcess();
+
+        if (preProcess.getMeanVariant() != ie::NONE) {
+            auto input = getVpuData(ieData);
+            IE_ASSERT(input != nullptr);
+            IE_ASSERT(input->desc().type() == DataType::FP16);
+
+            int numOfChannel = preProcess.getNumberOfChannels();
+
+            env.log->debug("add pre-processing for input %s", input->name());
+
+            if (preProcess.getMeanVariant() == ie::MEAN_IMAGE) {
+                auto meanImage = model->addConstData(
+                    input->name() + "@mean-image",
+                    input->desc(),
+                    std::make_shared<MeanImageContent>(preProcess));
+
+                auto newInput = model->duplicateData(
+                    input,
+                    "@after-mean-image");
+
+                bindData(newInput, ieData);
+
+                _stageBuilder->addSumStage(
+                    model,
+                    meanImage->name(),
+                    nullptr,
+                    input, meanImage,
+                    newInput);
+
+                input = newInput;
+            } else {
+                auto meanValues = model->addConstData(
+                    input->name() + "@mean-values",
+                    DataDesc({numOfChannel}),
+                    std::make_shared<MeanValueContent>(preProcess));
+
+                auto newInput = model->duplicateData(
+                    input,
+                    "@after-mean-values");
+
+                bindData(newInput, ieData);
+
+                _stageBuilder->addBiasStage(
+                    model,
+                    meanValues->name(),
+                    nullptr,
+                    input, meanValues,
+                    newInput);
+
+                input = newInput;
+            }
+
+            if (preProcess[0]->stdScale != 1.0f) {
+                for (int i = 1; i < numOfChannel; i++) {
+                    if (!isFloatEqual(preProcess[i - 1]->stdScale, preProcess[i]->stdScale)) {
+                        VPU_THROW_EXCEPTION << "Different values of stdScale are not supported";
+                    }
+                }
+
+                auto newInput = model->duplicateData(
+                    input,
+                    "@after-std-scale");
+
+                bindData(newInput, ieData);
+
+                _stageBuilder->addPowerStage(
+                    model,
+                    input->name() + "@stdScale=" + std::to_string(preProcess[0]->stdScale),
+                    nullptr,
+                    preProcess[0]->stdScale,
+                    1.0f,
+                    0.0f,
+                    input,
+                    newInput);
+
+                input = newInput;
+            }
+        }
+    }
+}
+
+}  // namespace vpu
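
Editor's note: per channel, the stages added above compose to out = (in - mean) * stdScale. The mean is stored negated (note the -1.0f scale passed to f32tof16Arrays and the -_info[i]->meanValue in MeanValueContent), so subtraction becomes a plain addition; stdScale is then applied through a Power stage with power 1 and bias 0. A scalar sketch of the arithmetic:

    // out = (in - mean) * stdScale, decomposed the way the stages do it.
    float preprocess(float in, float mean, float stdScale) {
        float negMean   = -mean;          // what the const blob actually holds
        float afterMean = in + negMean;   // Sum (mean image) or Bias (mean values) stage
        return stdScale * afterMean;      // Power stage: scale * x^1 + 0
    }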
diff --git a/inference-engine/src/vpu/graph_transformer/src/frontend/remove_const_layers.cpp b/inference-engine/src/vpu/graph_transformer/src/frontend/remove_const_layers.cpp
new file mode 100644 (file)
index 0000000..01f41e0
--- /dev/null
@@ -0,0 +1,21 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+#include <cnn_network_impl.hpp>
+#include <graph_transformer.h>
+
+namespace vpu {
+
+void FrontEnd::RemoveConstLayers(ie::ICNNNetwork &network) {
+    VPU_PROFILE(RemoveConstLayers);
+    auto *implNetwork = dynamic_cast<InferenceEngine::details::CNNNetworkImpl *>(&network);
+    if (implNetwork) {
+        // Valid for CNNNetworkImpl only, since ICNNNetwork exposes no API to modify the network
+        InferenceEngine::ConstTransformer transformator(implNetwork);
+        transformator.fullTrim();
+    }
+}
+
+}  // namespace vpu
\ No newline at end of file
diff --git a/inference-engine/src/vpu/graph_transformer/src/graph_transformer.cpp b/inference-engine/src/vpu/graph_transformer/src/graph_transformer.cpp
new file mode 100644 (file)
index 0000000..c9bfe0a
--- /dev/null
@@ -0,0 +1,229 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/graph_transformer.hpp>
+
+#include <climits>
+#include <cstring>
+
+#include <string>
+#include <memory>
+#include <list>
+#include <vector>
+#include <array>
+#include <unordered_set>
+#include <set>
+#include <unordered_map>
+#include <fstream>
+#include <utility>
+#include <algorithm>
+#include <map>
+#include <streambuf>
+#include <tuple>
+#include <sstream>
+#include <iomanip>
+#include <atomic>
+
+#include <precision_utils.h>
+#include <details/caseless.hpp>
+#include <graph_tools.hpp>
+#include <description_buffer.hpp>
+#include <xml_parse_utils.h>
+
+#include <vpu/parsed_config.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/frontend/stage_builder.hpp>
+#include <vpu/frontend/frontend.hpp>
+#include <vpu/pass_manager.hpp>
+#include <vpu/backend/backend.hpp>
+#include <vpu/allocator.hpp>
+#include <vpu/utils/auto_scope.hpp>
+#include <vpu/utils/dot_io.hpp>
+#include <vpu/utils/file_system.hpp>
+
+namespace vpu {
+
+//
+// CompileEnv
+//
+
+namespace {
+
+thread_local CompileEnv *g_compileEnv = nullptr;
+
+}  // namespace
+
+const CompileEnv& CompileEnv::get() {
+    IE_ASSERT(g_compileEnv != nullptr);
+    IE_ASSERT(g_compileEnv->initialized);
+
+    return *g_compileEnv;
+}
+
+void CompileEnv::init(
+        Platform platform,
+        const CompilationConfig& config,
+        const Logger::Ptr& log) {
+    IE_ASSERT(g_compileEnv == nullptr);
+    g_compileEnv = new CompileEnv();
+
+    g_compileEnv->platform = platform;
+    g_compileEnv->config = config;
+    g_compileEnv->log = log;
+
+    if (g_compileEnv->platform == Platform::MYRIAD_2) {
+        g_compileEnv->config.hwOptimization = false;
+    }
+
+    if (g_compileEnv->config.numSHAVEs > g_compileEnv->config.numCMXSlices) {
+        VPU_THROW_EXCEPTION
+                << "Invalid config value for VPU_NUMBER_OF_SHAVES. "
+                << "It is expected that the number of shaves is less than number of CMX slices";
+    }
+
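+    // Editor's note: when both numSHAVEs and numCMXSlices are left at -1,
+    // per-platform defaults are used: MYRIAD-2 gets 12 SHAVEs and 12 CMX
+    // slices; MYRIAD-X gets 7 SHAVEs and 9 slices when HW optimization is
+    // enabled, or 16 and 16 without it.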
+    if ((g_compileEnv->config.numSHAVEs == -1) && (g_compileEnv->config.numCMXSlices == -1)) {
+        if (g_compileEnv->platform == Platform::MYRIAD_2) {
+            g_compileEnv->resources.numCMXSlices = 12;
+            g_compileEnv->resources.numSHAVEs = 12;
+            g_compileEnv->resources.cmxLimit = 0;
+        } else {
+            if (g_compileEnv->config.hwOptimization) {
+                g_compileEnv->resources.numCMXSlices = 9;
+                g_compileEnv->resources.numSHAVEs = 7;
+                g_compileEnv->resources.cmxLimit = (g_compileEnv->resources.numCMXSlices / 2) * CMX_SLICE_SIZE + CMX_SLICE_SIZE / 2;
+            } else {
+                g_compileEnv->resources.numCMXSlices = 16;
+                g_compileEnv->resources.numSHAVEs = 16;
+                g_compileEnv->resources.cmxLimit = 0;
+            }
+        }
+    } else {
+        if (g_compileEnv->platform == Platform::MYRIAD_2) {
+            if ((g_compileEnv->config.numSHAVEs > 12) || (g_compileEnv->config.numSHAVEs < 1)) {
+                VPU_THROW_EXCEPTION
+                    << "Number of SHAVES should be in the range of 1 .. 12";
+            }
+
+            g_compileEnv->resources.numCMXSlices = g_compileEnv->config.numCMXSlices;
+            g_compileEnv->resources.numSHAVEs = g_compileEnv->config.numSHAVEs;
+            g_compileEnv->resources.cmxLimit = 0;
+        } else {
+            if ((g_compileEnv->config.numSHAVEs > 16) || (g_compileEnv->config.numSHAVEs < 1)) {
+                VPU_THROW_EXCEPTION
+                    << "Number of SHAVES should be in the range of 1 .. 16";
+            }
+
+            g_compileEnv->resources.numCMXSlices = g_compileEnv->config.numCMXSlices;
+            g_compileEnv->resources.numSHAVEs = g_compileEnv->config.numSHAVEs;
+            g_compileEnv->resources.cmxLimit = (g_compileEnv->resources.numCMXSlices / 2) * CMX_SLICE_SIZE + CMX_SLICE_SIZE / 2;
+        }
+    }
+
+    g_compileEnv->netConfig.parse(g_compileEnv->config);
+
+    if (g_compileEnv->netConfig.hasManualDataScale()) {
+        g_compileEnv->config.hwAdaptiveMode = false;
+    }
+
+    g_compileEnv->initialized = true;
+}
+
+void CompileEnv::updateConfig(const CompilationConfig& config) {
+    IE_ASSERT(g_compileEnv != nullptr);
+    IE_ASSERT(g_compileEnv->initialized);
+
+    g_compileEnv->config = config;
+}
+
+void CompileEnv::free() {
+    IE_ASSERT(g_compileEnv != nullptr);
+    IE_ASSERT(g_compileEnv->initialized);
+
+    delete g_compileEnv;
+    g_compileEnv = nullptr;
+}
+
+//
+// compileNetwork
+//
+
+namespace {
+
+CompiledGraph::Ptr compileImpl(const ie::ICNNNetwork& network) {
+    auto stageBuilder = std::make_shared<StageBuilder>();
+    auto frontEnd = std::make_shared<FrontEnd>(stageBuilder);
+    auto backEnd = std::make_shared<BackEnd>();
+    auto passManager = std::make_shared<PassManager>(stageBuilder, backEnd);
+
+    auto middleEnd = passManager->buildMiddleEnd();
+
+    auto model = frontEnd->buildInitialModel(network);
+
+    AutoScope autoDumper([backEnd, model]() {
+        backEnd->dumpModel(model);
+    });
+
+    middleEnd->run(model);
+
+    return backEnd->build(model, frontEnd->allLayers());
+}
+
+}  // namespace
+
+CompiledGraph::Ptr compileNetwork(
+        const ie::ICNNNetwork& network,
+        Platform platform,
+        const CompilationConfig& config,
+        const Logger::Ptr& log) {
+    VPU_PROFILE(compileNetwork);
+
+    CompileEnv::init(platform, config, log);
+    AutoScope autoDeinit([] {
+        CompileEnv::free();
+    });
+
+    return compileImpl(network);
+}
+
+CompiledGraph::Ptr compileSubNetwork(
+        const ie::ICNNNetwork& network,
+        const CompilationConfig& subConfig) {
+    VPU_PROFILE(compileSubNetwork);
+
+    const auto& env = CompileEnv::get();
+
+    auto prevConfig = env.config;
+    AutoScope autoRecover([prevConfig]() {
+        CompileEnv::updateConfig(prevConfig);
+    });
+
+    CompileEnv::updateConfig(subConfig);
+
+    return compileImpl(network);
+}
+
+//
+// getSupportedLayers
+//
+
+std::set<std::string> getSupportedLayers(
+        const ie::ICNNNetwork& network,
+        Platform platform,
+        const CompilationConfig& config,
+        const Logger::Ptr& log) {
+    VPU_PROFILE(getSupportedLayers);
+
+    CompileEnv::init(platform, config, log);
+
+    AutoScope autoDeinit([] {
+        CompileEnv::free();
+    });
+
+    auto stageBuilder = std::make_shared<StageBuilder>();
+    auto frontEnd = std::make_shared<FrontEnd>(stageBuilder);
+
+    return frontEnd->checkSupportedLayers(network);
+}
+
+}  // namespace vpu
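
Editor's note: all three entry points pair CompileEnv setup with guaranteed cleanup (freeing the env, or restoring the previous config in compileSubNetwork) through AutoScope, even when compilation throws. A minimal sketch of such an RAII scope guard; vpu::AutoScope's real definition may differ:

    #include <functional>
    #include <utility>

    class ScopeGuard {
    public:
        explicit ScopeGuard(std::function<void()> fn) : _fn(std::move(fn)) {}
        ~ScopeGuard() { if (_fn) _fn(); }          // runs on every exit path
        ScopeGuard(const ScopeGuard&) = delete;
        ScopeGuard& operator=(const ScopeGuard&) = delete;
    private:
        std::function<void()> _fn;
    };

    // Usage in the spirit of compileNetwork:
    //   CompileEnv::init(platform, config, log);
    //   ScopeGuard autoDeinit([] { CompileEnv::free(); });
    //   return compileImpl(network);              // free() runs even on throw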
diff --git a/inference-engine/src/vpu/graph_transformer/src/hw/mx_stage.cpp b/inference-engine/src/vpu/graph_transformer/src/hw/mx_stage.cpp
new file mode 100644 (file)
index 0000000..cb3f49b
--- /dev/null
@@ -0,0 +1,251 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/hw/mx_stage.hpp>
+
+#include <memory>
+
+#include <vpu/model/edges.hpp>
+#include <vpu/hw/utility.hpp>
+
+namespace vpu {
+
+StagePtr MyriadXHwStage::cloneImpl() const {
+    return std::make_shared<MyriadXHwStage>(*this);
+}
+
+DataMap<float> MyriadXHwStage::propagateScaleFactorsImpl(const DataMap<float>&, ScalePropagationStep) {
+    VPU_THROW_EXCEPTION << "Must never be called";
+}
+
+namespace {
+
+StridesRequirement getHwStridesRequirement(const Stage& stage, const DataDesc& desc) {
+    StridesRequirement out;
+
+    if (desc.numDims() >= 3) {
+        out.add(1, DimStride::Aligned);
+    } else {
+        IE_ASSERT(stage->attrs().get<HwOpType>("hwOpType") == HwOpType::FC);
+        IE_ASSERT(desc.dimsOrder() == DimsOrder::NC);
+
+        out.add(0, DimStride::Aligned);
+    }
+
+    if (desc.dim(Dim::N, 1) > 1) {
+        // To merge batch into previous dimension.
+        out.add(desc.dimsOrder().dimInd(Dim::N), DimStride::Compact);
+    }
+
+    return out;
+}
+
+}  // namespace
+
+DataMap<DimsOrder> MyriadXHwStage::propagateDataOrderImpl() const {
+    IE_ASSERT(_inputEdges.size() >= 4);
+    IE_ASSERT(_outputEdges.size() >= 1);
+
+    if (attrs().get<HwOpType>("hwOpType") != HwOpType::POOL) {
+        auto weights = _inputEdges[1]->input();
+        auto biases = _inputEdges[2]->input();
+        auto scales = _inputEdges[3]->input();
+
+        IE_ASSERT(weights->usage() == DataUsage::Const);
+        IE_ASSERT(biases->usage() == DataUsage::Const || biases->usage() == DataUsage::Fake);
+        IE_ASSERT(scales->usage() == DataUsage::Const || scales->usage() == DataUsage::Fake);
+    }
+
+    auto input = _inputEdges[0]->input();
+    auto output = _outputEdges[0]->output();
+
+    DataMap<DimsOrder> out;
+
+    // TODO: support HCW
+
+    if (input->desc().numDims() >= 3) {
+        out[input] = input->desc().dimsOrder().createMovedDim(Dim::C, 2);
+    } else {
+        IE_ASSERT(input->desc().dimsOrder() == DimsOrder::NC);
+    }
+
+    if (output->desc().numDims() >= 3) {
+        out[output] = output->desc().dimsOrder().createMovedDim(Dim::C, 2);
+    } else {
+        IE_ASSERT(output->desc().dimsOrder() == DimsOrder::NC);
+    }
+
+    return out;
+}
+
+DataMap<StridesRequirement> MyriadXHwStage::getDataStridesRequirementsImpl() const {
+    IE_ASSERT(_inputEdges.size() >= 4);
+    IE_ASSERT(_outputEdges.size() >= 1);
+
+    if (attrs().get<HwOpType>("hwOpType") != HwOpType::POOL) {
+        auto weights = _inputEdges[1]->input();
+        auto biases = _inputEdges[2]->input();
+        auto scales = _inputEdges[3]->input();
+
+        IE_ASSERT(weights->usage() == DataUsage::Const);
+        IE_ASSERT(biases->usage() == DataUsage::Const || biases->usage() == DataUsage::Fake);
+        IE_ASSERT(scales->usage() == DataUsage::Const || scales->usage() == DataUsage::Fake);
+    }
+
+    auto input = _inputEdges[0]->input();
+    auto output = _outputEdges[0]->output();
+
+    DataMap<StridesRequirement> out;
+
+    out[input] = getHwStridesRequirement(handle_from_this(), input->desc());
+    out[output] = getHwStridesRequirement(handle_from_this(), output->desc());
+
+    return out;
+}
+
+void MyriadXHwStage::finalizeDataLayoutImpl() {
+}
+
+DataMap<BatchSupport> MyriadXHwStage::getBatchSupportInfoImpl() const {
+    DataMap<BatchSupport> out;
+
+    if (attrs().get<HwOpType>("hwOpType") != HwOpType::POOL) {
+        IE_ASSERT(_inputEdges.size() >= 4);
+        IE_ASSERT(_outputEdges.size() >= 1);
+
+        auto input = _inputEdges[0]->input();
+        auto weights = _inputEdges[1]->input();
+        auto biases = _inputEdges[2]->input();
+        auto scales = _inputEdges[3]->input();
+        auto output = _outputEdges[0]->output();
+
+        IE_ASSERT(weights->usage() == DataUsage::Const);
+        IE_ASSERT(biases->usage() == DataUsage::Const || biases->usage() == DataUsage::Fake);
+        IE_ASSERT(scales->usage() == DataUsage::Const || scales->usage() == DataUsage::Fake);
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+    }
+
+    return out;
+}
+
+void MyriadXHwStage::finalCheckImpl() const {
+    IE_ASSERT(_inputEdges.size() >= 4);
+    IE_ASSERT(_outputEdges.size() >= 1);
+
+    auto input = _inputEdges[0]->input();
+    auto weights = _inputEdges[1]->input();
+    auto biases = _inputEdges[2]->input();
+    auto scales = _inputEdges[3]->input();
+    auto output = _outputEdges[0]->output();
+
+    IE_ASSERT(input->memoryOffset() % 16 == 0);
+    IE_ASSERT(weights->memoryOffset() % 16 == 0);
+    IE_ASSERT(biases->memoryOffset() % 16 == 0);
+    IE_ASSERT(scales->memoryOffset() % 16 == 0);
+    IE_ASSERT(output->memoryOffset() % 16 == 0);
+}
+
+void MyriadXHwStage::serializeParamsImpl(BlobSerializer& serializer) const {
+    const auto& hwOps = attrs().get<HwOpList>("hwOps");
+    IE_ASSERT(!hwOps.vec.empty());
+
+    serializer.append(checked_cast<uint32_t>(hwOps.vec.size()));
+    for (const auto& hwOpParams : hwOps.vec) {
+        serializer.append(checked_cast<uint32_t>(hwOpParams.opType));
+        if (hwOpParams.opType == HwOpType::POOL) {
+            serializer.append(checked_cast<uint32_t>(hwOpParams.poolType));
+        }
+
+        serializer.append(checked_cast<uint32_t>(hwOpParams.opMode));
+
+        serializer.append(checked_cast<uint32_t>(hwOpParams.withPad));
+        if (hwOpParams.withPad) {
+            serializer.append(checked_cast<uint32_t>(hwOpParams.padMode));
+        }
+
+        serializer.append(checked_cast<int32_t>(hwOpParams.inputInd));
+        serializer.append(checked_cast<int32_t>(hwOpParams.outputInd));
+        serializer.append(checked_cast<int32_t>(hwOpParams.coeffsInd));
+        serializer.append(checked_cast<int32_t>(hwOpParams.biasesInd));
+        serializer.append(checked_cast<int32_t>(hwOpParams.scalesInd));
+
+        if (hwOpParams.opType != HwOpType::FC) {
+            serializer.append(checked_cast<uint32_t>(hwOpParams.outChanOffset));
+            serializer.append(checked_cast<uint32_t>(hwOpParams.outNumChans));
+        } else {
+            serializer.append(checked_cast<uint32_t>(hwOpParams.fcInputOffset));
+            serializer.append(checked_cast<uint32_t>(hwOpParams.fcInputNum));
+            serializer.append(checked_cast<uint32_t>(hwOpParams.fcOutputOffset));
+            serializer.append(checked_cast<uint32_t>(hwOpParams.fcOutputNum));
+            serializer.append(checked_cast<uint32_t>(hwOpParams.fcAccum));
+        }
+
+        if (hwOpParams.opType != HwOpType::FC) {
+            serializer.append(checked_cast<uint32_t>(hwOpParams.kernelWidth));
+            serializer.append(checked_cast<uint32_t>(hwOpParams.kernelHeight));
+            serializer.append(checked_cast<uint32_t>(hwOpParams.kernelStride));
+        }
+
+        if (hwOpParams.opType == HwOpType::CONV_POOL) {
+            serializer.append(checked_cast<uint32_t>(hwOpParams.poolKernelWidth));
+            serializer.append(checked_cast<uint32_t>(hwOpParams.poolKernelHeight));
+        }
+
+        serializer.append(checked_cast<uint32_t>(hwOpParams.withReLU));
+        if (hwOpParams.withReLU) {
+            serializer.append(checked_cast<uint32_t>(hwOpParams.t0));
+            serializer.append(checked_cast<uint32_t>(hwOpParams.a0));
+            serializer.append(checked_cast<uint32_t>(hwOpParams.a1));
+        }
+
+        serializer.append(checked_cast<uint32_t>(hwOpParams.withClamp));
+        if (hwOpParams.withClamp) {
+            serializer.append(checked_cast<float>(hwOpParams.clampMaxVal));
+        }
+
+        serializer.append(checked_cast<uint32_t>(hwOpParams.reuseData));
+        serializer.append(checked_cast<uint32_t>(hwOpParams.reuseCoeff));
+    }
+
+    serializer.append(checked_cast<uint32_t>(_injectedStageEdges.size()));
+    for (const auto& injectedStageEdge : _injectedStageEdges) {
+        injectedStageEdge->child()->serialize(serializer);
+    }
+}
+
+void MyriadXHwStage::serializeDataImpl(BlobSerializer& serializer) const {
+    auto numBuffersPos = serializer.append(static_cast<uint32_t>(0));
+
+    uint32_t numBuffers = 0;
+
+    for (const auto& inEdge : _inputEdges) {
+        if (inEdge->childEdge() != nullptr)
+            continue;
+
+        if (inEdge->input()->usage() == DataUsage::Fake)
+            continue;
+
+        inEdge->input()->serializeNewBuffer(serializer);
+
+        ++numBuffers;
+    }
+
+    for (const auto& outEdge : _outputEdges) {
+        if (outEdge->childEdge() != nullptr)
+            continue;
+
+        if (outEdge->output()->usage() == DataUsage::Fake)
+            continue;
+
+        outEdge->output()->serializeNewBuffer(serializer);
+
+        ++numBuffers;
+    }
+
+    serializer.overWrite(numBuffersPos, checked_cast<uint32_t>(numBuffers));
+}
+
+}  // namespace vpu
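
Editor's note: serializeDataImpl uses a write-placeholder-then-patch pattern: the buffer count is only known after filtering out child and Fake edges, so a zero is appended up front and overwritten at the end. A sketch of the same idea over a plain byte vector (BlobSerializer's interface is inferred only from the calls above):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Append a u32 in native byte order; return the offset it was written at.
    size_t appendU32(std::vector<uint8_t>& blob, uint32_t v) {
        size_t pos = blob.size();
        const auto* p = reinterpret_cast<const uint8_t*>(&v);
        blob.insert(blob.end(), p, p + sizeof(v));
        return pos;
    }

    // Patch a previously appended u32 once the real value is known.
    void overwriteU32(std::vector<uint8_t>& blob, size_t pos, uint32_t v) {
        std::memcpy(blob.data() + pos, &v, sizeof(v));
    }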
diff --git a/inference-engine/src/vpu/graph_transformer/src/hw/tiling.cpp b/inference-engine/src/vpu/graph_transformer/src/hw/tiling.cpp
new file mode 100644 (file)
index 0000000..0852026
--- /dev/null
@@ -0,0 +1,513 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/hw/tiling.hpp>
+
+#include <tuple>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <limits>
+#include <utility>
+
+#include <vpu/hw/utility.hpp>
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+//
+// Tiling scheme
+//
+
+void printTo(std::ostream& os, const HwConvTileInfo& convTiles) {
+    os << "[" << std::endl;
+    os << "mode=" << convTiles.mode << std::endl;
+    os << "numDescr=" << convTiles.numDescr << std::endl;
+    os << "outChansPerDescr=" << convTiles.outChansPerDescr << std::endl;
+    os << "lastOutChans=" << convTiles.lastOutChans << std::endl;
+    os << "extendedInputDimC=" << convTiles.extendedInputDimC << std::endl;
+    os << "extendedOutputDimC=" << convTiles.extendedOutputDimC << std::endl;
+    os << "cost=" << convTiles.cost << std::endl;
+    os << "]";
+}
+
+void printTo(DotLabel& lbl, const HwConvTileInfo& convTiles) {
+    DotLabel subLbl(lbl);
+    subLbl.appendPair("mode", convTiles.mode);
+    subLbl.appendPair("numDescr", convTiles.numDescr);
+    subLbl.appendPair("outChansPerDescr", convTiles.outChansPerDescr);
+    subLbl.appendPair("lastOutChans", convTiles.lastOutChans);
+    subLbl.appendPair("extendedInputDimC", convTiles.extendedInputDimC);
+    subLbl.appendPair("extendedOutputDimC", convTiles.extendedOutputDimC);
+    subLbl.appendPair("cost", convTiles.cost);
+}
+
+void printTo(std::ostream& os, const HwPoolTileInfo& poolTiles) {
+    os << "[" << std::endl;
+    os << "mode=" << poolTiles.mode << std::endl;
+    os << "numDescr=" << poolTiles.numDescr << std::endl;
+    os << "chansPerDescr=" << poolTiles.chansPerDescr << std::endl;
+    os << "]";
+}
+
+void printTo(DotLabel& lbl, const HwPoolTileInfo& poolTiles) {
+    DotLabel subLbl(lbl);
+    subLbl.appendPair("mode", poolTiles.mode);
+    subLbl.appendPair("numDescr", poolTiles.numDescr);
+    subLbl.appendPair("chansPerDescr", poolTiles.chansPerDescr);
+}
+
+void printTo(std::ostream& os, const HwFullyConnectedTileInfo& fcTiles) {
+    os << "[" << std::endl;
+    os << "mode=" << fcTiles.mode << std::endl;
+    os << "numOutTiles=" << fcTiles.numOutTiles << std::endl;
+    os << "numInSubTiles=" << fcTiles.numInSubTiles << std::endl;
+    os << "workInN=" << fcTiles.workInN << std::endl;
+    os << "workOutN=" << fcTiles.workOutN << std::endl;
+    os << "]";
+}
+
+void printTo(DotLabel& lbl, const HwFullyConnectedTileInfo& fcTiles) {
+    DotLabel subLbl(lbl);
+    subLbl.appendPair("mode", fcTiles.mode);
+    subLbl.appendPair("numOutTiles", fcTiles.numOutTiles);
+    subLbl.appendPair("numInSubTiles", fcTiles.numInSubTiles);
+    subLbl.appendPair("workInN", fcTiles.workInN);
+    subLbl.appendPair("workOutN", fcTiles.workOutN);
+}
+
+//
+// Input<->Output tile calculation
+//
+
+int calcOutputSize(
+        int inputSize,
+        int kernelSize, int kernelStride,
+        int padBefore, int padAfter,
+        bool useCeil) {
+    if (useCeil) {
+        return std::ceil(static_cast<double>(inputSize - kernelSize + padBefore + padAfter) / kernelStride + 1);
+    } else {
+        return (inputSize - kernelSize + padBefore + padAfter) / kernelStride + 1;
+    }
+}
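+
+// Worked example (editor's note): for inputSize = 224, kernelSize = 3,
+// kernelStride = 2, padBefore = padAfter = 1:
+//   integer form: (224 - 3 + 1 + 1) / 2 + 1 = 111 + 1 = 112
+//   ceil form:    ceil(223 / 2.0 + 1) = ceil(112.5) = 113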
+
+//
+// Plane tiles calculation.
+//
+
+std::vector<HwPlaneTileInfo> splitIntoPlaneTilesWithPool(
+        int inputSize,
+        int kernelSize, int kernelStride,
+        int pad,
+        int maxOutputSize) {
+    std::vector<HwPlaneTileInfo> tiles;
+
+    // This is a very specific case: a 3x3p1s1 convolution followed by 2x2s2 pooling, with an even input size
+    IE_ASSERT(kernelSize == 3 && kernelStride == 1 && pad == 1);
+    IE_ASSERT(inputSize % 2 == 0);
+
+    // For this specific case, the outputSize is:
+    int outputSize = inputSize / 2;
+
+    IE_ASSERT(inputSize > 0);
+    IE_ASSERT(outputSize > 0);
+
+    if (outputSize > maxOutputSize) {
+        if (maxOutputSize % 2 == 0) {
+            --maxOutputSize;
+        }
+    }
+
+    IE_ASSERT(maxOutputSize >= 2);
+
+    int inputStartIndex = 0;
+    int outputStartIndex = 0;
+
+    while (true) {
+        int inputEndIndex = std::min<int>(inputStartIndex + 2 * maxOutputSize, inputSize);
+        int outputEndIndex = std::min<int>(outputStartIndex + maxOutputSize, outputSize);
+
+        IE_ASSERT(inputEndIndex > inputStartIndex);
+        IE_ASSERT(outputEndIndex > outputStartIndex);
+
+        int trueInputNeeded = inputEndIndex - inputStartIndex;
+        int outputWithJunk = outputEndIndex - outputStartIndex;
+        int junkBefore = outputStartIndex > 0 ? 1 : 0;
+        int junkAfter = outputEndIndex < outputSize ? 1 : 0;
+
+        outputStartIndex += junkBefore;
+        outputEndIndex -= junkAfter;
+
+        HwPlaneTileInfo info;
+        info.inputWithJunk = trueInputNeeded;
+        info.outputWithJunk = outputWithJunk;
+        info.outputJunkBefore = junkBefore;
+        info.outputJunkAfter = junkAfter;
+        info.inputStartIndex = inputStartIndex;
+        info.inputEndIndex = inputEndIndex;
+        info.outputStartIndex = outputStartIndex;
+        info.outputEndIndex = outputEndIndex;
+
+        tiles.emplace_back(info);
+
+        if (outputEndIndex >= outputSize)
+            break;
+
+        auto newInputStartIndex = inputEndIndex - 4;
+        auto newOutputStartIndex = outputEndIndex - 1;
+
+        inputStartIndex = newInputStartIndex;
+        outputStartIndex = newOutputStartIndex;
+    }
+
+    return tiles;
+}
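+
+// Worked example (editor's note): inputSize = 16 gives outputSize = 8.
+// With maxOutputSize = 5 the loop above emits two overlapping tiles:
+//   tile 1: input [0, 10), output [0, 4), one junk output line after
+//   tile 2: input [6, 16), output [4, 8), one junk output line before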
+
+namespace {
+
+// Note:
+//
+//   * [inputStartIndex, inputEndIndex): the original input range, without accounting for splits
+//   * inputLinesBefore: how many extra elements to take before inputStartIndex to reach the correct starting point
+//   * [outputStartIndex, outputEndIndex): the output range we want to generate, without the surrounding junk
+//   * junkOutputBefore: the junk lines produced before outputStartIndex when starting from inputStartIndex - inputLinesBefore
+//   * junkOutputAfter: the junk lines produced after outputEndIndex
+//
+// The output generated by the hardware is:
+//
+//   [outputStartIndex - junkOutputBefore, outputEndIndex + junkOutputAfter)
+std::tuple<int, int, int, int, int, int, int, int>
+    inputTileForOutputTile(
+        int inputSize,
+        int kernelSize, int kernelStride,
+        int padBefore, int padAfter,
+        int outputStartIndex, int outputEndIndex,
+        bool alignInputTile) {
+    // Negative value encodes the padding
+    int inputStartIndex = outputStartIndex * kernelStride - padBefore;
+    int inputEndIndex = (outputEndIndex - 1) * kernelStride + kernelSize - padBefore;
+
+    int inputLinesBefore = 0;
+    int junkOutputBefore = 0;
+
+    if (inputStartIndex < 0) {
+        // Negative inputStartIndex means that we use the original padding
+
+        inputLinesBefore = 0;
+        inputStartIndex = 0;
+        if (outputStartIndex == 0) {
+            junkOutputBefore = 0;
+        } else {
+            junkOutputBefore = outputStartIndex;
+        }
+    } else {
+        // Non-negative inputStartIndex means that we either have no padding, or we are in the middle of the image
+
+        // Reduce inputLinesBefore to the smallest non-negative value that keeps the kernel phase (inputStartIndex modulo kernelStride)
+        inputLinesBefore = inputStartIndex;
+        while (inputLinesBefore >= kernelStride) {
+            inputLinesBefore -= kernelStride;
+        }
+
+        if (alignInputTile) {
+            const int reqAlignment = 8;
+            while ((inputLinesBefore < inputStartIndex) &&
+                   (inputStartIndex - inputLinesBefore) % reqAlignment != 0) {
+                ++inputLinesBefore;
+            }
+        }
+
+        // Compute the junkOutputBefore
+        junkOutputBefore = (inputLinesBefore + padBefore) / kernelStride;
+    }
+
+    int inputLinesAfter = 0;
+    int junkOutputAfter = 0;
+
+    if (inputEndIndex > inputSize) {
+        // Larger inputEndIndex means that we use the original padding at the bottom of the image
+
+        int paddingUsed = inputEndIndex - inputSize;
+
+        inputLinesAfter = 0;
+        inputEndIndex = inputSize;
+
+        // The hardware keeps computing output lines for as long as the kernel still fits inside the padded image.
+        junkOutputAfter = 0;
+        while (paddingUsed + kernelStride <= padAfter) {
+            paddingUsed += kernelStride;
+            junkOutputAfter += 1;
+        }
+    } else {
+        // This value of inputEndIndex means that we either have no padding, or we are in the middle of the image
+
+        inputLinesAfter = 0;
+
+        // Count how many kernels fit with the provided padding
+        int paddingUsed = 0;
+        junkOutputAfter = 0;
+        while (paddingUsed + kernelStride <= padAfter) {
+            paddingUsed += kernelStride;
+            junkOutputAfter += 1;
+        }
+    }
+
+    return std::make_tuple(inputStartIndex, inputEndIndex,
+                           inputLinesBefore, inputLinesAfter,
+                           outputStartIndex, outputEndIndex,
+                           junkOutputBefore, junkOutputAfter);
+}
+
+int maximizeOutput(
+        int inputSize, int maxOutputSize,
+        int kernelSize, int kernelStride,
+        int padBefore, int padAfter,
+        int outputStartIndex, int outputEndIndex,
+        bool alignInputTile,
+        bool useCeil) {
+    int outputSize = calcOutputSize(inputSize, kernelSize, kernelStride, padBefore, padAfter, useCeil);
+
+    int _ = 0;
+    int junkOutputBefore = 0, junkOutputAfter = 0;
+    std::tie(_, _, _, _, _, _, junkOutputBefore, junkOutputAfter) =
+        inputTileForOutputTile(inputSize, kernelSize, kernelStride, padBefore, padAfter, outputStartIndex, outputEndIndex, alignInputTile);
+
+    int totalOutputSlice = junkOutputBefore + (outputEndIndex - outputStartIndex) + junkOutputAfter;
+
+    auto isValid = [maxOutputSize, outputSize](int totalOutputSlice, int outputEndIndex) -> bool {
+        return totalOutputSlice <= maxOutputSize && outputEndIndex <= outputSize;
+    };
+
+    int extraLines = 0;
+    while (!isValid(totalOutputSlice, outputEndIndex + extraLines)) {
+        extraLines -= 1;
+
+        std::tie(_, _, _, _, _, _, junkOutputBefore, junkOutputAfter) =
+            inputTileForOutputTile(inputSize, kernelSize, kernelStride, padBefore, padAfter, outputStartIndex, outputEndIndex + extraLines, alignInputTile);
+
+        totalOutputSlice = junkOutputBefore + (outputEndIndex + extraLines - outputStartIndex) + junkOutputAfter;
+    }
+
+    return outputEndIndex + extraLines + !isValid(totalOutputSlice, outputEndIndex);
+}
+
+}  // namespace
+
+std::vector<HwPlaneTileInfo> splitIntoPlaneTiles(
+        int inputSize, int outputSize,
+        int kernelSize, int kernelStride,
+        int padBefore, int padAfter,
+        int maxOutputSize,
+        bool alignInputTile,
+        bool useCeil) {
+    IE_ASSERT(inputSize > 0);
+    IE_ASSERT(outputSize > 0);
+    IE_ASSERT(maxOutputSize > 0);
+
+    std::vector<HwPlaneTileInfo> tiles;
+
+    int outputStartIndex = 0;
+
+    while (true) {
+        int outputEndIndex = std::min<int>(outputSize, outputStartIndex + maxOutputSize);
+        IE_ASSERT(outputEndIndex > outputStartIndex);
+
+        int newOutputEndIndex = maximizeOutput(
+            inputSize, maxOutputSize,
+            kernelSize, kernelStride,
+            padBefore, padAfter,
+            outputStartIndex, outputEndIndex,
+            alignInputTile,
+            useCeil);
+        if (newOutputEndIndex <= outputStartIndex) {
+            return std::vector<HwPlaneTileInfo>();
+        }
+
+        int inputStartIndex = 0, inputEndIndex = 0;
+        int inputLinesBefore = 0, inputLinesAfter = 0;
+        int junkOutputBefore = 0, junkOutputAfter = 0;
+        std::tie(inputStartIndex, inputEndIndex,
+                 inputLinesBefore, inputLinesAfter,
+                 outputStartIndex, outputEndIndex,
+                 junkOutputBefore, junkOutputAfter) =
+            inputTileForOutputTile(inputSize, kernelSize, kernelStride, padBefore, padAfter, outputStartIndex, newOutputEndIndex, alignInputTile);
+
+        IE_ASSERT(inputStartIndex >= 0);
+        IE_ASSERT(inputEndIndex >= 0);
+        IE_ASSERT(inputEndIndex > inputStartIndex);
+        IE_ASSERT(inputLinesBefore >= 0);
+        IE_ASSERT(inputLinesAfter >= 0);
+        IE_ASSERT(inputStartIndex - inputLinesBefore >= 0);
+        IE_ASSERT(inputEndIndex + inputLinesAfter >= 0);
+        IE_ASSERT(inputEndIndex + inputLinesAfter <= inputSize);
+        IE_ASSERT(inputLinesBefore + inputEndIndex - inputStartIndex + inputLinesAfter >= 0);
+        IE_ASSERT(junkOutputBefore + outputEndIndex - outputStartIndex + junkOutputAfter >= 0);
+        IE_ASSERT(junkOutputBefore >= 0);
+        IE_ASSERT(junkOutputAfter >= 0);
+        IE_ASSERT(outputStartIndex >= 0);
+        IE_ASSERT(outputEndIndex >= 0);
+        IE_ASSERT(outputEndIndex <= outputSize);
+
+        HwPlaneTileInfo info;
+        info.inputWithJunk = inputLinesBefore + inputEndIndex - inputStartIndex + inputLinesAfter;
+        info.outputWithJunk = junkOutputBefore + outputEndIndex - outputStartIndex + junkOutputAfter;
+        info.outputJunkBefore = junkOutputBefore;
+        info.outputJunkAfter = junkOutputAfter;
+        info.inputStartIndex = inputStartIndex - inputLinesBefore;
+        info.inputEndIndex = inputEndIndex + inputLinesAfter;
+        info.outputStartIndex = outputStartIndex;
+        info.outputEndIndex = outputEndIndex;
+
+        tiles.emplace_back(info);
+
+        if (outputEndIndex >= outputSize)
+            break;
+
+        auto newOutputStartIndex = outputEndIndex;
+        IE_ASSERT(newOutputStartIndex > outputStartIndex);
+
+        outputStartIndex = newOutputStartIndex;
+    }
+
+    return tiles;
+}
+
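A minimal usage sketch (not part of the patch; sizes invented) of driving splitIntoPlaneTiles for a 3x3, stride-1 convolution whose 224-line plane must be produced in tiles of at most 120 output lines; the declarations are assumed to come from vpu/hw/tiling.hpp:

    #include <vpu/hw/tiling.hpp>

    void planeTilingSketch() {
        const auto tiles = vpu::splitIntoPlaneTiles(
            224,    // inputSize
            224,    // outputSize
            3,      // kernelSize
            1,      // kernelStride
            1, 1,   // padBefore, padAfter
            120,    // maxOutputSize per tile
            false,  // alignInputTile
            false); // useCeil
        for (const auto& tile : tiles) {
            // Each tile carries the input span to fetch (junk lines included)
            // and the valid output range [outputStartIndex, outputEndIndex).
            (void)tile.inputStartIndex;
            (void)tile.outputEndIndex;
        }
    }

An empty result signals that no valid tiling exists, mirroring the early return in the loop above.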
+//
+// HW Convolution tiling over output channels.
+//
+
+namespace {
+
+// Returns (status, cost).
+std::tuple<bool, int> checkHwConvMode(
+        int inTileWidth, int inTileHeight, int inTileChannels,
+        int outTileChannels,
+        int kernelSizeX, int kernelSizeY,
+        int kernelStride,
+        HwOpMode mode) {
+    if (inTileWidth > CNN_MAX_INPUT_WIDTH ||
+        inTileHeight > CNN_MAX_INPUT_HEIGHT ||
+        inTileChannels > CNN_MAX_INPUT_CHANNELS ||
+        outTileChannels > CNN_MAX_OUTPUT_CHANNELS) {
+        return std::make_tuple(false, 0);
+    }
+
+    auto noOfBlocks = 1 << static_cast<int>(mode);
+    if (noOfBlocks > inTileChannels) {
+        return std::make_tuple(false, 0);
+    }
+
+    auto inChansPerBlock = inTileChannels / noOfBlocks;
+    if (inChansPerBlock > CNN_MAX_CHANNELS_PER_BLOCK) {
+        return std::make_tuple(false, 0);
+    }
+
+    auto coeffPerWord = CNN_COEFF_PER_WORD_VALUES[static_cast<int32_t>(CNN_COEFF_TYPE)];
+    auto coeffSetSize = kernelSizeX * kernelSizeY;
+    auto coeffLPB = (inChansPerBlock * coeffSetSize + coeffPerWord - 1) / coeffPerWord;
+    if (coeffLPB > CNN_MAX_COEFF_PER_BLOCK) {
+        return std::make_tuple(false, 0);
+    }
+
+    auto bytesPerPixel = CNN_BYTES_PER_PIXEL[static_cast<int32_t>(CNN_DATA_TYPE)];
+    auto pixelsPerCMXLine = 128 / (bytesPerPixel * 8);
+    auto localLineStride = (inTileWidth + (pixelsPerCMXLine - 1)) / pixelsPerCMXLine;
+    auto bytesPerLine = localLineStride * pixelsPerCMXLine * bytesPerPixel;
+    auto sizeOfBlock = CNN_MAX_BYTES >> static_cast<int>(mode);
+    auto chanPerBlock = inTileChannels / noOfBlocks;
+    if (chanPerBlock == 0) {
+        return std::make_tuple(false, 0);
+    }
+
+    auto availableBytesPerChan = sizeOfBlock / chanPerBlock;
+    auto linesPerChan = std::min(availableBytesPerChan / bytesPerLine, inTileHeight);
+    auto minLines = std::min(kernelSizeY / 1 + (kernelStride + 1) + 1 + ((inTileWidth <= 8) ? 1 : 0), inTileHeight);
+    if (minLines > linesPerChan) {
+        return std::make_tuple(false, 0);
+    }
+
+    return std::make_tuple(true, (inTileChannels / noOfBlocks) * kernelSizeX * kernelSizeY + CNN_MODES_COST[static_cast<int32_t>(mode)]);
+}
+
+}  // namespace
+
+HwConvTileInfo splitHwConvIntoOutChannelsTiles(
+        int inTileWidth, int inTileHeight, int inTileChannels,
+        int outTileChannels,
+        int kernelSizeX, int kernelSizeY,
+        int kernelStride) {
+    struct Solution final {
+        HwOpMode mode = HwOpMode::MODE_1_256;
+        int extendedInputDimC = 0;
+        int extendedOutputDimC = 0;
+        int numDescr = 0;
+        int outChansPerDescr = 0;
+        int remOutChans = 0;
+        int cost = std::numeric_limits<int>::max();
+    };
+
+    Solution bestSol;
+
+    for (auto mode : CNN_MODES) {
+        auto ramBlocks = 1 << static_cast<int>(mode);
+
+        auto extendedInputDimC = alignVal(inTileChannels, ramBlocks);
+        auto extendedOutputDimC = alignVal(outTileChannels, 8);
+
+        auto outChansPerDescr = std::min(256 / ramBlocks, extendedOutputDimC);
+
+        bool valid = false;
+        int descCost = 0;
+        std::tie(valid, descCost) = checkHwConvMode(
+            inTileWidth, inTileHeight, extendedInputDimC,
+            outChansPerDescr,
+            kernelSizeX, kernelSizeY,
+            kernelStride,
+            mode);
+
+        if (!valid) {
+            continue;
+        }
+
+        auto numDescr = divUp(outTileChannels, outChansPerDescr);
+        auto remOutChans = outTileChannels - (numDescr - 1) * outChansPerDescr;
+
+        Solution curSol;
+        curSol.mode = mode;
+        curSol.extendedInputDimC = extendedInputDimC;
+        curSol.extendedOutputDimC = extendedOutputDimC;
+        curSol.numDescr = numDescr;
+        curSol.outChansPerDescr = outChansPerDescr;
+        curSol.remOutChans = remOutChans;
+        curSol.cost = numDescr * descCost;
+
+        if (curSol.cost < bestSol.cost || (curSol.cost == bestSol.cost && curSol.numDescr < bestSol.numDescr)) {
+            bestSol = curSol;
+        }
+    }
+
+    if (bestSol.numDescr == 0) {
+        return HwConvTileInfo();
+    }
+
+    IE_ASSERT(bestSol.extendedInputDimC > 0);
+    IE_ASSERT(bestSol.extendedOutputDimC > 0);
+    IE_ASSERT(bestSol.numDescr > 0);
+    IE_ASSERT(bestSol.outChansPerDescr > 0);
+
+    HwConvTileInfo tileInfo;
+    tileInfo.mode = bestSol.mode;
+    tileInfo.numDescr = bestSol.numDescr;
+    tileInfo.outChansPerDescr = bestSol.outChansPerDescr;
+    tileInfo.lastOutChans = bestSol.remOutChans > 0 ? bestSol.remOutChans : bestSol.outChansPerDescr;
+    tileInfo.extendedInputDimC = bestSol.extendedInputDimC;
+    tileInfo.extendedOutputDimC = bestSol.extendedOutputDimC;
+    tileInfo.cost = bestSol.cost;
+
+    return tileInfo;
+}
+
+}  // namespace vpu
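A similar hedged sketch (invented shape) for the output-channel split; HwConvTileInfo and the function are assumed to be visible via vpu/hw/tiling.hpp:

    #include <vpu/hw/tiling.hpp>

    void convTilingSketch() {
        // 56x56 input tile with 64 input and 128 output channels, 3x3 / stride 1.
        const auto tileInfo = vpu::splitHwConvIntoOutChannelsTiles(
            56, 56, 64,  // inTileWidth, inTileHeight, inTileChannels
            128,         // outTileChannels
            3, 3,        // kernelSizeX, kernelSizeY
            1);          // kernelStride
        if (tileInfo.numDescr == 0) {
            // No hardware mode fits; a caller would fall back to software.
        }
    }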
diff --git a/inference-engine/src/vpu/graph_transformer/src/hw/utility.cpp b/inference-engine/src/vpu/graph_transformer/src/hw/utility.cpp
new file mode 100644 (file)
index 0000000..83edd50
--- /dev/null
@@ -0,0 +1,190 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/hw/utility.hpp>
+
+#include <string>
+#include <unordered_map>
+#include <algorithm>
+
+#include <ie_parallel.hpp>
+
+#include <vpu/model/stage.hpp>
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+//
+// HwDescriptors
+//
+
+void printTo(std::ostream& os, const HwOpList& hwOps) {
+    os << "[" << std::endl;
+    os << "size=" << hwOps.vec.size() << std::endl;
+    os << "]";
+}
+
+void printTo(DotLabel& lbl, const HwOpList& hwOps) {
+    DotLabel subLbl(lbl);
+    subLbl.appendPair("size", hwOps.vec.size());
+}
+
+//
+// HwPaddingInfo
+//
+
+HwPaddingInfo getHwPaddingInfo(
+        const DimValues& inDims, const DimValues& outDims,
+        int kernelDimX, int kernelDimY,
+        int kernelStrideX, int kernelStrideY) {
+    int valid_out_x = std::ceil(static_cast<double>(inDims[Dim::W] - kernelDimX + 1) / kernelStrideX);
+    int valid_out_y = std::ceil(static_cast<double>(inDims[Dim::H] - kernelDimY + 1) / kernelStrideY);
+
+    auto pad_along_x = (outDims[Dim::W] - 1) * kernelStrideX + kernelDimX - inDims[Dim::W];
+    auto pad_along_y = (outDims[Dim::H] - 1) * kernelStrideY + kernelDimY - inDims[Dim::H];
+
+    HwPaddingInfo pad;
+
+    pad.left = pad_along_x / 2;
+    pad.right = pad_along_x - pad.left;
+    pad.top = pad_along_y / 2;
+    pad.bottom = pad_along_y - pad.top;
+
+    pad.enable = (outDims[Dim::W] != valid_out_x || outDims[Dim::H] != valid_out_y);
+
+    return pad;
+}
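A worked example with invented sizes: for inDims W = 112, kernel 3, stride 2 and outDims W = 56, pad_along_x = (56 - 1) * 2 + 3 - 112 = 1, giving pad.left = 0 and pad.right = 1, while valid_out_x = ceil(110 / 2) = 55 differs from 56, so pad.enable is set.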
+
+void printTo(std::ostream& os, const HwPaddingInfo& hwPad) {
+    os << "[" << std::endl;
+    os << "enable=" << hwPad.enable << std::endl;
+    if (hwPad.enable) {
+        os << "left=" << hwPad.left << std::endl;
+        os << "right=" << hwPad.right << std::endl;
+        os << "top=" << hwPad.top << std::endl;
+        os << "bottom=" << hwPad.bottom << std::endl;
+    }
+    os << "]";
+}
+
+void printTo(DotLabel& lbl, const HwPaddingInfo& hwPad) {
+    DotLabel subLbl(lbl);
+    subLbl.appendPair("enable", hwPad.enable);
+    if (hwPad.enable) {
+        subLbl.appendPair("left", hwPad.left);
+        subLbl.appendPair("right", hwPad.right);
+        subLbl.appendPair("top", hwPad.top);
+        subLbl.appendPair("bottom", hwPad.bottom);
+    }
+}
+
+//
+// HwWeightsContent
+//
+
+HwWeightsContent::HwWeightsContent(const DataContent::Ptr& origContent,
+        const DataDesc& origWeightsDesc,
+        int numInputChannels,
+        int channelStartIndex) :
+        CalculatedDataContent({origContent}),
+        _origWeightsDesc(origWeightsDesc),
+        _numInputChannels(numInputChannels),
+        _channelStartIndex(channelStartIndex) {
+}
+
+void HwWeightsContent::fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const {
+    VPU_PROFILE(HwWeightsContent);
+
+    IE_ASSERT(_desc.type() == DataType::FP16);
+    IE_ASSERT(baseContents.size() == 1);
+
+    auto KX = _origWeightsDesc.dim(Dim::W);
+    auto KY = _origWeightsDesc.dim(Dim::H);
+    auto IC = _origWeightsDesc.dim(Dim::C);
+    auto OC = _origWeightsDesc.dim(Dim::N);
+    auto origTotalSize = _origWeightsDesc.totalDimSize();
+
+    auto HW_OC_inner = desc().dim(Dim::W);
+    auto HW_OC_outer = desc().dim(Dim::N);
+    IE_ASSERT(HW_OC_outer * HW_OC_inner >= OC);
+
+    auto HW_K = desc().dim(Dim::H);
+    IE_ASSERT(HW_K == KX * KY);
+
+    IE_ASSERT(_channelStartIndex < IC);
+    auto HW_IC = desc().dim(Dim::C);
+    auto HW_IC_real = std::min(_numInputChannels, IC - _channelStartIndex);
+
+    auto srcData = baseContents[0]->get<fp16_t>();
+    IE_ASSERT(srcData != nullptr);
+
+    auto dstData = static_cast<fp16_t*>(tempBuf);
+
+    IE_ASSERT((_channelStartIndex + HW_IC_real) * HW_K + (OC - 1) * HW_K * IC - 1 < origTotalSize);
+    IE_ASSERT((OC - 1) % HW_OC_inner +
+              (HW_K - 1) * HW_OC_inner +
+              (HW_IC_real - 1) * HW_OC_inner * HW_K +
+              ((OC - 1) / 8) * HW_OC_inner * HW_K * HW_IC < _desc.totalDimSize());
+
+    if (KX == 1 && KY == 1) {
+        ie::parallel_for(OC, [=](int oc) {
+            auto oc_inner = oc % HW_OC_inner;
+            auto oc_outer = oc / HW_OC_inner;
+            for (int ic = 0; ic < HW_IC_real; ++ic) {
+                auto srcInd =
+                        (_channelStartIndex + ic) +
+                        oc * IC;
+                auto dstInd =
+                        oc_inner +
+                        ic * HW_OC_inner * HW_K +
+                        oc_outer * HW_OC_inner * HW_K * HW_IC;
+
+                dstData[dstInd] = srcData[srcInd];
+            }
+        });
+    } else {
+        ie::parallel_for(OC, [=](int oc) {
+            auto oc_inner = oc % HW_OC_inner;
+            auto oc_outer = oc / HW_OC_inner;
+            for (int ic = 0; ic < HW_IC_real; ++ic) {
+                for (int ky = 0; ky < KY; ++ky) {
+                    for (int kx = 0; kx < KX; ++kx) {
+                        auto srcInd =
+                                (kx + ky * KX) +
+                                (_channelStartIndex + ic) * HW_K +
+                                oc * HW_K * IC;
+                        auto dstInd =
+                                oc_inner +
+                                (ky * KX + kx) * HW_OC_inner +
+                                ic * HW_OC_inner * HW_K +
+                                oc_outer * HW_OC_inner * HW_K * HW_IC;
+
+                        dstData[dstInd] = srcData[srcInd];
+                    }
+                }
+            }
+        });
+    }
+}
+
+//
+// calculateHwBufferSize
+//
+
+int calculateHwBufferSize(const DimValues& dims, DimsOrder order) {
+    if (order.empty()) {
+        order = DimsOrder::fromNumDims(dims.size());
+    }
+
+    DataDesc desc(DataType::FP16, order, dims);
+
+    if (desc.numDims() > 2) {
+        return calcTotalByteSize(desc, calcStrides(desc, StridesRequirement().add(1, DimStride::Aligned)));
+    } else {
+        IE_ASSERT(desc.dimsOrder() == DimsOrder::NC);
+
+        return calcTotalByteSize(desc, calcStrides(desc, StridesRequirement().add(0, DimStride::Aligned)));
+    }
+}
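A minimal sketch (shape invented; DimValues is assumed to support the brace initialization used elsewhere in this codebase) of querying the buffer size for an FP16 tensor; an empty DimsOrder falls back to DimsOrder::fromNumDims as implemented above:

    #include <vpu/hw/utility.hpp>

    int bufferSizeSketch() {
        const vpu::DimValues dims{{vpu::Dim::W, 56}, {vpu::Dim::H, 56}, {vpu::Dim::C, 64}};
        // 3+ dims: the stride of dimension 1 (here H) is aligned, per the code above.
        return vpu::calculateHwBufferSize(dims, vpu::DimsOrder());
    }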
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data.cpp
new file mode 100644 (file)
index 0000000..e6f75f4
--- /dev/null
@@ -0,0 +1,667 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data.hpp>
+
+#include <array>
+#include <algorithm>
+#include <queue>
+#include <memory>
+#include <vector>
+#include <unordered_map>
+#include <map>
+#include <string>
+#include <set>
+
+#include <precision_utils.h>
+#include <ie_parallel.hpp>
+
+#include <vpu/model/edges.hpp>
+#include <vpu/model/stage.hpp>
+#include <vpu/utils/ie_helpers.hpp>
+#include <vpu/utils/numeric.hpp>
+#include <vpu/backend/backend.hpp>
+
+namespace vpu {
+
+//
+// DataContent
+//
+
+const void* CalculatedDataContent::getRaw() const {
+    if (_temp.empty()) {
+        _temp.resize(getTempBufSize(_baseContents));
+        fillTempBuf(_baseContents, _temp.data());
+        _baseContents.clear();
+    }
+    return _temp.data();
+}
+
+size_t CalculatedDataContent::getTempBufSize(const SmallVector<DataContent::Ptr, 2>&) const {
+    return _desc.totalDimSize() * _desc.elemSize();
+}
+
+namespace {
+
+class IeBlobContent final : public DataContent {
+public:
+    IeBlobContent(const ie::Blob::Ptr& blob, int repeat) : _blob(blob), _repeat(repeat) {}
+
+protected:
+    const void* getRaw() const override {
+        IE_ASSERT(_desc.type() == DataType::FP16);
+
+        if (_blobFp16 == nullptr) {
+            _blobFp16 = getBlobFP16(_blob);
+            _blob.reset();
+        }
+
+        if (_repeat == 1) {
+            return _blobFp16->cbuffer();
+        } else {
+            if (_temp.empty()) {
+                VPU_PROFILE(IeBlobContent);
+
+                IE_ASSERT(_desc.totalDimSize() % _repeat == 0);
+
+                auto origNumElems = _desc.totalDimSize() / _repeat;
+                IE_ASSERT(origNumElems <= _blobFp16->size());
+
+                auto origPtr = _blobFp16->cbuffer().as<const fp16_t*>();
+                IE_ASSERT(origPtr != nullptr);
+
+                _temp.resize(_desc.totalDimSize());
+
+                ie::parallel_for(_repeat, [this, origPtr, origNumElems](int i) {
+                    std::copy_n(origPtr, origNumElems, _temp.data() + i * origNumElems);
+                });
+            }
+
+            return _temp.data();
+        }
+    }
+
+private:
+    mutable ie::Blob::Ptr _blob;
+    int _repeat = 0;
+
+    mutable ie::Blob::Ptr _blobFp16;
+    mutable std::vector<fp16_t> _temp;
+};
+
+}  // namespace
+
+DataContent::Ptr ieBlobContent(const ie::Blob::Ptr& blob, int repeat) {
+    return std::make_shared<IeBlobContent>(blob, repeat);
+}
+
+namespace {
+
+class ReplicatedContent final : public CalculatedDataContent {
+public:
+    ReplicatedContent(float val, int count) : _val(val), _count(count) {}
+
+    ReplicatedContent(const DataContent::Ptr& origContent, int count) :
+            CalculatedDataContent({origContent}), _count(count) {
+    }
+
+protected:
+    size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>& baseContents) const override {
+        if (baseContents.empty()) {
+            return _count * sizeof(fp16_t);
+        } else {
+            IE_ASSERT(baseContents.size() == 1);
+            IE_ASSERT(_desc.totalDimSize() % _count == 0);
+
+            return _desc.totalDimSize() * sizeof(fp16_t);
+        }
+    }
+
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
+        VPU_PROFILE(ReplicatedContent);
+
+        auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+        if (baseContents.empty()) {
+            std::fill_n(dstPtr, _count, ie::PrecisionUtils::f32tof16(_val));
+        } else {
+            IE_ASSERT(baseContents.size() == 1);
+            IE_ASSERT(_desc.totalDimSize() % _count == 0);
+
+            auto origCount = _desc.totalDimSize() / _count;
+            auto origPtr = baseContents[0]->get<fp16_t>();
+            IE_ASSERT(origPtr != nullptr);
+
+            ie::parallel_for(_count, [origPtr, origCount, dstPtr](int i) {
+                std::copy_n(origPtr, origCount, dstPtr + i * origCount);
+            });
+        }
+    }
+
+private:
+    float _val = 0.0f;
+    int _count = 0;
+};
+
+}  // namespace
+
+DataContent::Ptr replicateContent(
+        float val,
+        int count) {
+    return std::make_shared<ReplicatedContent>(val, count);
+}
+
+DataContent::Ptr replicateContent(
+        const DataContent::Ptr& origContent,
+        int count) {
+    return std::make_shared<ReplicatedContent>(origContent, count);
+}
+
+namespace {
+
+class ScaledContent final : public CalculatedDataContent {
+public:
+    ScaledContent(
+            const DataContent::Ptr& origContent,
+            float scale) :
+            CalculatedDataContent({origContent}), _scale(scale) {
+    }
+
+protected:
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
+        VPU_PROFILE(ScaledContent);
+
+        IE_ASSERT(baseContents.size() == 1);
+
+        auto totalSize = _desc.totalDimSize();
+
+        auto origDesc = baseContents[0]->desc();
+        IE_ASSERT(origDesc.type() == DataType::FP16);
+        IE_ASSERT(origDesc.totalDimSize() == totalSize);
+
+        auto srcPtr = baseContents[0]->get<fp16_t>();
+        IE_ASSERT(srcPtr != nullptr);
+
+        auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+        ie::parallel_for(totalSize, [this, srcPtr, dstPtr](int i) {
+            dstPtr[i] = ie::PrecisionUtils::f32tof16(ie::PrecisionUtils::f16tof32(srcPtr[i]) * _scale);
+        });
+    }
+
+private:
+    float _scale = 0.0f;
+};
+
+}  // namespace
+
+DataContent::Ptr scaleContent(
+        const DataContent::Ptr& origContent,
+        float scale) {
+    return std::make_shared<ScaledContent>(origContent, scale);
+}
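A hedged composition sketch (hypothetical helper): the content wrappers are designed to chain, so a blob can be converted to FP16 lazily and re-scaled without an eager copy; the descriptor is attached later, when the content is bound to a Data node (e.g. via Model::addConstData):

    #include <vpu/model/data.hpp>

    vpu::DataContent::Ptr makeScaledWeights(const InferenceEngine::Blob::Ptr& blob) {
        auto raw = vpu::ieBlobContent(blob, 1);       // lazy FP16 conversion
        return vpu::scaleContent(raw, 1.0f / 256.0f); // bake in a 1/256 scale
    }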
+
+//
+// DataNode
+//
+
+Data DataNode::getTopParentData() const {
+    auto topParent = handle_from_this();
+    while (auto nextParent = topParent->parentData()) {
+        topParent = nextParent;
+    }
+    return topParent;
+}
+
+DimValues DataNode::strides() const {
+    if (_parentDataEdge != nullptr) {
+        if (_parentDataEdge->mode() == SharedDataMode::ROI) {
+            return _parentDataEdge->parent()->strides();
+        }
+    }
+
+    return calcStrides(_desc, _requiredStrides);
+}
+
+int DataNode::totalByteSize() const {
+    // It doesn't make sense for child Data.
+    IE_ASSERT(_parentDataEdge == nullptr);
+
+    return calcTotalByteSize(_desc, strides());
+}
+
+int DataNode::elemOffset(const DimValues& coord) const {
+    auto strides = this->strides();
+
+    int res = 0;
+    for (const auto& p : coord) {
+        IE_ASSERT(_desc.dimsOrder().hasDim(p.first));
+        IE_ASSERT(p.second < _desc.dim(p.first));
+        res += p.second * strides[p.first];
+    }
+
+    return res;
+}
+
+int DataNode::lastElemOffset() const {
+    DimValues lastElem;
+    for (const auto& p : _desc.dims()) {
+        lastElem.set(p.first, p.second - 1);
+    }
+    return elemOffset(lastElem);
+}
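For illustration (compact strides assumed): an FP16 CHW tensor with W = 224, H = 224, C = 3 has byte strides W = 2, H = 448, C = 100352, so elemOffset({{Dim::C, 1}, {Dim::H, 2}, {Dim::W, 3}}) = 1 * 100352 + 2 * 448 + 3 * 2 = 101254.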
+
+bool DataNode::checkStrides(const StridesRequirement& reqs) const {
+    return vpu::checkStrides(_desc, strides(), reqs);
+}
+
+void DataNode::updateRequiredStrides(const StridesRequirement& newReqs) {
+    // There shouldn't be any Data<->Data edges.
+    IE_ASSERT(_parentDataEdge == nullptr);
+    IE_ASSERT(_childDataEdges.empty());
+
+    auto prevReqs = _requiredStrides;
+
+    StridesRequirement mergedReqs;
+    for (int i = 0; i < _desc.numDims(); ++i) {
+        auto prevReq = prevReqs.get(i);
+        auto newReq = newReqs.get(i);
+
+        if (prevReq == DimStride::Any &&
+            newReq == DimStride::Any) {
+            continue;
+        }
+
+        // If both requirements are defined, use `prevReq`.
+        // We'll check that both requirements are satisfied at the end.
+        if (prevReq != DimStride::Any) {
+            mergedReqs.add(i, prevReq);
+        } else {
+            mergedReqs.add(i, newReq);
+        }
+    }
+
+    _requiredStrides = mergedReqs;
+
+    IE_ASSERT(checkStrides(prevReqs));
+    IE_ASSERT(checkStrides(newReqs));
+}
+
+void DataNode::clearAllocation() {
+    _location = DataLocation::None;
+    _memoryOffset = 0;
+    attrs().erase("ioBufferOffset");
+}
+
+void DataNode::setMemReqs(MemoryType mem) {
+    if (mem != MemoryType::DDR) {
+        IE_ASSERT(_usage == DataUsage::Intermediate);
+    }
+
+    _memReqs = mem;
+}
+
+void DataNode::setIOInfo(DataLocation location, int ioBufferOffset) {
+    IE_ASSERT(_usage == DataUsage::Input || _usage == DataUsage::Output);
+
+    if (_usage == DataUsage::Input) {
+        IE_ASSERT(location == DataLocation::Input);
+    } else if (_usage == DataUsage::Output) {
+        IE_ASSERT(location == DataLocation::Output);
+    }
+
+    _location = location;
+    _memoryOffset = 0;
+    attrs().set<int>("ioBufferOffset", ioBufferOffset);
+}
+
+void DataNode::setAllocationInfo(DataLocation location, int memoryOffset) {
+    IE_ASSERT(_usage == DataUsage::Const || _usage == DataUsage::Intermediate || _usage == DataUsage::Temp);
+
+    if (_usage == DataUsage::Const) {
+        IE_ASSERT(location == DataLocation::Blob);
+    } else if (_usage == DataUsage::Temp) {
+        IE_ASSERT(location == DataLocation::BSS);
+    }
+
+    _location = location;
+    _memoryOffset = memoryOffset;
+}
+
+void DataNode::serializeNewBuffer(
+        BlobSerializer& serializer,
+        DimsOrder newOrder) {
+    if (newOrder.numDims() == 0) {
+        serializeBufferImpl(serializer, _desc, this->strides());
+    } else {
+        IE_ASSERT(newOrder.numDims() >= _desc.dimsOrder().numDims());
+
+        auto newDims = _desc.dims();
+        auto newStrides = this->strides();
+        auto newPerm = newOrder.toPermutation();
+
+        auto origOrder = _desc.dimsOrder();
+        auto origPerm = origOrder.toPermutation();
+
+        int origPermInd = 0;
+        for (int i = 0; i < newPerm.size(); i++) {
+            auto d = newPerm[i];
+
+            if (origPermInd < origPerm.size() && origPerm[origPermInd] == d) {
+                ++origPermInd;
+                continue;
+            }
+
+            newDims.set(d, 1);
+            if (i == 0) {
+                newStrides.set(d, _desc.elemSize());
+            } else {
+                newStrides.set(d, newStrides[newPerm[i - 1]] * newDims[newPerm[i - 1]]);
+            }
+        }
+        IE_ASSERT(origPermInd == origPerm.size());
+
+        DataDesc newDesc(_desc.type(), newOrder, newDims);
+        serializeBufferImpl(serializer, newDesc, newStrides);
+    }
+}
+
+namespace {
+
+// Decreases all of the order's significant digits by the same amount so that the minimal digit equals 1
+void rebaseOrderToOne(DimsOrder& ord, DimValues& dims, DimValues& strides) {
+    auto perm = ord.toPermutation();
+    IE_ASSERT(!perm.empty());
+
+    auto minDim = MAX_DIMS_64 + 1;
+    for (auto d : perm) {
+        minDim = std::min(minDim, static_cast<int>(d));
+    }
+
+    DimValues newDims;
+    DimValues newStrides;
+
+    for (int i = 0; i < perm.size(); ++i) {
+        auto oldDim = perm[i];
+        auto newDim = static_cast<Dim>(static_cast<int>(oldDim) - minDim);
+
+        perm[i] = newDim;
+        newDims.set(newDim, dims[oldDim]);
+        newStrides.set(newDim, strides[oldDim]);
+    }
+
+    ord = DimsOrder::fromPermutation(perm);
+    dims = newDims;
+    strides = newStrides;
+}
+
+}  // namespace
+
+void DataNode::serializeOldBuffer(
+        const Stage& stage,
+        BlobSerializer& serializer,
+        DimsOrder newOrder,
+        const EnumMap<Dim, std::vector<Dim>>& dimsReloc) {
+    const int OLD_FORMAT_NUM_DIMS = 3;
+
+    auto newDims = _desc.dims();
+    auto newStrides = this->strides();
+
+    //
+    // Apply alternative DimsOrder if any.
+    //
+
+    if (newOrder.numDims() == 0) {
+        newOrder = _desc.dimsOrder();
+    } else {
+        IE_ASSERT(newOrder.numDims() == OLD_FORMAT_NUM_DIMS);
+
+        auto origPerm = _desc.dimsOrder().toPermutation();
+        auto origIndices = _desc.dimsOrder().toIndices();
+        auto origDims = newDims;
+        auto origStrides = newStrides;
+
+        auto newPerm = newOrder.toPermutation();
+
+        newDims.clear();
+        newStrides.clear();
+
+        //
+        // Move real dims and strides according to the relocation map
+        //
+
+        EnumSet<Dim> usedOrigDims;
+        int prevOrigDimInd = -1;
+
+        for (int i = 0; i < newPerm.size(); ++i) {
+            auto newDim = newPerm[i];
+
+            int newDimVal = 1;
+            int newStrideVal = 0;
+            if (i == 0) {
+                newStrideVal = _desc.elemSize();
+            } else {
+                newStrideVal = newStrides[newPerm[i - 1]] * newDims[newPerm[i - 1]];
+            }
+
+            auto it = dimsReloc.find(newDim);
+            if (it != dimsReloc.end()) {
+                auto origDimsToReloc = it->second;
+                IE_ASSERT(!origDimsToReloc.empty());
+
+                for (int j = 0; j < origDimsToReloc.size(); ++j) {
+                    auto origDim = origDimsToReloc[j];
+                    auto origDimInd = origIndices[origDim];
+
+                    IE_ASSERT(usedOrigDims.count(origDim) == 0);
+                    IE_ASSERT(_desc.dimsOrder().hasDim(origDim));
+                    IE_ASSERT(origDimInd == prevOrigDimInd + 1);
+
+                    usedOrigDims.insert(origDim);
+
+                    if (j > 0 && origDims[origDim] > 1) {
+                        IE_ASSERT(checkStride(origStrides, _desc, origDimInd, DimStride::Compact));
+                    }
+
+                    newDimVal *= origDims[origDim];
+                    if (j == 0) {
+                        newStrideVal = origStrides[origDim];
+                    }
+
+                    prevOrigDimInd = origDimInd;
+                }
+            }
+
+            newDims.set(newDim, newDimVal);
+            newStrides.set(newDim, newStrideVal);
+        }
+
+        IE_ASSERT(usedOrigDims.size() == origDims.size());
+        for (auto usedDim : usedOrigDims) {
+            IE_ASSERT(_desc.dimsOrder().hasDim(usedDim));
+        }
+    }
+
+    //
+    // Adjust the number of dims and the dims order to the fixed old 3-dim format
+    //
+
+    auto newPerm = newOrder.toPermutation();
+    IE_ASSERT(!newPerm.empty());
+
+    int maxDimDigit = -1;
+    for (auto d : newPerm) {
+        maxDimDigit = std::max(maxDimDigit, static_cast<int>(d));
+    }
+    IE_ASSERT(maxDimDigit >= 0);
+
+    if (newPerm.size() < OLD_FORMAT_NUM_DIMS) {
+        for (int i = newPerm.size(); i < OLD_FORMAT_NUM_DIMS; i++) {
+            auto lastDim = newPerm.back();
+            auto newLastDim = static_cast<Dim>(++maxDimDigit);
+
+            newDims.set(newLastDim, 1);
+            newStrides.set(newLastDim, newStrides[lastDim] * newDims[lastDim]);
+
+            newPerm.emplace_back(newLastDim);
+        }
+
+        newOrder = DimsOrder::fromPermutation(newPerm);
+    }
+
+    if (newPerm.size() > OLD_FORMAT_NUM_DIMS) {
+        for (int i = OLD_FORMAT_NUM_DIMS; i < newPerm.size(); i++) {
+            IE_ASSERT(newDims[newPerm[i]] == 1);
+            newDims.erase(newPerm[i]);
+            newStrides.erase(newPerm[i]);
+        }
+
+        newPerm.resize(OLD_FORMAT_NUM_DIMS);
+
+        newOrder = DimsOrder::fromPermutation(newPerm);
+    }
+
+    rebaseOrderToOne(newOrder, newDims, newStrides);
+
+    IE_ASSERT(newOrder.numDims() == OLD_FORMAT_NUM_DIMS);
+    IE_ASSERT(newOrder == DimsOrder::HWC || newOrder == DimsOrder::CHW || newOrder == DimsOrder::HCW);
+
+    //
+    // Create new DataDesc
+    //
+
+    DataDesc newDesc(_desc.type(), newOrder, newDims);
+
+    if (stage != nullptr) {
+        for (const auto& inEdge : stage->inputEdges()) {
+            if (inEdge->input() == handle_from_this()) {
+                inEdge->attrs().set<DataDesc>("newDesc", newDesc);
+                inEdge->attrs().set<DimValues>("newStrides", newStrides);
+            }
+        }
+        for (const auto& outEdge : stage->outputEdges()) {
+            if (outEdge->output() == handle_from_this()) {
+                outEdge->attrs().set<DataDesc>("newDesc", newDesc);
+                outEdge->attrs().set<DimValues>("newStrides", newStrides);
+            }
+        }
+    }
+
+    //
+    // Serialize updated data
+    //
+
+    serializeBufferImpl(serializer, newDesc, newStrides);
+}
+
+void DataNode::serializeIOInfo(BlobSerializer& serializer) const {
+    auto ioIdx = attrs().get<int>("ioIdx");
+    serializer.append(checked_cast<uint32_t>(ioIdx));
+
+    auto ioBufferOffset = attrs().get<int>("ioBufferOffset");
+    serializer.append(checked_cast<uint32_t>(ioBufferOffset));
+
+    auto nameLength = checked_cast<uint32_t>(_name.length());
+    auto nameLengthAligned = alignVal(nameLength, 16u);
+
+    serializer.append(nameLengthAligned);
+    for (auto c : _name) {
+        serializer.append(c);
+    }
+    for (uint32_t i = 0; i < nameLengthAligned - nameLength; ++i) {
+        serializer.append(uint8_t(0));
+    }
+
+    serializeDescImpl(serializer, _desc, strides());
+}
+
+void DataNode::serializeDescImpl(
+        BlobSerializer& serializer,
+        const DataDesc& storedDesc,
+        const DimValues& storedStrides) const {
+    IE_ASSERT(storedDesc.numDims() <= MAX_DIMS_32);
+
+    const auto& storedDims = storedDesc.dims();
+
+    auto storedDimsOrder = storedDesc.dimsOrder();
+
+    auto storedPerm = storedDimsOrder.toPermutation();
+    IE_ASSERT(!storedPerm.empty());
+
+    serializer.append(checked_cast<uint32_t>(storedDesc.type()));
+    serializer.append(checked_cast<uint32_t>(storedDimsOrder.code()));
+
+    serializer.append(checked_cast<uint32_t>(storedPerm.size()));
+    for (auto d : storedPerm) {
+        serializer.append(checked_cast<uint32_t>(storedDims[d]));
+    }
+    for (auto d : storedPerm) {
+        serializer.append(checked_cast<uint32_t>(storedStrides[d]));
+    }
+}
+
+void DataNode::serializeBufferImpl(
+        BlobSerializer& serializer,
+        const DataDesc& storedDesc,
+        const DimValues& storedStrides) const {
+    serializeDescImpl(serializer, storedDesc, storedStrides);
+
+    serializer.append(checked_cast<uint32_t>(_location));
+
+    if (_location == DataLocation::Input || _location == DataLocation::Output) {
+        auto topParent = getTopParentData();
+
+        auto ioIdx = topParent->attrs().get<int>("ioIdx");
+        serializer.append(checked_cast<uint32_t>(ioIdx));
+
+        auto parentByteSize = topParent->totalByteSize();
+        serializer.append(checked_cast<uint32_t>(parentByteSize));
+    }
+
+    serializer.append(checked_cast<uint32_t>(_memoryOffset));
+}
+
+void printTo(std::ostream& os, const Data& data) {
+    os << (data == nullptr ? "<null>" : data->name());
+}
+
+//
+// loopOverData
+//
+
+namespace {
+
+struct StopSignal final {};
+
+void loopOverDataImpl(
+        const Data& data,
+        const FuncRef<DataLoopStatus(const Data&)>& op) {
+    for (const auto& childData : data->childDatas()) {
+        auto status = op(childData);
+
+        if (status == DataLoopStatus::NextChild) {
+            loopOverDataImpl(childData, op);
+        } else if (status == DataLoopStatus::Stop) {
+            throw StopSignal();
+        }
+    }
+}
+
+}  // namespace
+
+void loopOverData(
+        const Data& data,
+        const FuncRef<DataLoopStatus(const Data&)>& op) {
+    auto status = op(data);
+    if (status != DataLoopStatus::NextChild)
+        return;
+
+    try {
+        loopOverDataImpl(data, op);
+    } catch (const StopSignal&) {
+        return;
+    }
+}
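A small usage sketch of the traversal helper; the lambda is assumed to bind to FuncRef as elsewhere in this codebase:

    #include <vpu/model/data.hpp>

    int countSubtree(const vpu::Data& root) {
        int count = 0;
        vpu::loopOverData(root, [&count](const vpu::Data&) {
            ++count;                                // visit node
            return vpu::DataLoopStatus::NextChild;  // descend into children
        });
        return count;
    }

Returning DataLoopStatus::Stop from the callback unwinds the recursion through the internal StopSignal exception.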
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_desc.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_desc.cpp
new file mode 100644 (file)
index 0000000..5af56fc
--- /dev/null
@@ -0,0 +1,525 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_desc.hpp>
+
+#include <array>
+#include <algorithm>
+#include <queue>
+#include <memory>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+#include <limits>
+
+#include <precision_utils.h>
+
+#include <vpu/model/edges.hpp>
+#include <vpu/utils/ie_helpers.hpp>
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+//
+// DimsOrder
+//
+
+namespace {
+
+const StorageOrder64 ORDER_MASK = static_cast<StorageOrder64>(-1ull) >> (std::numeric_limits<StorageOrder64>::digits / 4 - MAX_DIMS_64);
+
+}  // namespace
+
+StorageOrder64 maskOrder(StorageOrder64 fullOrder, int size) {
+    StorageOrder64 mask = ~ORDER_MASK | ~(static_cast<StorageOrder64>(-1ull) << (size * 4));
+    return fullOrder & mask;
+}
+
+DimsOrder DimsOrder::C = DimsOrder::fromCode(0x3);
+DimsOrder DimsOrder::NC = DimsOrder::fromCode(0x43);
+DimsOrder DimsOrder::CHW = DimsOrder::fromCode(0x321);
+DimsOrder DimsOrder::HWC = DimsOrder::fromCode(0x213);
+DimsOrder DimsOrder::HCW = DimsOrder::fromCode(0x231);
+DimsOrder DimsOrder::NCHW = DimsOrder::fromCode(0x4321);
+DimsOrder DimsOrder::NHWC = DimsOrder::fromCode(0x4213);
+DimsOrder DimsOrder::NHCW = DimsOrder::fromCode(0x4231);
+
+namespace {
+
+bool isOrderCodeValid(StorageOrder64 order) {
+    if (order == 0) {
+        return false;
+    }
+
+    std::unordered_set<int> usedDims;
+
+    auto orderCopy = order;
+
+    int length = 0;
+
+    for (int i = 0; i < MAX_DIMS_64; i++) {
+        int digit = orderCopy & 0xF;
+        if (digit == 0) {
+            break;
+        }
+
+        --digit;
+
+        // Dimension is used more than once
+        if (usedDims.count(digit) > 0) {
+            return false;
+        }
+        usedDims.insert(digit);
+
+        length = i + 1;
+
+        orderCopy >>= 4;
+    }
+
+    orderCopy = order >> (4 * length);
+
+    // All digits at positions at or above the order length must be UNDEF
+    for (int i = length; i < MAX_DIMS_64; i++) {
+        int digit = orderCopy & 0xF;
+        if (digit != 0) {
+            return false;
+        }
+
+        orderCopy >>= 4;
+    }
+
+    return true;
+}
+
+}  // namespace
+
+DimsOrder DimsOrder::fromCode(StorageOrder64 code) {
+    IE_ASSERT(isOrderCodeValid(code));
+    DimsOrder out;
+    out._code = code;
+    return out;
+}
+
+DimsOrder DimsOrder::fromNumDims(int numDims) {
+    static const StorageOrder64 FULL_ORDER_DEFAULT =
+            maskOrder(static_cast<StorageOrder64>(0x0fedcba987654321ull), MAX_DIMS_64);
+
+    if (numDims == 1) {
+        return DimsOrder::C;
+    } else if (numDims == 2) {
+        return DimsOrder::NC;
+    } else {
+        return DimsOrder::fromCode(maskOrder(FULL_ORDER_DEFAULT, numDims));
+    }
+}
+
+DimsOrder DimsOrder::fromPermutation(const std::vector<Dim>& perm) {
+    StorageOrder64 code = 0;
+
+    for (int sh = 0, i = 0; i < perm.size(); i++, sh += 4) {
+        code += (((static_cast<StorageOrder64>(perm[i]) + 1ull) & 0xFull) << sh);
+    }
+
+    return DimsOrder::fromCode(code);
+}
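A worked example: each digit encodes dim + 1, minor to major, so the permutation {Dim::W, Dim::H, Dim::C} encodes to 0x321 (DimsOrder::CHW above), and {Dim::C, Dim::W, Dim::H} to 0x213 (DimsOrder::HWC):

    auto chw = vpu::DimsOrder::fromPermutation({vpu::Dim::W, vpu::Dim::H, vpu::Dim::C});
    // chw.code() == 0x321, i.e. chw == vpu::DimsOrder::CHW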
+
+int DimsOrder::numDims() const {
+    int out = 0;
+
+    auto code = _code;
+
+    for (int i = 0; i < MAX_DIMS_64; i++) {
+        auto digit = code & 0xF;
+        if (digit == 0)
+            break;
+
+        ++out;
+
+        code >>= 4;
+    }
+
+    return out;
+}
+
+bool DimsOrder::hasDim(Dim d) const {
+    auto dimDigit = static_cast<int>(d) + 1;
+
+    auto code = _code;
+
+    for (int i = 0; i < MAX_DIMS_64; i++) {
+        auto digit = code & 0xF;
+        if (digit == 0)
+            break;
+
+        if (digit == dimDigit) {
+            return true;
+        }
+
+        code >>= 4;
+    }
+
+    return false;
+}
+
+int DimsOrder::dimInd(Dim d) const {
+    auto dimDigit = static_cast<int>(d) + 1;
+
+    auto code = _code;
+
+    for (int i = 0; i < MAX_DIMS_64; i++) {
+        auto digit = code & 0xF;
+        if (digit == 0)
+            break;
+
+        if (digit == dimDigit) {
+            return i;
+        }
+
+        code >>= 4;
+    }
+
+    VPU_THROW_EXCEPTION << "Dim " << d << " is not available in layout " << toString(*this);
+}
+
+std::vector<Dim> DimsOrder::toPermutation() const {
+    std::vector<Dim> out;
+    out.reserve(MAX_DIMS_64);
+
+    auto code = _code;
+
+    for (int i = 0; i < MAX_DIMS_64; i++) {
+        auto digit = code & 0xF;
+        if (digit == 0)
+            break;
+
+        auto d = static_cast<Dim>(digit - 1);
+        out.emplace_back(d);
+
+        code >>= 4;
+    }
+
+    return out;
+}
+
+DimValues DimsOrder::toIndices() const {
+    DimValues out;
+
+    auto code = _code;
+
+    for (int i = 0; i < MAX_DIMS_64; i++) {
+        auto digit = code & 0xF;
+        if (digit == 0)
+            break;
+
+        auto d = static_cast<Dim>(digit - 1);
+        out.set(d, i);
+
+        code >>= 4;
+    }
+
+    return out;
+}
+
+void DimsOrder::moveDim(Dim dim, int newPos) {
+    IE_ASSERT(newPos >= 0 && newPos < numDims());
+
+    int oldPos = dimInd(dim);
+    if (oldPos == newPos)
+        return;
+
+    auto step = (oldPos > newPos) ? -1 : 1;
+
+    auto perm = toPermutation();
+    IE_ASSERT(newPos < perm.size());
+
+    for (int i = oldPos; i != newPos; i += step) {
+        perm[i] = perm[i + step];
+    }
+
+    perm[newPos] = dim;
+
+    _code = fromPermutation(perm).code();
+}
+
+DimsOrder DimsOrder::createMovedDim(Dim dim, int newPos) const {
+    auto copy = *this;
+    copy.moveDim(dim, newPos);
+    return copy;
+}
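For example, DimsOrder::CHW.createMovedDim(Dim::C, 0) moves C to the innermost position, turning the permutation {W, H, C} into {C, W, H}, which is DimsOrder::HWC (code 0x213).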
+
+bool isOrdersCompatible(DimsOrder order1, DimsOrder order2) {
+    auto vec1 = order1.toPermutation();
+    auto vec2 = order2.toPermutation();
+
+    std::sort(vec1.begin(), vec1.end());
+    std::sort(vec2.begin(), vec2.end());
+
+    return vec1 == vec2;
+}
+
+void printTo(std::ostream& os, DimsOrder order) {
+    static std::unordered_map<int, char> DIM_NAMES({
+        {1, 'W'},
+        {2, 'H'},
+        {3, 'C'},
+        {4, 'N'}
+    });
+
+    auto code = order.code();
+
+    int i = MAX_DIMS_64 - 1;
+
+    for (; i >= 0; i--) {
+        auto curDim = (code >> (i * 4)) & 0xF;
+
+        if (curDim != 0)
+            break;
+    }
+
+    for (; i >= 0; i--) {
+        auto curDim = (code >> (i * 4)) & 0xF;
+
+        auto it = DIM_NAMES.find(curDim);
+        if (it != DIM_NAMES.end()) {
+            os << it->second;
+        } else {
+            os << curDim;
+        }
+    }
+}
+
+//
+// DataDesc
+//
+
+DataDesc::DataDesc(const ie::TensorDesc& ieDesc) {
+    //
+    // Parse precision
+    //
+
+    switch (ieDesc.getPrecision()) {
+    case ie::Precision::U8:
+        _type = DataType::U8;
+        break;
+    case ie::Precision::FP16:
+        _type = DataType::FP16;
+        break;
+    case ie::Precision::FP32:
+        _type = DataType::FP32;
+        break;
+    default:
+        VPU_THROW_EXCEPTION << "Unsupported precision " << ieDesc.getPrecision().name();
+    }
+
+    //
+    // Parse dimensions and layout
+    //
+
+    const auto& ieDims = ieDesc.getDims();
+    IE_ASSERT(!ieDims.empty());
+
+    _dimsOrder = DimsOrder::fromNumDims(ieDims.size());
+
+    auto perm = _dimsOrder.toPermutation();
+
+    for (int i = 0; i < perm.size(); ++i) {
+        _dims.set(perm[i], ieDims[ieDims.size() - 1 - i]);
+    }
+}
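To trace the mapping on an invented shape: an IE tensor with dims {1, 3, 224, 224} (major to minor) gets DimsOrder::NCHW from fromNumDims(4), and the loop fills the dims minor to major: W = 224, H = 224, C = 3, N = 1.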
+
+DataDesc::DataDesc(DataType type, DimsOrder dimsOrder, const DimValues& dims) :
+        _type(type), _dimsOrder(dimsOrder), _dims(dims) {
+    IE_ASSERT(_dimsOrder.numDims() == _dims.size());
+    for (const auto& p : _dims) {
+        IE_ASSERT(_dimsOrder.hasDim(p.first));
+    }
+}
+
+int DataDesc::elemSize() const {
+    switch (_type) {
+    case DataType::U8:
+        return sizeof(uint8_t);
+    case DataType::FP16:
+        return sizeof(fp16_t);
+    case DataType::FP32:
+        return sizeof(float);
+    default:
+        VPU_THROW_EXCEPTION << "Unknown data type " << _type;
+    }
+}
+
+void DataDesc::setDim(Dim d, int val) {
+    IE_ASSERT(_dimsOrder.hasDim(d));
+    _dims.set(d, val);
+}
+
+int DataDesc::totalDimSize() const {
+    int total = 1;
+
+    auto perm = _dimsOrder.toPermutation();
+    for (auto d : perm) {
+        total *= _dims[d];
+    }
+
+    return total;
+}
+
+void DataDesc::reorder(DimsOrder dimsOrder) {
+    IE_ASSERT(isOrdersCompatible(_dimsOrder, dimsOrder));
+    _dimsOrder = dimsOrder;
+}
+
+void printTo(std::ostream& os, const DataDesc& desc) {
+    os << "[" << std::endl;
+
+    os << "type=";
+    printTo(os, desc.type());
+    os << std::endl;
+
+    os << "dimsOrder=";
+    printTo(os, desc.dimsOrder());
+    os << std::endl;
+
+    os << "dims=";
+    printTo(os, desc.dims());
+    os << std::endl;
+
+    os << "]";
+}
+
+void printTo(DotLabel& lbl, const DataDesc& desc) {
+    DotLabel subLbl(lbl);
+    subLbl.appendPair("type", desc.type());
+    subLbl.appendPair("dimsOrder", desc.dimsOrder());
+    subLbl.appendPair("dims", desc.dims());
+}
+
+//
+// StridesRequirement
+//
+
+StridesRequirement StridesRequirement::compact() {
+    StridesRequirement reqs;
+    for (int i = 0; i < MAX_DIMS_64; ++i) {
+        reqs.add(i, DimStride::Compact);
+    }
+    return reqs;
+}
+
+void printTo(std::ostream& os, const StridesRequirement& reqs) {
+    os << "[" << std::endl;
+
+    for (int i = 0; i < MAX_DIMS_64; ++i) {
+        auto req = reqs.get(i);
+        if (req != DimStride::Any) {
+            printTo(os, i);
+            os << "=";
+            printTo(os, req);
+            os << std::endl;
+        }
+    }
+
+    os << "]";
+}
+
+void printTo(DotLabel& lbl, const StridesRequirement& reqs) {
+    DotLabel subLbl(lbl);
+    for (int i = 0; i < MAX_DIMS_64; ++i) {
+        auto req = reqs.get(i);
+        if (req != DimStride::Any) {
+            subLbl.appendPair(i, req);
+        }
+    }
+}
+
+namespace {
+
+int applyStrideRequirement(int origStride, int index, const StridesRequirement& reqs) {
+    auto req = reqs.get(index);
+
+    if (req == DimStride::Any || req == DimStride::Compact) {
+        return origStride;
+    } else if (req == DimStride::Aligned) {
+        return alignVal(origStride, STRIDE_ALIGNMENT);
+    } else {
+        VPU_THROW_EXCEPTION << "Unknown stride requirement : " << req;
+    }
+}
+
+}  // namespace
+
+DimValues calcStrides(const DataDesc& desc, const StridesRequirement& reqs) {
+    DimValues strides;
+
+    auto perm = desc.dimsOrder().toPermutation();
+    IE_ASSERT(!perm.empty());
+
+    strides.set(perm[0], desc.elemSize());
+    strides.set(perm[0], applyStrideRequirement(strides[perm[0]], 0, reqs));
+
+    for (int i = 1; i < perm.size(); i++) {
+        strides.set(perm[i], strides[perm[i - 1]] * desc.dim(perm[i - 1]));
+        strides.set(perm[i], applyStrideRequirement(strides[perm[i]], i, reqs));
+    }
+
+    return strides;
+}
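A worked example with an invented shape: for an FP16 CHW descriptor with W = 224, H = 224, C = 3 and an empty StridesRequirement, this yields byte strides W = 2, H = 2 * 224 = 448, C = 448 * 224 = 100352; a StridesRequirement().add(1, DimStride::Aligned) would additionally round the H stride up to STRIDE_ALIGNMENT before the C stride is derived from it.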
+
+bool checkStride(
+        const DimValues& strides,
+        const DataDesc& desc,
+        int ind,
+        DimStride req) {
+    if (req == DimStride::Any) {
+        return true;
+    }
+
+    auto perm = desc.dimsOrder().toPermutation();
+    IE_ASSERT(!perm.empty());
+
+    auto strideVal = strides[perm[ind]];
+
+    if (req == DimStride::Compact) {
+        if (ind == 0) {
+            if (strideVal != desc.elemSize()) {
+                return false;
+            }
+        } else {
+            if (strides[perm[ind]] != strides[perm[ind - 1]] * desc.dim(perm[ind - 1])) {
+                return false;
+            }
+        }
+    } else if (req == DimStride::Aligned) {
+        if (strideVal % STRIDE_ALIGNMENT != 0) {
+            return false;
+        }
+    } else {
+        VPU_THROW_EXCEPTION << "Unsupported stride requirement : " << req;
+    }
+
+    return true;
+}
+
+bool checkStrides(
+        const DataDesc& desc,
+        const DimValues& strides,
+        const StridesRequirement& reqs) {
+    auto perm = desc.dimsOrder().toPermutation();
+    IE_ASSERT(!perm.empty());
+
+    for (int i = 0; i < perm.size(); i++) {
+        if (!checkStride(strides, desc, i, reqs.get(i))) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+int calcTotalByteSize(const DataDesc& desc, const DimValues& strides) {
+    auto perm = desc.dimsOrder().toPermutation();
+    return strides[perm.back()] * desc.dim(perm.back());
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/model.cpp b/inference-engine/src/vpu/graph_transformer/src/model/model.cpp
new file mode 100644 (file)
index 0000000..2d3332c
--- /dev/null
@@ -0,0 +1,1737 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/model.hpp>
+
+#include <cctype>
+#include <memory>
+#include <string>
+#include <set>
+#include <exception>
+#include <algorithm>
+
+#include <details/caseless.hpp>
+#include <vpu/utils/auto_scope.hpp>
+
+namespace vpu {
+
+//
+// Resources
+//
+
+void printTo(std::ostream& os, const Resources& res) {
+    os << "[" << std::endl;
+
+    os << "numCMXSlices=" << res.numCMXSlices << std::endl;
+    os << "numSHAVEs=" << res.numSHAVEs << std::endl;
+    os << "cmxLimit=" << res.cmxLimit << std::endl;
+
+    os << "]";
+}
+
+void printTo(DotLabel& lbl, const Resources& res) {
+    DotLabel subLbl(lbl);
+    subLbl.appendPair("numCMXSlices", res.numCMXSlices);
+    subLbl.appendPair("numSHAVEs", res.numSHAVEs);
+    subLbl.appendPair("cmxLimit", res.cmxLimit);
+}
+
+//
+// Model
+//
+
+void Model::setBatchSize(int batchSize) {
+    // Check `batchSize` value.
+    IE_ASSERT(batchSize >= 1);
+
+    _batchSize = batchSize;
+    _allocator.setBatchSize(batchSize);
+}
+
+void Model::setNumberOfSubGraphs(int numberOfSubGraphs) {
+    // Check `numberOfSubGraphs` value.
+    IE_ASSERT(numberOfSubGraphs >= 1);
+
+    _numberOfSubGraphs = numberOfSubGraphs;
+}
+
+Data Model::addInputData(
+        const std::string& name,
+        const DataDesc& desc) {
+    std::shared_ptr<DataNode> data(new DataNode);
+
+    data->_name = name;
+    data->_usage = DataUsage::Input;
+    data->_desc = desc;
+    data->_model = handle_from_this();
+
+    data->_ptrPosInModel = _dataPtrList.emplace(_dataPtrList.end(), data);
+    _dataList.push_back(data);
+
+    _allocator.setNeedToAllocNonIntermData();
+
+    return data;
+}
+
+Data Model::addOutputData(
+        const std::string& name,
+        const DataDesc& desc) {
+    std::shared_ptr<DataNode> data(new DataNode);
+
+    data->_name = name;
+    data->_usage = DataUsage::Output;
+    data->_desc = desc;
+    data->_model = handle_from_this();
+
+    data->_ptrPosInModel = _dataPtrList.emplace(_dataPtrList.end(), data);
+    _dataList.push_back(data);
+
+    _allocator.setNeedToAllocNonIntermData();
+
+    return data;
+}
+
+Data Model::addConstData(
+        const std::string& name,
+        const DataDesc& desc,
+        const DataContent::Ptr& content) {
+    IE_ASSERT(content != nullptr);
+
+    std::shared_ptr<DataNode> data(new DataNode);
+
+    data->_name = name;
+    data->_usage = DataUsage::Const;
+    data->_desc = desc;
+    data->_model = handle_from_this();
+
+    data->_content = content;
+    content->_desc = desc;
+
+    data->_ptrPosInModel = _dataPtrList.emplace(_dataPtrList.end(), data);
+    _dataList.push_back(data);
+
+    _allocator.setNeedToAllocNonIntermData();
+
+    return data;
+}
+
+Data Model::addNewData(
+        const std::string& name,
+        const DataDesc& desc) {
+    std::shared_ptr<DataNode> data(new DataNode);
+
+    data->_name = name;
+    data->_usage = DataUsage::Intermediate;
+    data->_desc = desc;
+    data->_model = handle_from_this();
+
+    data->_ptrPosInModel = _dataPtrList.emplace(_dataPtrList.end(), data);
+    _dataList.push_back(data);
+
+    return data;
+}
+
+Data Model::addFakeData() {
+    std::shared_ptr<DataNode> data(new DataNode);
+
+    data->_name = "<fake>";
+    data->_usage = DataUsage::Fake;
+    data->_desc = DataDesc({1});
+    data->_model = handle_from_this();
+
+    data->_ptrPosInModel = _dataPtrList.emplace(_dataPtrList.end(), data);
+    _dataList.push_back(data);
+
+    return data;
+}
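A hypothetical sketch (names and shapes invented; DimValues brace initialization assumed) of declaring network I/O through the factory methods above:

    #include <vpu/model/model.hpp>

    void declareIoSketch(vpu::Model& model) {
        auto input = model.addInputData(
            "data",
            vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NCHW,
                          vpu::DimValues{{vpu::Dim::N, 1}, {vpu::Dim::C, 3},
                                         {vpu::Dim::H, 224}, {vpu::Dim::W, 224}}));
        auto output = model.addOutputData(
            "prob",
            vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NC,
                          vpu::DimValues{{vpu::Dim::N, 1}, {vpu::Dim::C, 1000}}));
        (void)input;
        (void)output;
    }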
+
+Data Model::duplicateData(
+        const Data& origData,
+        const std::string& postfix,
+        const DataDesc& newDesc,
+        const DataContent::Ptr& newContent) {
+    //
+    // Check that the objects belong to the same Model.
+    //
+
+    IE_ASSERT(origData->_model.get() == this);
+
+    //
+    // Duplicate Data node.
+    //
+
+    auto newDataUsage = origData->usage();
+    if (newDataUsage == DataUsage::Input ||
+        newDataUsage == DataUsage::Output) {
+        // Duplicates of Input & Output data can only be Intermediate
+        newDataUsage = DataUsage::Intermediate;
+    }
+
+    std::shared_ptr<DataNode> newData(new DataNode);
+
+    newData->_name = origData->name() + postfix;
+    newData->_usage = newDataUsage;
+    newData->_desc = newDesc.numDims() != 0 ? newDesc : origData->desc();
+    newData->_model = handle_from_this();
+
+    if (newDataUsage == DataUsage::Const) {
+        newData->_content = newContent != nullptr ? newContent : origData->content();
+        if (newContent != nullptr) {
+            newContent->_desc = newData->_desc;
+        }
+    }
+
+    newData->attrs().copyFrom(origData->attrs());
+
+    newData->_ptrPosInModel = _dataPtrList.emplace(_dataPtrList.end(), newData);
+    _dataList.push_back(newData);
+
+    return newData;
+}
+
+Stage Model::duplicateStage(
+        const std::string& name,
+        const Stage& origStage,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    //
+    // Check that the new Stage has inputs and outputs.
+    //
+
+    IE_ASSERT(!inputs.empty());
+    IE_ASSERT(!outputs.empty());
+
+    //
+    // Check that the objects belong to the same Model.
+    //
+
+    IE_ASSERT(origStage->_model.get() == this);
+
+    for (const auto& input : inputs) {
+        IE_ASSERT(input->_model.get() == this);
+    }
+
+    for (const auto& output : outputs) {
+        IE_ASSERT(output->_model.get() == this);
+    }
+
+    //
+    // Check that there are no loops.
+    //
+
+    // TODO: more advanced check.
+    for (const auto& output : outputs) {
+        for (const auto& input : inputs) {
+            IE_ASSERT(input != output);
+        }
+    }
+
+    //
+    // Create new Stage.
+    //
+
+    _resetStageOrder = true;
+
+    auto stage = origStage->cloneImpl();
+
+    stage->_name = name;
+    stage->_type = origStage->_type;
+    stage->_origLayer = origStage->_origLayer;
+    stage->_model = handle_from_this();
+
+    _initialStages.emplace(stage);
+
+    for (const auto& input : inputs) {
+        addStageInput(stage, input);
+    }
+    for (const auto& output : outputs) {
+        addStageOutput(stage, output);
+    }
+    for (const auto& tempBufferEdge : origStage->_tempBufferEdges) {
+        addTempBuffer(stage, tempBufferEdge->tempBuffer()->desc());
+    }
+
+    stage->_ptrPosInModel = _stagePtrList.emplace(_stagePtrList.end(), stage);
+
+    return stage;
+}
+
+StageInput Model::addStageInput(
+        const Stage& stage,
+        const Data& data) {
+    //
+    // Check that the objects belong to the same Model.
+    //
+
+    IE_ASSERT(stage->_model.get() == this);
+    IE_ASSERT(data->_model.get() == this);
+
+    // TODO: check for loops in the graph.
+
+    //
+    // Input data can't be Temp.
+    //
+
+    IE_ASSERT(data->_usage != DataUsage::Temp);
+
+    //
+    // Create new Edge.
+    //
+
+    _resetStageOrder = true;
+
+    std::shared_ptr<StageInputEdge> edge(new StageInputEdge);
+
+    edge->_consumer = stage;
+    edge->_input = data;
+    edge->_portInd = stage->_inputEdges.size();
+    edge->_model = handle_from_this();
+
+    edge->_ptrPosInModel = _inEdgePtrList.emplace(_inEdgePtrList.end(), edge);
+    data->_consumerEdges.push_back(edge);
+    stage->_inputEdges.emplace_back(edge);
+
+    //
+    // Stage order helpers
+    //
+
+    if (data->_producerEdge != nullptr) {
+        IE_ASSERT(stage->_parentStageEdge == nullptr);
+        IE_ASSERT(data->_producerEdge->_producer->_parentStageEdge == nullptr);
+        ++data->_producerEdge->_producer->_nextStages[stage];
+        ++stage->_prevStages[data->_producerEdge->_producer];
+    }
+
+    if (stage->_prevStages.empty()) {
+        _initialStages.emplace(stage);
+    } else {
+        _initialStages.erase(stage);
+    }
+
+    return edge;
+}
+
+StageOutput Model::addStageOutput(
+        const Stage& stage,
+        const Data& data) {
+    //
+    // Check that the objects belong to the same Model.
+    //
+
+    IE_ASSERT(stage->_model.get() == this);
+    IE_ASSERT(data->_model.get() == this);
+
+    //
+    // Check that the `data` is free.
+    //
+
+    IE_ASSERT(data->_producerEdge == nullptr);
+
+    if (data->_parentDataEdge != nullptr) {
+        IE_ASSERT(data->_parentDataEdge->_order != SharedDataOrder::ParentWritesToChild);
+    }
+
+    for (const auto& childDataEdge : data->_childDataEdges) {
+        IE_ASSERT(childDataEdge->_order != SharedDataOrder::ChildWritesToParent);
+    }
+
+    //
+    // Output data can be Output, Intermediate, or Fake only.
+    //
+
+    IE_ASSERT(data->_usage == DataUsage::Output || data->_usage == DataUsage::Intermediate || data->_usage == DataUsage::Fake);
+
+    // TODO: check for loops in the graph.
+
+    _resetStageOrder = true;
+
+    std::shared_ptr<StageOutputEdge> edge(new StageOutputEdge);
+
+    edge->_producer = stage;
+    edge->_output = data;
+    edge->_portInd = stage->_outputEdges.size();
+    edge->_model = handle_from_this();
+
+    edge->_ptrPosInModel = _outEdgePtrList.emplace(_outEdgePtrList.end(), edge);
+    stage->_outputEdges.emplace_back(edge);
+    data->_producerEdge = edge;
+
+    //
+    // Stage order helpers
+    //
+
+    for (const auto& consumerEdge : data->_consumerEdges) {
+        IE_ASSERT(stage->_parentStageEdge == nullptr);
+        IE_ASSERT(consumerEdge->_consumer->_parentStageEdge == nullptr);
+        ++consumerEdge->_consumer->_prevStages[stage];
+        ++stage->_nextStages[consumerEdge->_consumer];
+
+        _initialStages.erase(consumerEdge->_consumer);
+    }
+
+    return edge;
+}
+
+StageTempBuffer Model::addTempBuffer(
+        const Stage& stage,
+        const DataDesc& desc) {
+    //
+    // Check that objects belong to the same Model.
+    //
+
+    IE_ASSERT(stage->_model.get() == this);
+
+    //
+    // Create new Data.
+    //
+
+    std::shared_ptr<DataNode> data(new DataNode);
+
+    data->_name = formatString("%s@temp@%d", stage->name(), stage->_tempBufferEdges.size() + 1);
+    data->_usage = DataUsage::Temp;
+    data->_desc = desc;
+    data->_model = handle_from_this();
+
+    data->_ptrPosInModel = _dataPtrList.emplace(_dataPtrList.end(), data);
+    _dataList.push_back(data);
+
+    //
+    // Create new Edge.
+    //
+
+    std::shared_ptr<StageTempBufferEdge> edge(new StageTempBufferEdge);
+
+    edge->_stage = stage;
+    edge->_tempBuffer = data;
+    edge->_portInd = stage->_tempBufferEdges.size();
+    edge->_model = handle_from_this();
+
+    edge->_ptrPosInModel = _tempBufferEdgePtrList.emplace(_tempBufferEdgePtrList.end(), edge);
+    stage->_tempBufferEdges.emplace_back(edge);
+    data->_tempBufferEdge = edge;
+
+    return edge;
+}
+
+void Model::replaceStageInput(
+        const StageInput& edge,
+        const Data& newInput) {
+    //
+    // Check that objects belong to the same Model.
+    //
+
+    IE_ASSERT(edge->_model.get() == this);
+    IE_ASSERT(newInput->_model.get() == this);
+
+    //
+    // Check that there are no loops.
+    //
+
+    // TODO: more advanced check.
+    for (const auto& output : edge->consumer()->outputs()) {
+        IE_ASSERT(newInput != output);
+    }
+
+    //
+    // Input data can't be Temp.
+    //
+
+    IE_ASSERT(newInput->_usage != DataUsage::Temp);
+
+    //
+    // Can't replace Edge from injected Stage.
+    //
+
+    IE_ASSERT(edge->_parentEdge == nullptr);
+    IE_ASSERT(edge->_childEdge == nullptr);
+
+    //
+    // Edge change affects the Stage order.
+    //
+
+    _resetStageOrder = true;
+
+    //
+    // Remove Edge from previous input.
+    //
+
+    edge->_input->_consumerEdges.erase(edge);
+
+    //
+    // Previous stage order helpers
+    //
+
+    if (edge->_input->_producerEdge != nullptr) {
+        auto it1 = edge->_input->_producerEdge->_producer->_nextStages.find(edge->_consumer);
+        IE_ASSERT(it1 != edge->_input->_producerEdge->_producer->_nextStages.end());
+        --it1->second;
+        if (it1->second <= 0) {
+            edge->_input->_producerEdge->_producer->_nextStages.erase(it1);
+        }
+
+        auto it2 = edge->_consumer->_prevStages.find(edge->_input->_producerEdge->_producer);
+        IE_ASSERT(it2 != edge->_consumer->_prevStages.end());
+        --it2->second;
+        if (it2->second <= 0) {
+            edge->_consumer->_prevStages.erase(it2);
+        }
+    }
+
+    //
+    // Set new input.
+    //
+
+    edge->_input = newInput;
+    newInput->_consumerEdges.push_back(edge);
+
+    //
+    // Stage order helpers
+    //
+
+    if (newInput->_producerEdge != nullptr) {
+        IE_ASSERT(edge->_consumer->_parentStageEdge == nullptr);
+        IE_ASSERT(newInput->_producerEdge->_producer->_parentStageEdge == nullptr);
+        ++newInput->_producerEdge->_producer->_nextStages[edge->_consumer];
+        ++edge->_consumer->_prevStages[newInput->_producerEdge->_producer];
+
+        _initialStages.erase(edge->_consumer);
+    }
+
+    if (edge->_consumer->_prevStages.empty()) {
+        _initialStages.emplace(edge->_consumer);
+    } else {
+        _initialStages.erase(edge->_consumer);
+    }
+}
+
+void Model::replaceStageOutput(
+        const StageOutput& edge,
+        const Data& newOutput) {
+    //
+    // Check that objects belong to the same Model.
+    //
+
+    IE_ASSERT(edge->_model.get() == this);
+    IE_ASSERT(newOutput->_model.get() == this);
+
+    //
+    // Check that there are no loops.
+    //
+
+    // TODO: more advanced check.
+    for (const auto& input : edge->producer()->inputs()) {
+        IE_ASSERT(newOutput != input);
+    }
+
+    //
+    // Check that `newOutput` is free.
+    //
+
+    IE_ASSERT(newOutput->_producerEdge == nullptr);
+
+    if (newOutput->_parentDataEdge != nullptr) {
+        IE_ASSERT(newOutput->_parentDataEdge->_order != SharedDataOrder::ParentWritesToChild);
+    }
+
+    for (const auto& childDataEdge : newOutput->_childDataEdges) {
+        IE_ASSERT(childDataEdge->_order != SharedDataOrder::ChildWritesToParent);
+    }
+
+    //
+    // Output data can be Output/Intermediate/Fake.
+    //
+
+    IE_ASSERT(newOutput->_usage == DataUsage::Output ||
+              newOutput->_usage == DataUsage::Intermediate ||
+              newOutput->_usage == DataUsage::Fake);
+
+    //
+    // Can't replace Edge from injected Stage.
+    //
+
+    IE_ASSERT(edge->_parentEdge == nullptr);
+    IE_ASSERT(edge->_childEdge == nullptr);
+
+    //
+    // Edge change affects the Stage order.
+    //
+
+    _resetStageOrder = true;
+
+    //
+    // Remove Edge from previous output.
+    //
+
+    edge->_output->_producerEdge = nullptr;
+
+    //
+    // Previous stage order helpers
+    //
+
+    for (const auto& consumerEdge : edge->_output->_consumerEdges) {
+        auto it1 = consumerEdge->_consumer->_prevStages.find(edge->_producer);
+        IE_ASSERT(it1 != consumerEdge->_consumer->_prevStages.end());
+        --it1->second;
+        if (it1->second <= 0) {
+            consumerEdge->_consumer->_prevStages.erase(it1);
+        }
+
+        auto it2 = edge->_producer->_nextStages.find(consumerEdge->_consumer);
+        IE_ASSERT(it2 != edge->_producer->_nextStages.end());
+        --it2->second;
+        if (it2->second <= 0) {
+            edge->_producer->_nextStages.erase(it2);
+        }
+
+        if (consumerEdge->_consumer->_prevStages.empty()) {
+            _initialStages.emplace(consumerEdge->_consumer);
+        } else {
+            _initialStages.erase(consumerEdge->_consumer);
+        }
+    }
+
+    //
+    // Set new output.
+    //
+
+    edge->_output = newOutput;
+    newOutput->_producerEdge = edge;
+
+    //
+    // Stage order helpers
+    //
+
+    for (const auto& consumerEdge : newOutput->_consumerEdges) {
+        IE_ASSERT(edge->_producer->_parentStageEdge == nullptr);
+        IE_ASSERT(consumerEdge->_consumer->_parentStageEdge == nullptr);
+        ++consumerEdge->_consumer->_prevStages[edge->_producer];
+        ++edge->_producer->_nextStages[consumerEdge->_consumer];
+
+        _initialStages.erase(consumerEdge->_consumer);
+    }
+}
+
+Model::InjectStageHelper::~InjectStageHelper() {
+    //
+    // Check that `done` was called.
+    //
+
+    if (_model != nullptr) {
+        std::terminate();
+    }
+}
+
+Model::InjectStageHelper& Model::InjectStageHelper::parentHW(const Stage& parent) {
+    //
+    // Check that `done` was not called.
+    //
+
+    IE_ASSERT(_model != nullptr);
+
+    //
+    // Check that `parentHW` was not called.
+    //
+
+    IE_ASSERT(_parent == nullptr);
+
+    //
+    // Check that objects belong to the same Model.
+    //
+
+    IE_ASSERT(parent->_model == _model);
+
+    //
+    // Check that `parent` is HW.
+    //
+
+    IE_ASSERT(parent->category() == StageCategory::HW);
+
+    _parent = parent;
+
+    return *this;
+}
+
+Model::InjectStageHelper& Model::InjectStageHelper::childSW(const Stage& child) {
+    //
+    // Check that `done` was not called.
+    //
+
+    IE_ASSERT(_model != nullptr);
+
+    //
+    // Check that `childSW` was not called.
+    //
+
+    IE_ASSERT(_child == nullptr);
+
+    //
+    // Check that objects belong to the same Model.
+    //
+
+    IE_ASSERT(child->_model == _model);
+
+    //
+    // Check that `child` is DMA or SHAVE.
+    //
+
+    IE_ASSERT(child->category() == StageCategory::DMA || child->category() == StageCategory::SHAVE);
+
+    _child = child;
+
+    return *this;
+}
+
+InjectedStage Model::InjectStageHelper::done() {
+    //
+    // Check that `done` was not called.
+    //
+
+    IE_ASSERT(_model != nullptr);
+
+    //
+    // Check that all fields were set.
+    //
+
+    IE_ASSERT(_parent != nullptr);
+    IE_ASSERT(_child != nullptr);
+
+    //
+    // Call actual implementation.
+    //
+
+    auto edge = _model->injectStageImpl(_parent, _child);
+
+    //
+    // Reset the internal state.
+    //
+
+    _model = nullptr;
+
+    return edge;
+}
+
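+// Illustrative use of the InjectStageHelper fluent interface (a sketch; the
+// Model factory method returning the helper is not shown in this diff, so its
+// name below is assumed):
+//
+//     auto injectedEdge = model->injectStage()
+//         .parentHW(hwStage)   // must be a StageCategory::HW stage
+//         .childSW(swStage)    // must be a DMA or SHAVE stage
+//         .done();             // performs the injection and resets the helper
+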
+InjectedStage Model::injectStageImpl(
+        const Stage& parent,
+        const Stage& child) {
+    //
+    // Check that the parent and child were not already injected.
+    //
+
+    IE_ASSERT(parent->_parentStageEdge == nullptr);
+
+    IE_ASSERT(child->_parentStageEdge == nullptr);
+    IE_ASSERT(child->_injectedStageEdges.empty());
+
+    //
+    // New Edge affects the Stage order.
+    //
+
+    _resetStageOrder = true;
+
+    _initialStages.erase(child);
+
+    //
+    // Create new Edge.
+    //
+
+    std::shared_ptr<InjectedStageEdge> edge(new InjectedStageEdge);
+
+    edge->_parent = parent;
+    edge->_child = child.lock();
+    edge->_portInd = parent->_injectedStageEdges.size();
+    edge->_model = handle_from_this();
+
+    edge->_ptrPosInModel = _stageEdgePtrList.emplace(_stageEdgePtrList.end(), edge);
+    parent->_injectedStageEdges.push_back(edge);
+
+    child->_parentStageEdge = edge;
+
+    //
+    // Redirect child inputs to parent.
+    //
+
+    for (const auto& childInEdge : child->_inputEdges) {
+        if (childInEdge->_input->_producerEdge != nullptr) {
+            auto it1 = childInEdge->_input->_producerEdge->_producer->_nextStages.find(childInEdge->_consumer);
+            IE_ASSERT(it1 != childInEdge->_input->_producerEdge->_producer->_nextStages.end());
+            --it1->second;
+            if (it1->second <= 0) {
+                childInEdge->_input->_producerEdge->_producer->_nextStages.erase(it1);
+            }
+
+            auto it2 = childInEdge->_consumer->_prevStages.find(childInEdge->_input->_producerEdge->_producer);
+            IE_ASSERT(it2 != childInEdge->_consumer->_prevStages.end());
+            --it2->second;
+            if (it2->second <= 0) {
+                childInEdge->_consumer->_prevStages.erase(it2);
+            }
+        }
+
+        childInEdge->_input->_consumerEdges.erase(childInEdge);
+
+        auto parentInEdge = addStageInput(parent, childInEdge->_input);
+
+        childInEdge->_parentEdge = parentInEdge;
+        parentInEdge->_childEdge = childInEdge;
+    }
+
+    //
+    // Redirect child outputs to parent.
+    //
+
+    for (const auto& childOutEdge : child->_outputEdges) {
+        for (const auto& consumerEdge : childOutEdge->_output->_consumerEdges) {
+            auto it1 = consumerEdge->_consumer->_prevStages.find(childOutEdge->_producer);
+            IE_ASSERT(it1 != consumerEdge->_consumer->_prevStages.end());
+            --it1->second;
+            if (it1->second <= 0) {
+                consumerEdge->_consumer->_prevStages.erase(it1);
+            }
+
+            auto it2 = childOutEdge->_producer->_nextStages.find(consumerEdge->_consumer);
+            IE_ASSERT(it2 != childOutEdge->_producer->_nextStages.end());
+            --it2->second;
+            if (it2->second <= 0) {
+                childOutEdge->_producer->_nextStages.erase(it2);
+            }
+        }
+
+        childOutEdge->_output->_producerEdge = nullptr;
+
+        auto parentOutEdge = addStageOutput(parent, childOutEdge->_output);
+
+        childOutEdge->_parentEdge = parentOutEdge;
+        parentOutEdge->_childEdge = childOutEdge;
+    }
+
+    //
+    // Redirect child temp buffers to parent.
+    //
+
+    for (const auto& childEdge : child->_tempBufferEdges) {
+        childEdge->_tempBuffer->_tempBufferEdge = nullptr;
+
+        std::shared_ptr<StageTempBufferEdge> parentEdge(new StageTempBufferEdge);
+
+        parentEdge->_stage = parent;
+        parentEdge->_tempBuffer = childEdge->_tempBuffer;
+        parentEdge->_portInd = parent->_tempBufferEdges.size();
+        parentEdge->_model = handle_from_this();
+
+        parentEdge->_ptrPosInModel = _tempBufferEdgePtrList.emplace(_tempBufferEdgePtrList.end(), parentEdge);
+
+        parent->_tempBufferEdges.emplace_back(parentEdge);
+        childEdge->_tempBuffer->_tempBufferEdge = parentEdge;
+
+        childEdge->_parentEdge = parentEdge;
+        parentEdge->_childEdge = childEdge;
+    }
+
+    //
+    // Move child Stage from the Model to parent Stage.
+    //
+
+    IE_ASSERT(child->_ptrPosInModel != _stagePtrList.end());
+    _stagePtrList.erase(child->_ptrPosInModel);
+    child->_ptrPosInModel = _stagePtrList.end();
+
+    if (parent->_prevStages.empty()) {
+        _initialStages.emplace(parent);
+    } else {
+        _initialStages.erase(parent);
+    }
+
+    return edge;
+}
+
+void Model::revertInjection(const InjectedStage& edge) {
+    //
+    // Check that objects belong to the same Model.
+    //
+
+    IE_ASSERT(edge->_model.get() == this);
+
+    auto parentStage = edge->_parent;
+    auto childStage = edge->_child;
+
+    IE_ASSERT(parentStage->_model.get() == this);
+    IE_ASSERT(childStage->_model.get() == this);
+    IE_ASSERT(childStage->_parentStageEdge == edge);
+
+    //
+    // The revert affects the Stage order.
+    //
+
+    _resetStageOrder = true;
+
+    //
+    // Move child Stage from parent Stage to the Model.
+    //
+
+    childStage->_ptrPosInModel = _stagePtrList.emplace(_stagePtrList.end(), childStage);
+
+    //
+    // Remove InjectedStage Edge from parent and child Stage.
+    //
+
+    parentStage->_injectedStageEdges.erase(edge);
+    childStage->_parentStageEdge = nullptr;
+
+    //
+    // Remove Injected Input Edges from parent Stage.
+    //
+
+    int startInd = -1;
+    int endInd = -1;
+
+    for (const auto& inEdge : parentStage->_inputEdges) {
+        if (inEdge->_childEdge == nullptr) {
+            IE_ASSERT(startInd < 0);
+            continue;
+        }
+
+        if (startInd >= 0 && endInd >= 0) {
+            IE_ASSERT(inEdge->_childEdge->_consumer != childStage);
+        }
+
+        if (inEdge->_childEdge->_consumer != childStage) {
+            if (startInd >= 0 && endInd < 0) {
+                endInd = inEdge->_portInd;
+            }
+            continue;
+        }
+
+        if (startInd < 0) {
+            startInd = inEdge->_portInd;
+        }
+        if (inEdge->_portInd == parentStage->_inputEdges.size() - 1) {
+            endInd = inEdge->_portInd + 1;
+        }
+
+        if (inEdge->_input->_producerEdge != nullptr) {
+            auto it1 = inEdge->_input->_producerEdge->_producer->_nextStages.find(inEdge->_consumer);
+            IE_ASSERT(it1 != inEdge->_input->_producerEdge->_producer->_nextStages.end());
+            --it1->second;
+            if (it1->second <= 0) {
+                inEdge->_input->_producerEdge->_producer->_nextStages.erase(it1);
+            }
+
+            auto it2 = inEdge->_consumer->_prevStages.find(inEdge->_input->_producerEdge->_producer);
+            IE_ASSERT(it2 != inEdge->_consumer->_prevStages.end());
+            --it2->second;
+            if (it2->second <= 0) {
+                inEdge->_consumer->_prevStages.erase(it2);
+            }
+        }
+
+        if (inEdge->_childEdge->_input->_producerEdge != nullptr) {
+            IE_ASSERT(inEdge->_childEdge->_consumer->_parentStageEdge == nullptr);
+            IE_ASSERT(inEdge->_childEdge->_input->_producerEdge->_producer->_parentStageEdge == nullptr);
+            ++inEdge->_childEdge->_input->_producerEdge->_producer->_nextStages[inEdge->_childEdge->_consumer];
+            ++inEdge->_childEdge->_consumer->_prevStages[inEdge->_childEdge->_input->_producerEdge->_producer];
+        }
+
+        inEdge->_childEdge->_parentEdge = nullptr;
+        inEdge->_input->_consumerEdges.erase(inEdge);
+        inEdge->_input->_consumerEdges.push_back(inEdge->_childEdge);
+
+        IE_ASSERT(inEdge->_ptrPosInModel != _inEdgePtrList.end());
+        _inEdgePtrList.erase(inEdge->_ptrPosInModel);
+    }
+
+    IE_ASSERT(startInd >= 0 && endInd > startInd && startInd <= parentStage->_inputEdges.size());
+    parentStage->_inputEdges.erase(
+        parentStage->_inputEdges.begin() + startInd,
+        parentStage->_inputEdges.begin() + endInd);
+
+    for (int i = 0; i < parentStage->_inputEdges.size(); ++i) {
+        parentStage->_inputEdges[i]->_portInd = i;
+    }
+
+    //
+    // Remove Injected Output Edges from parent Stage.
+    //
+
+    startInd = -1;
+    endInd = -1;
+
+    for (const auto& outEdge : parentStage->_outputEdges) {
+        if (outEdge->_childEdge == nullptr) {
+            IE_ASSERT(startInd < 0);
+            continue;
+        }
+
+        if (startInd >= 0 && endInd >= 0) {
+            IE_ASSERT(outEdge->_childEdge->_producer != childStage);
+        }
+
+        if (outEdge->_childEdge->_producer != childStage) {
+            if (startInd >= 0 && endInd < 0) {
+                endInd = outEdge->_portInd;
+            }
+            continue;
+        }
+
+        if (startInd < 0) {
+            startInd = outEdge->_portInd;
+        }
+        if (outEdge->_portInd == parentStage->_outputEdges.size() - 1) {
+            endInd = outEdge->_portInd + 1;
+        }
+
+        for (const auto& consumerEdge : outEdge->_output->_consumerEdges) {
+            auto it1 = consumerEdge->_consumer->_prevStages.find(outEdge->_producer);
+            IE_ASSERT(it1 != consumerEdge->_consumer->_prevStages.end());
+            --it1->second;
+            if (it1->second <= 0) {
+                consumerEdge->_consumer->_prevStages.erase(it1);
+            }
+
+            auto it2 = outEdge->_producer->_nextStages.find(consumerEdge->_consumer);
+            IE_ASSERT(it2 != outEdge->_producer->_nextStages.end());
+            --it2->second;
+            if (it2->second <= 0) {
+                outEdge->_producer->_nextStages.erase(it2);
+            }
+        }
+
+        for (const auto& consumerEdge : outEdge->_childEdge->_output->_consumerEdges) {
+            IE_ASSERT(outEdge->_childEdge->_producer->_parentStageEdge == nullptr);
+            IE_ASSERT(consumerEdge->_consumer->_parentStageEdge == nullptr);
+            ++consumerEdge->_consumer->_prevStages[outEdge->_childEdge->_producer];
+            ++outEdge->_childEdge->_producer->_nextStages[consumerEdge->_consumer];
+        }
+
+        outEdge->_childEdge->_parentEdge = nullptr;
+        outEdge->_output->_producerEdge = outEdge->_childEdge;
+
+        IE_ASSERT(outEdge->_ptrPosInModel != _outEdgePtrList.end());
+        _outEdgePtrList.erase(outEdge->_ptrPosInModel);
+    }
+
+    IE_ASSERT(startInd >= 0 && endInd > startInd && startInd <= parentStage->_outputEdges.size());
+    parentStage->_outputEdges.erase(
+        parentStage->_outputEdges.begin() + startInd,
+        parentStage->_outputEdges.begin() + endInd);
+
+    for (int i = 0; i < parentStage->_outputEdges.size(); ++i) {
+        parentStage->_outputEdges[i]->_portInd = i;
+    }
+
+    //
+    // Remove Injected Temp Buffer Edges from parent Stage.
+    //
+
+    startInd = -1;
+    endInd = -1;
+
+    for (const auto& tempBufferEdge : parentStage->_tempBufferEdges) {
+        if (tempBufferEdge->_childEdge == nullptr) {
+            IE_ASSERT(startInd < 0);
+            continue;
+        }
+
+        if (startInd >= 0 && endInd >= 0) {
+            IE_ASSERT(tempBufferEdge->_childEdge->_stage != childStage);
+        }
+
+        if (tempBufferEdge->_childEdge->_stage != childStage) {
+            if (startInd >= 0 && endInd < 0) {
+                endInd = tempBufferEdge->_portInd;
+            }
+            continue;
+        }
+
+        if (startInd < 0) {
+            startInd = tempBufferEdge->_portInd;
+        }
+        if (tempBufferEdge->_portInd == parentStage->_tempBufferEdges.size() - 1) {
+            endInd = tempBufferEdge->_portInd + 1;
+        }
+
+        tempBufferEdge->_childEdge->_parentEdge = nullptr;
+        tempBufferEdge->_tempBuffer->_tempBufferEdge = tempBufferEdge->_childEdge;
+
+        IE_ASSERT(tempBufferEdge->_ptrPosInModel != _tempBufferEdgePtrList.end());
+        _tempBufferEdgePtrList.erase(tempBufferEdge->_ptrPosInModel);
+    }
+
+    if (startInd >= 0) {
+        IE_ASSERT(endInd > startInd && startInd <= parentStage->_tempBufferEdges.size());
+        parentStage->_tempBufferEdges.erase(
+            parentStage->_tempBufferEdges.begin() + startInd,
+            parentStage->_tempBufferEdges.begin() + endInd);
+
+        for (int i = 0; i < parentStage->_tempBufferEdges.size(); ++i) {
+            parentStage->_tempBufferEdges[i]->_portInd = i;
+        }
+    }
+
+    if (parentStage->_prevStages.empty()) {
+        _initialStages.emplace(parentStage);
+    } else {
+        _initialStages.erase(parentStage);
+    }
+
+    if (childStage->_prevStages.empty()) {
+        _initialStages.emplace(childStage);
+    } else {
+        _initialStages.erase(childStage);
+    }
+
+    //
+    // Remove the InjectedStage Edge from the Model.
+    //
+
+    IE_ASSERT(edge->_ptrPosInModel != _stageEdgePtrList.end());
+    _stageEdgePtrList.erase(edge->_ptrPosInModel);
+}
+
+Model::DataEdgeHelper::~DataEdgeHelper() {
+    //
+    // Check that `done` was called.
+    //
+
+    if (_model != nullptr) {
+        std::terminate();
+    }
+}
+
+Model::DataEdgeHelper& Model::DataEdgeHelper::parent(const Data& parent) {
+    //
+    // Check that `done` was not called.
+    //
+
+    IE_ASSERT(_model != nullptr);
+
+    //
+    // Check that `parent` was not called.
+    //
+
+    IE_ASSERT(_parent == nullptr);
+
+    //
+    // Check that objects belong to the same Model.
+    //
+
+    IE_ASSERT(parent->_model == _model);
+
+    _parent = parent;
+
+    return *this;
+}
+
+Model::DataEdgeHelper& Model::DataEdgeHelper::child(const Data& child) {
+    //
+    // Check that `done` was not called.
+    //
+
+    IE_ASSERT(_model != nullptr);
+
+    //
+    // Check that `child` was not called.
+    //
+
+    IE_ASSERT(_child == nullptr);
+
+    //
+    // Check that objects belong to the same Model.
+    //
+
+    IE_ASSERT(child->_model == _model);
+
+    _child = child;
+
+    return *this;
+}
+
+Model::DataEdgeHelper& Model::DataEdgeHelper::mode(SharedDataMode mode) {
+    //
+    // Check that `done` was not called.
+    //
+
+    IE_ASSERT(_model != nullptr);
+
+    //
+    // Check that `mode` was not called.
+    //
+
+    IE_ASSERT(!_modeSet);
+
+    _mode = mode;
+    _modeSet = true;
+
+    return *this;
+}
+
+Model::DataEdgeHelper& Model::DataEdgeHelper::order(SharedDataOrder order) {
+    //
+    // Check that `done` was not called.
+    //
+
+    IE_ASSERT(_model != nullptr);
+
+    //
+    // Check that `order` was not called.
+    //
+
+    IE_ASSERT(!_orderSet);
+
+    _order = order;
+    _orderSet = true;
+
+    return *this;
+}
+
+Model::DataEdgeHelper& Model::DataEdgeHelper::offset(const DimValues& offset) {
+    //
+    // Check that `done` was not called.
+    //
+
+    IE_ASSERT(_model != nullptr);
+
+    //
+    // Check that `offset` was not called.
+    //
+
+    IE_ASSERT(!_offsetSet);
+
+    _offset = offset;
+    _offsetSet = true;
+
+    return *this;
+}
+
+SharedAllocation Model::DataEdgeHelper::done() {
+    //
+    // Check that `done` was not called.
+    //
+
+    IE_ASSERT(_model != nullptr);
+
+    //
+    // Check that all fields were set.
+    //
+
+    IE_ASSERT(_parent != nullptr);
+    IE_ASSERT(_child != nullptr);
+    IE_ASSERT(_modeSet);
+    IE_ASSERT(_orderSet);
+
+    AutoScope autoNullModel([&] {
+        _model = nullptr;
+    });
+
+    //
+    // Call the actual implementation.
+    //
+
+    auto edge = _model->connectDatasImpl(
+        _parent, _child,
+        _mode, _order,
+        _offset);
+
+    //
+    // Reset internal state.
+    //
+
+    _model = nullptr;
+
+    return edge;
+}
+
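+// Illustrative use of the DataEdgeHelper fluent interface (a sketch; the Model
+// factory method returning the helper is not shown in this diff, so its name
+// below is assumed):
+//
+//     auto allocEdge = model->connectDatas()
+//         .parent(parentData)                          // the larger buffer
+//         .child(childData)                            // must be Intermediate
+//         .mode(SharedDataMode::ROI)
+//         .order(SharedDataOrder::ParentWritesToChild)
+//         .offset(roiOffset)                           // used by ROI mode only
+//         .done();
+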
+SharedAllocation Model::connectDatasImpl(
+        const Data& parent,
+        const Data& child,
+        SharedDataMode mode,
+        SharedDataOrder order,
+        const DimValues& offset) {
+    //
+    // Get producer and consumer data.
+    //
+
+    Data producer, consumer;
+    if (order == SharedDataOrder::ChildWritesToParent) {
+        producer = child;
+        consumer = parent;
+    } else if (order == SharedDataOrder::ParentWritesToChild) {
+        producer = parent;
+        consumer = child;
+    } else {
+        VPU_THROW_EXCEPTION << "Invalid data order " << order;
+    }
+
+    //
+    // Child must be Intermediate.
+    //
+
+    VPU_THROW_UNLESS(child->_usage == DataUsage::Intermediate);
+
+    //
+    // Parent can't be Temp or Fake.
+    //
+
+    VPU_THROW_UNLESS(parent->_usage != DataUsage::Temp && parent->_usage != DataUsage::Fake);
+
+    //
+    // Consumer must be accessible from the producer.
+    //
+
+    Stage connectionStage;
+
+    for (const auto& consumerEdge : producer->_consumerEdges) {
+        for (const auto& outEdge : consumerEdge->_consumer->_outputEdges) {
+            if (outEdge->_output == consumer) {
+                connectionStage = consumerEdge->_consumer;
+                break;
+            }
+        }
+
+        if (connectionStage != nullptr) {
+            break;
+        }
+    }
+
+    IE_ASSERT(connectionStage != nullptr);
+
+    //
+    // Connection stage must be special.
+    //
+
+    VPU_THROW_UNLESS(connectionStage->category() == StageCategory::Special);
+
+    //
+    // Special checks for each mode.
+    //
+
+    if (mode == SharedDataMode::ROI) {
+        //
+        // Check connection stage type and that parent has the largest buffer.
+        //
+
+        if (connectionStage->_type == StageType::Concat ||
+            connectionStage->_type == StageType::Expand) {
+            IE_ASSERT(producer == child);
+            IE_ASSERT(consumer == parent);
+        } else if (connectionStage->_type == StageType::Split ||
+                   connectionStage->_type == StageType::Shrink) {
+            IE_ASSERT(producer == parent);
+            IE_ASSERT(consumer == child);
+        } else {
+            VPU_THROW_EXCEPTION
+                    << "Stage type " << connectionStage->_type
+                    << " can't be used for ROI data connection";
+        }
+
+        //
+        // Parent and child must have the same order.
+        //
+
+        VPU_THROW_UNLESS(parent->desc().dimsOrder() == child->desc().dimsOrder());
+
+        //
+        // Offset must be valid.
+        //
+
+        for (const auto& p : offset) {
+            IE_ASSERT(parent->desc().dimsOrder().hasDim(p.first));
+
+            IE_ASSERT(child->desc().dim(p.first) + p.second <= parent->desc().dim(p.first));
+        }
+
+        //
+        // Check strides requirements
+        //
+
+        IE_ASSERT(checkStrides(child->desc(), parent->strides(), child->_requiredStrides));
+        child->resetRequiredStrides();
+    } else if (mode == SharedDataMode::Reshape) {
+        //
+        // Check connection stage type.
+        //
+
+        IE_ASSERT(connectionStage->_type == StageType::Reshape);
+
+        //
+        // Parent and child must have the same data type.
+        //
+
+        IE_ASSERT(parent->desc().type() == child->desc().type());
+
+        //
+        // Parent and child must have the same number of elements.
+        //
+
+        IE_ASSERT(parent->desc().totalDimSize() == child->desc().totalDimSize());
+
+        //
+        // Parent and child must be compact.
+        //
+
+        // TODO: can we weaken this restriction?
+        IE_ASSERT(parent->checkStrides(StridesRequirement::compact()));
+        IE_ASSERT(child->checkStrides(StridesRequirement::compact()));
+    } else {
+        VPU_THROW_EXCEPTION << "Invalid shared data mode " << mode;
+    }
+
+    //
+    // Remove previous edge if any.
+    //
+
+    auto prevEdge = child->_parentDataEdge;
+
+    if (prevEdge != nullptr) {
+        prevEdge->_parent->_childDataEdges.erase(prevEdge);
+    }
+
+    //
+    // Create new Edge.
+    //
+
+    std::shared_ptr<SharedAllocationEdge> edge(new SharedAllocationEdge);
+
+    edge->_parent = parent;
+    edge->_child = child;
+    edge->_connection = connectionStage;
+    edge->_mode = mode;
+    edge->_order = order;
+    edge->_model = handle_from_this();
+    if (mode == SharedDataMode::ROI) {
+        edge->attrs().set<DimValues>("offset", offset);
+    }
+
+    edge->_ptrPosInModel = _dataEdgePtrList.emplace(_dataEdgePtrList.end(), edge);
+    parent->_childDataEdges.push_back(edge);
+
+    child->_parentDataEdge = edge;
+
+    //
+    // Deallocate previous edge if any.
+    //
+
+    if (prevEdge != nullptr) {
+        IE_ASSERT(prevEdge->_ptrPosInModel != _dataEdgePtrList.end());
+        _dataEdgePtrList.erase(prevEdge->_ptrPosInModel);
+    }
+
+    _allocator.setNeedToAllocNonIntermData();
+
+    return edge;
+}
+
+void Model::disconnectStageDatas(const Stage& stage) {
+    //
+    // Check that objects belong to the same Model.
+    //
+
+    IE_ASSERT(stage->_model.get() == this);
+
+    //
+    // This affects the Stage order.
+    //
+
+    _resetStageOrder = true;
+
+    //
+    // Disconnect input datas.
+    //
+
+    for (const auto& inEdge : stage->_inputEdges) {
+        if (inEdge->_input->_producerEdge != nullptr) {
+            auto it1 = inEdge->_input->_producerEdge->_producer->_nextStages.find(inEdge->_consumer);
+            IE_ASSERT(it1 != inEdge->_input->_producerEdge->_producer->_nextStages.end());
+            --it1->second;
+            if (it1->second <= 0) {
+                inEdge->_input->_producerEdge->_producer->_nextStages.erase(it1);
+            }
+
+            auto it2 = inEdge->_consumer->_prevStages.find(inEdge->_input->_producerEdge->_producer);
+            IE_ASSERT(it2 != inEdge->_consumer->_prevStages.end());
+            --it2->second;
+            if (it2->second <= 0) {
+                inEdge->_consumer->_prevStages.erase(it2);
+            }
+        }
+
+        inEdge->_input->_consumerEdges.erase(inEdge);
+
+        IE_ASSERT(inEdge->_ptrPosInModel != _inEdgePtrList.end());
+        _inEdgePtrList.erase(inEdge->_ptrPosInModel);
+    }
+
+    stage->_inputEdges.clear();
+
+    //
+    // Disconnect output datas.
+    //
+
+    for (const auto& outEdge : stage->_outputEdges) {
+        for (const auto& consumerEdge : outEdge->_output->_consumerEdges) {
+            auto it1 = consumerEdge->_consumer->_prevStages.find(outEdge->_producer);
+            IE_ASSERT(it1 != consumerEdge->_consumer->_prevStages.end());
+            --it1->second;
+            if (it1->second <= 0) {
+                consumerEdge->_consumer->_prevStages.erase(it1);
+            }
+
+            auto it2 = outEdge->_producer->_nextStages.find(consumerEdge->_consumer);
+            IE_ASSERT(it2 != outEdge->_producer->_nextStages.end());
+            --it2->second;
+            if (it2->second <= 0) {
+                outEdge->_producer->_nextStages.erase(it2);
+            }
+        }
+
+        outEdge->_output->_producerEdge = nullptr;
+
+        IE_ASSERT(outEdge->_ptrPosInModel != _outEdgePtrList.end());
+        _outEdgePtrList.erase(outEdge->_ptrPosInModel);
+    }
+
+    stage->_outputEdges.clear();
+
+    //
+    // Disconnect temp datas.
+    //
+
+    for (const auto& tempBufferEdge : stage->_tempBufferEdges) {
+        tempBufferEdge->_tempBuffer->_tempBufferEdge = nullptr;
+
+        IE_ASSERT(tempBufferEdge->_ptrPosInModel != _tempBufferEdgePtrList.end());
+        _tempBufferEdgePtrList.erase(tempBufferEdge->_ptrPosInModel);
+    }
+
+    stage->_tempBufferEdges.clear();
+
+    _initialStages.emplace(stage);
+
+    _allocator.setNeedToAllocNonIntermData();
+}
+
+void Model::removeStage(const Stage& stage) {
+    IE_ASSERT(stage->_model.get() == this);
+
+    _resetStageOrder = true;
+
+    disconnectStageDatas(stage);
+
+    _initialStages.erase(stage);
+
+    IE_ASSERT(stage->_ptrPosInModel != _stagePtrList.end());
+    _stagePtrList.erase(stage->_ptrPosInModel);
+}
+
+void Model::cleanUpDatas() {
+    bool needAllocatorPreprocess = false;
+
+    for (const auto& data : datas()) {
+        if (data->_usage == DataUsage::Input) {
+            IE_ASSERT(!data->_consumerEdges.empty());
+            IE_ASSERT(data->_parentDataEdge == nullptr);
+        } else if (data->_usage == DataUsage::Output) {
+            IE_ASSERT(data->_producerEdge != nullptr);
+            IE_ASSERT(data->_parentDataEdge == nullptr);
+        } else if (data->_usage == DataUsage::Temp) {
+            if (data->_tempBufferEdge == nullptr) {
+                _dataList.erase(data);
+
+                IE_ASSERT(data->_ptrPosInModel != _dataPtrList.end());
+                _dataPtrList.erase(data->_ptrPosInModel);
+            }
+        } else {
+            if (data->_consumerEdges.empty() && data->_producerEdge == nullptr) {
+                if (data->usage() != DataUsage::Intermediate) {
+                    needAllocatorPreprocess = true;
+                }
+
+                _dataList.erase(data);
+
+                IE_ASSERT(data->_ptrPosInModel != _dataPtrList.end());
+                _dataPtrList.erase(data->_ptrPosInModel);
+            }
+        }
+    }
+
+    if (needAllocatorPreprocess) {
+        _allocator.setNeedToAllocNonIntermData();
+    }
+}
+
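+// Rebuilds the cached topological order of Stages. The cache is reused until a
+// graph mutation sets _resetStageOrder or a different traversal order is
+// requested; DFS prepends each finished Stage, BFS appends Stages level by
+// level, and both report a cycle via the visited map.
+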
+void Model::buildStageOrder(BuildStageOrder order) const {
+    if ((!_resetStageOrder) && (order == _stageOrder)) {
+        IE_ASSERT(_orderedStageList.size() == _stagePtrList.size());
+        return;
+    }
+
+    VPU_PROFILE(buildStageOrder);
+
+    _orderedStageList.clear();
+    _resetStageOrder = false;
+    _stageOrder = order;
+
+    if (_stagePtrList.empty()) {
+        return;
+    }
+
+    //
+    // Traverse the graph from the initial Stages (recursive DFS or iterative BFS).
+    //
+
+    IE_ASSERT(!_initialStages.empty());
+
+    StageMap<bool> visitedMap;
+    if (order == BuildStageOrder::DFS) {
+        for (const auto& stage : _initialStages) {
+            runDFS(stage, visitedMap);
+        }
+    } else if (order == BuildStageOrder::BFS) {
+        StageList queue(&StageNode::_posInBfsQueue);
+        for (const auto& stage : _initialStages) {
+            queue.push_back(stage);
+            visitedMap[stage] = true;
+        }
+        runBFS(queue, visitedMap);
+    } else {
+        VPU_THROW_EXCEPTION << "Unsupported order " << order;
+    }
+
+    IE_ASSERT(_orderedStageList.size() == _stagePtrList.size());
+
+    int stageInd = 0;
+    for (const auto& stage : _orderedStageList) {
+        stage->_index = stageInd;
+        ++stageInd;
+    }
+}
+
+void Model::runDFS(
+        const Stage& stage,
+        StageMap<bool>& visitedMap) const {
+    IE_ASSERT(stage->_parentStageEdge == nullptr);
+
+    visitedMap[stage] = false;
+
+    for (const auto& nextStage : stage->_nextStages) {
+        IE_ASSERT(nextStage.second > 0);
+
+        auto it = visitedMap.find(nextStage.first);
+
+        if (it != visitedMap.end()) {
+            auto visited = it->second;
+
+            if (!visited) {
+                VPU_THROW_EXCEPTION << "Graph has cycle";
+            }
+
+            continue;
+        }
+
+        runDFS(nextStage.first, visitedMap);
+    }
+
+    visitedMap[stage] = true;
+
+    _orderedStageList.push_front(stage);
+}
+
+void Model::runBFS(
+        StageList& queue,
+        StageMap<bool>& visitedMap) const {
+    while (!queue.empty()) {
+        auto curStage = queue.front();
+        queue.pop_front();
+
+        _orderedStageList.push_back(curStage);
+
+        for (const auto& nextStage : curStage->_nextStages) {
+            auto it = visitedMap.find(nextStage.first);
+
+            if (it != visitedMap.end()) {
+                auto visited = it->second;
+
+                if (!visited) {
+                    VPU_THROW_EXCEPTION << "Graph has cycle";
+                }
+
+                continue;
+            }
+
+            queue.push_back(nextStage.first);
+            visitedMap[nextStage.first] = true;
+        }
+    }
+}
+
+Stage Model::addNewStageImpl(
+    const std::string& name,
+    StageType type,
+    const ie::CNNLayerPtr& origLayer,
+    const DataVector& inputs,
+    const DataVector& outputs,
+    const FuncRef<StagePtr()>& creator) {
+    //
+    // Check that Stage has inputs and outputs.
+    //
+
+    IE_ASSERT(!inputs.empty());
+    IE_ASSERT(!outputs.empty());
+
+    //
+    // Check that Data objects belong to the same Model.
+    //
+
+    for (const auto& input : inputs) {
+        IE_ASSERT(input->_model.get() == this);
+    }
+    for (const auto& output : outputs) {
+        IE_ASSERT(output->_model.get() == this);
+    }
+
+    //
+    // Check that there are no loops.
+    //
+
+    // TODO: more advanced check.
+    for (const auto& output : outputs) {
+        for (const auto& input : inputs) {
+            IE_ASSERT(input != output);
+        }
+    }
+
+    _resetStageOrder = true;
+
+    auto stage = creator();
+
+    stage->_name = name;
+    stage->_type = type;
+    stage->_origLayer = origLayer;
+    stage->_model = handle_from_this();
+
+    for (const auto& input : inputs) {
+        addStageInput(stage, input);
+    }
+    for (const auto& output : outputs) {
+        addStageOutput(stage, output);
+    }
+
+    stage->_ptrPosInModel = _stagePtrList.emplace(_stagePtrList.end(), stage);
+
+    return stage;
+}
+
+void Model::removeUnusedData(const Data& data) {
+    IE_ASSERT(data->numConsumers() == 0);
+
+    if (data->usage() != DataUsage::Intermediate &&
+        data->usage() != DataUsage::Temp) {
+        _allocator.setNeedToAllocNonIntermData();
+    }
+
+    _dataList.erase(data);
+
+    IE_ASSERT(data->_ptrPosInModel != _dataPtrList.end());
+    _dataPtrList.erase(data->_ptrPosInModel);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/stage.cpp b/inference-engine/src/vpu/graph_transformer/src/model/stage.cpp
new file mode 100644 (file)
index 0000000..6735eab
--- /dev/null
@@ -0,0 +1,356 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/stage.hpp>
+
+#include <queue>
+#include <algorithm>
+
+#include <vpu/model/edges.hpp>
+#include <vpu/model/data.hpp>
+#include <vpu/model/model.hpp>
+#include <vpu/backend/blob_format.hpp>
+#include <vpu/compile_env.hpp>
+
+namespace vpu {
+
+void StageNode::setSubGraphNumber(int subGraphNumber) {
+    IE_ASSERT(subGraphNumber >= -1);
+    _subGraphNumber = subGraphNumber;
+}
+
+void StageNode::setNumSHAVEs(int numSHAVEs) {
+    if (_parentStageEdge == nullptr) {
+        //
+        // Check resources assigned to current Model.
+        //
+
+        IE_ASSERT(_model != nullptr);
+
+        auto totalNumSHAVEs = _model->attrs().get<Resources>("resources").numSHAVEs;
+        IE_ASSERT(numSHAVEs <= totalNumSHAVEs);
+    } else {
+        //
+        // Check resources assigned to parent stage.
+        //
+
+        IE_ASSERT(numSHAVEs == _parentStageEdge->parent()->_numSHAVEs);
+    }
+
+    _numSHAVEs = numSHAVEs;
+
+    //
+    // Propagate SHAVEs to injected children.
+    //
+
+    for (const auto& injectedStageEdge : _injectedStageEdges) {
+        injectedStageEdge->child()->_numSHAVEs = _numSHAVEs;
+    }
+}
+
+DataMap<float> StageNode::propagateScaleFactors(
+        const DataMap<float>& inputScales,
+        ScalePropagationStep step) {
+    //
+    // Stage <-> Stage edges are not allowed here.
+    //
+
+    IE_ASSERT(_parentStageEdge == nullptr);
+    IE_ASSERT(_injectedStageEdges.empty());
+
+    //
+    // Check that `inputScales` is valid.
+    //
+
+    IE_ASSERT(inputScales.size() == _inputEdges.size());
+    for (const auto& inEdge : _inputEdges) {
+        IE_ASSERT(inputScales.count(inEdge->input()) > 0);
+    }
+
+    //
+    // Get result from Stage implementation.
+    //
+
+    auto res = propagateScaleFactorsImpl(inputScales, step);
+
+    //
+    // Check that implementation returned valid map.
+    //
+
+#ifndef NDEBUG
+    IE_ASSERT(res.size() <= (_inputEdges.size() + _outputEdges.size()));
+
+    for (const auto& p : res) {
+        auto it1 = std::find_if(_inputEdges.begin(), _inputEdges.end(), [p](const StageInput& inEdge) {
+            return inEdge->input() == p.first;
+        });
+        auto it2 = std::find_if(_outputEdges.begin(), _outputEdges.end(), [p](const StageOutput& outEdge) {
+            return outEdge->output() == p.first;
+        });
+        IE_ASSERT(it1 != _inputEdges.end() || it2 != _outputEdges.end());
+    }
+
+    for (const auto& outEdge : _outputEdges) {
+        IE_ASSERT(res.count(outEdge->output()) > 0);
+    }
+#endif
+
+    return res;
+}
+
+DataMap<DimsOrder> StageNode::propagateDataOrder() const {
+    //
+    // Get result from Stage implementation.
+    //
+
+    auto res = propagateDataOrderImpl();
+
+    //
+    // Merge with the results from injected Stages.
+    //
+
+    for (const auto& injectedStageEdge : _injectedStageEdges) {
+        auto childRes = injectedStageEdge->child()->propagateDataOrder();
+        res.insert(childRes.begin(), childRes.end());
+    }
+
+    //
+    // Check that implementation returned valid map.
+    //
+
+#ifndef NDEBUG
+    IE_ASSERT(res.size() <= (_inputEdges.size() + _outputEdges.size()));
+
+    for (const auto& p : res) {
+        auto it1 = std::find_if(_inputEdges.begin(), _inputEdges.end(), [p](const StageInput& inEdge) {
+            return inEdge->input() == p.first;
+        });
+        auto it2 = std::find_if(_outputEdges.begin(), _outputEdges.end(), [p](const StageOutput& outEdge) {
+            return outEdge->output() == p.first;
+        });
+        IE_ASSERT(it1 != _inputEdges.end() || it2 != _outputEdges.end());
+    }
+#endif
+
+    return res;
+}
+
+DataMap<StridesRequirement> StageNode::getDataStridesRequirements() const {
+    //
+    // Get result from Stage implementation.
+    //
+
+    auto res = getDataStridesRequirementsImpl();
+
+    //
+    // Merge with the results from injected Stages.
+    //
+
+    for (const auto& injectedStageEdge : _injectedStageEdges) {
+        auto childRes = injectedStageEdge->child()->getDataStridesRequirements();
+        res.insert(childRes.begin(), childRes.end());
+    }
+
+    //
+    // Check that implementation returned valid map.
+    //
+
+#ifndef NDEBUG
+    IE_ASSERT(res.size() <= (_inputEdges.size() + _outputEdges.size()));
+
+    for (const auto& p : res) {
+        auto it1 = std::find_if(_inputEdges.begin(), _inputEdges.end(), [p](const StageInput& inEdge) {
+            return inEdge->input() == p.first;
+        });
+        auto it2 = std::find_if(_outputEdges.begin(), _outputEdges.end(), [p](const StageOutput& outEdge) {
+            return outEdge->output() == p.first;
+        });
+        IE_ASSERT(it1 != _inputEdges.end() || it2 != _outputEdges.end());
+    }
+#endif
+
+    return res;
+}
+
+void StageNode::finalizeDataLayout() {
+    //
+    // Stage <-> Stage edges are not allowed here.
+    //
+
+    IE_ASSERT(_parentStageEdge == nullptr);
+    IE_ASSERT(_injectedStageEdges.empty());
+
+    finalizeDataLayoutImpl();
+}
+
+DataMap<BatchSupport> StageNode::getBatchSupportInfo() const {
+    //
+    // Get result from Stage implementation.
+    //
+
+    auto res = getBatchSupportInfoImpl();
+
+    //
+    // Check that implementation returned valid map.
+    //
+
+#ifndef NDEBUG
+    IE_ASSERT(res.size() <= (_inputEdges.size() + _outputEdges.size()));
+
+    for (const auto& p : res) {
+        auto it1 = std::find_if(_inputEdges.begin(), _inputEdges.end(), [p](const StageInput& inEdge) {
+            return inEdge->input() == p.first;
+        });
+        auto it2 = std::find_if(_outputEdges.begin(), _outputEdges.end(), [p](const StageOutput& outEdge) {
+            return outEdge->output() == p.first;
+        });
+        IE_ASSERT(it1 != _inputEdges.end() || it2 != _outputEdges.end());
+    }
+
+    bool hasSplit = false;
+    for (const auto& inEdge : _inputEdges) {
+        if (inEdge->childEdge() != nullptr) {
+            continue;
+        }
+
+        auto input = inEdge->input();
+
+        auto it = res.find(input);
+        if (it != res.end()) {
+            auto curReq = it->second;
+
+            if (curReq == BatchSupport::Split) {
+                hasSplit = true;
+            } else {
+                IE_ASSERT(curReq == BatchSupport::ReplicateConstContent);
+            }
+        }
+    }
+
+    for (const auto& outEdge : _outputEdges) {
+        if (outEdge->childEdge() != nullptr) {
+            continue;
+        }
+
+        auto it = res.find(outEdge->output());
+        if (hasSplit) {
+            IE_ASSERT(it != res.end());
+
+            auto curReq = it->second;
+            IE_ASSERT(curReq == BatchSupport::Split);
+        } else {
+            IE_ASSERT(it == res.end());
+        }
+    }
+#endif
+
+    //
+    // Merge with the results from injected Stages.
+    // Do this after the checks, because parent and child Stages might have different requirements.
+    //
+
+    for (const auto& injectedStageEdge : _injectedStageEdges) {
+        auto childRes = injectedStageEdge->child()->getBatchSupportInfo();
+        res.insert(childRes.begin(), childRes.end());
+    }
+
+    return res;
+}
+
+StageSHAVEsRequirements StageNode::getSHAVEsRequirements() const {
+    //
+    // Get result from Stage implementation.
+    //
+
+    // return max for Myriad2
+    auto compileEnv = CompileEnv::get();
+    if (compileEnv.platform == Platform::MYRIAD_2) {
+        return StageSHAVEsRequirements::NeedMax;
+    }
+
+    auto reqs = getSHAVEsRequirementsImpl();
+
+    //
+    // Merge with the results from injected Stages.
+    //
+
+    for (const auto& injectedStageEdge : injectedStageEdges()) {
+        auto childRes = injectedStageEdge->child()->getSHAVEsRequirements();
+
+        auto resVal = static_cast<int>(reqs);
+        auto childResVal = static_cast<int>(childRes);
+
+        reqs = static_cast<StageSHAVEsRequirements>(std::max(resVal, childResVal));
+    }
+
+    return reqs;
+}
+
+void StageNode::finalCheck() const {
+    finalCheckImpl();
+
+    for (const auto& injectedStageEdge : injectedStageEdges()) {
+        injectedStageEdge->child()->finalCheck();
+    }
+}
+
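+// Appends the stage to the blob. The header and the params section are written
+// with placeholder sizes that are patched afterwards via overWriteTailSize
+// (presumably the number of bytes appended since the recorded position).
+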
+void StageNode::serialize(BlobSerializer& serializer) const {
+    // Check that we don't serialize a Special stage.
+    IE_ASSERT(category() != StageCategory::Special);
+
+    mv_stage_header stageHdr = {
+        checked_cast<uint32_t>(0u),
+        checked_cast<uint32_t>(_type),
+        checked_cast<uint32_t>(_numSHAVEs)
+    };
+
+    auto stageHeaderPos = serializer.append(stageHdr);
+
+    auto paramsPos = serializer.append(static_cast<uint32_t>(0));
+    serializeParamsImpl(serializer);
+    serializer.overWriteTailSize(paramsPos);
+
+    serializeDataImpl(serializer);
+
+    serializer.append(stageHdr.stage_type);
+    serializer.append(STAGE_BORDER_SYMBOL);
+
+    serializer.overWriteTailSize(stageHeaderPos);
+}
+
+DataMap<float> StageNode::propagateScaleFactorsImpl(
+        const DataMap<float>&,
+        ScalePropagationStep) {
+    //
+    // Default implementation assumes no scaling support.
+    //
+
+    DataMap<float> out;
+
+    for (const auto& inEdge : _inputEdges) {
+        out[inEdge->input()] = 1.0f;
+    }
+    for (const auto& outEdge : _outputEdges) {
+        out[outEdge->output()] = 1.0f;
+    }
+
+    return out;
+}
+
+StageSHAVEsRequirements StageNode::getSHAVEsRequirementsImpl() const {
+    if (category() == StageCategory::SHAVE) {
+        return StageSHAVEsRequirements::NeedMax;
+    } else {
+        return StageSHAVEsRequirements::NotNeeded;
+    }
+}
+
+void printTo(std::ostream& os, const Stage& stage) {
+    os << (stage == nullptr ? "<null>" : stage->name());
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/network_config.cpp b/inference-engine/src/vpu/graph_transformer/src/network_config.cpp
new file mode 100644 (file)
index 0000000..40cf567
--- /dev/null
@@ -0,0 +1,337 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/network_config.hpp>
+
+#include <sstream>
+#include <string>
+#include <vector>
+#include <fstream>
+#include <map>
+#include <unordered_map>
+
+#include <pugixml.hpp>
+
+#include <vpu/utils/extra.hpp>
+#include <vpu/utils/string.hpp>
+
+namespace vpu {
+
+namespace {
+
+template <class Set>
+void parseStringSet(const std::string& str, Set& set) {
+    splitStringList(str, set, ',');
+}
+
+std::vector<std::string> splitString(const std::string& str, char sep) {
+    std::vector<std::string> out;
+    splitStringList(str, out, sep);
+    return out;
+}
+
+template <typename K, typename V>
+V getValFromMap(const std::unordered_map<K, V>& map, const K& key, const V& def) {
+    auto it = map.find(key);
+    if (it == map.end())
+        return def;
+    return it->second;
+}
+
+template <typename K, typename V>
+const V* getValFromMap(const std::unordered_map<K, V>& map, const K& key) {
+    auto it = map.find(key);
+    if (it == map.end())
+        return nullptr;
+    return &it->second;
+}
+
+template <typename K1, typename K2, typename V>
+const V* getValFromMap(const std::unordered_map<K1, std::unordered_map<K2, V>>& map,
+                       const K1& key1, const K2& key2) {
+    auto it1 = map.find(key1);
+    if (it1 == map.end())
+        return nullptr;
+
+    auto it2 = it1->second.find(key2);
+    if (it2 == it1->second.end())
+        return nullptr;
+
+    return &it2->second;
+}
+
+float parseScale(const std::string& val) {
+    float scale = 0.0f;
+    try {
+        scale = std::stof(val);
+    } catch (...) {
+        VPU_THROW_EXCEPTION
+            << "Invalid scale value " << val;
+    }
+
+    if (scale <= 0.0f) {
+        VPU_THROW_EXCEPTION
+            << "Invalid scale value " << scale;
+    }
+
+    return scale;
+}
+
+template <typename K> K xmlAttrToVal(const pugi::xml_attribute& attr);
+template<> std::string xmlAttrToVal<std::string>(const pugi::xml_attribute& attr) {
+    return attr.as_string("");
+}
+template<> int xmlAttrToVal<int>(const pugi::xml_attribute& attr) {
+    return attr.as_int(0);
+}
+
+template <typename K>
+std::map<K, pugi::xml_node> xmlCollectChilds(const pugi::xml_node& xmlParent,
+                                             const char* childName,
+                                             const char* keyName) {
+    std::map<K, pugi::xml_node> out;
+
+    for (auto xmlChild = xmlParent.child(childName);
+         !xmlChild.empty();
+         xmlChild = xmlChild.next_sibling(childName)) {
+        auto xmlKey = xmlChild.attribute(keyName);
+        if (xmlKey.empty()) {
+            VPU_THROW_EXCEPTION << "Missing " << keyName << " attribute in " << childName;
+        }
+
+        auto key = xmlAttrToVal<K>(xmlKey);
+
+        if (out.count(key) != 0) {
+            VPU_THROW_EXCEPTION
+                << "" << xmlParent.name() << " already has " << childName
+                << " with " << keyName << " " << key;
+        }
+
+        out[key] = xmlChild;
+    }
+
+    return out;
+}
+
+template <typename T> T parseVal(const std::string& val);
+template <> bool parseVal<bool>(const std::string& val) {
+    ie::details::CaselessEq<std::string> cmp;
+
+    if (cmp(val, "true"))
+        return true;
+    if (cmp(val, "false"))
+        return false;
+
+    VPU_THROW_EXCEPTION << "Invalid bool value " << val;
+}
+template <> float parseVal<float>(const std::string& val) {
+    return parseScale(val);
+}
+template <> int parseVal<int>(const std::string& val) {
+    int size = 0;
+    try {
+        size = std::stoi(val);
+    } catch (...) {
+        VPU_THROW_EXCEPTION
+            << "Invalid integer value " << val;
+    }
+
+    if (size <= 0) {
+        VPU_THROW_EXCEPTION
+            << "Invalid integer value " << val;
+    }
+
+    return size;
+}
+
+template <typename K, typename V>
+void xmlUpdateMap(const pugi::xml_node& xmlNode,
+                  std::unordered_map<K, V>& map,
+                  const K& key) {
+    if (xmlNode.empty())
+        return;
+
+    map[key] = parseVal<V>(xmlNode.child_value());
+}
+
+}  // namespace
+
+bool NetworkConfig::skipAllLayers() const {
+    if (_noneLayers.size() == 1) {
+        auto val = *_noneLayers.begin();
+        return val == "*";
+    }
+    return false;
+}
+
+bool NetworkConfig::hwDisabled(const std::string& layerName) const {
+    if (!_hwWhiteList.empty()) {
+        return _hwWhiteList.count(layerName) == 0;
+    }
+
+    if (!_hwBlackList.empty()) {
+        return _hwBlackList.count(layerName) != 0;
+    }
+
+    return false;
+}
+
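+// The VPU_NETWORK_CONFIG value parsed below is a comma-separated <key>=<value>
+// list, e.g. (names and values are illustrative):
+//
+//     file=net_config.xml,data=conv1_out,scale=64
+//
+// `file` names an XML config file (parsed further below), `data` selects a
+// data object, and `scale` assigns a scale factor to the selected data.
+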
+void NetworkConfig::parse(const CompilationConfig& config) {
+    ie::details::CaselessEq<std::string> cmp;
+
+    parseStringSet(config.noneLayers, _noneLayers);
+    parseStringSet(config.hwWhiteList, _hwWhiteList);
+    parseStringSet(config.hwBlackList, _hwBlackList);
+
+    if (config.networkConfig.empty())
+        return;
+
+    auto props = splitString(config.networkConfig, ',');
+
+    std::string configFileName;
+    std::string curDataName;
+    bool hasOption = false;
+
+    auto checkChildOption = [&curDataName, &hasOption]() {
+        if (!curDataName.empty() && !hasOption) {
+            VPU_THROW_EXCEPTION
+                << "Incorrect VPU_NETWORK_CONFIG option : "
+                << "data " << curDataName << " doesn't have any option";
+        }
+    };
+
+    for (const auto& prop : props) {
+        auto propTokens = splitString(prop, '=');
+        if (propTokens.size() != 2) {
+            VPU_THROW_EXCEPTION
+                << "Incorrect VPU_NETWORK_CONFIG option : "
+                << "it should be <key>=<value> list separated by `,`";
+        }
+
+        auto propName = propTokens[0];
+        auto propValue = propTokens[1];
+
+        if (propName == "file") {
+            if (!configFileName.empty()) {
+                VPU_THROW_EXCEPTION
+                    << "Incorrect VPU_NETWORK_CONFIG option : "
+                    << "can't use `file` key more than once";
+            }
+
+            checkChildOption();
+
+            configFileName = propValue;
+
+            continue;
+        }
+
+        if (propName == "data") {
+            checkChildOption();
+
+            curDataName = propValue;
+            hasOption = false;
+
+            continue;
+        }
+
+        if (propName == "scale") {
+            if (curDataName.empty()) {
+                VPU_THROW_EXCEPTION
+                    << "Incorrect VPU_NETWORK_CONFIG option : "
+                    << "missing data name for scale parameter";
+            }
+
+            if (_dataScale.count(curDataName) != 0) {
+                VPU_THROW_EXCEPTION
+                    << "Incorrect VPU_NETWORK_CONFIG option : "
+                    << "data " << curDataName << " already have scale factor";
+            }
+
+            _dataScale[curDataName] = parseScale(propValue);
+
+            curDataName = "";
+            hasOption = false;
+
+            continue;
+        }
+
+        VPU_THROW_EXCEPTION
+            << "Incorrect VPU_NETWORK_CONFIG option : "
+            << "unknown option " << propName;
+    }
+
+    checkChildOption();
+
+    if (configFileName.empty())
+        return;
+
+    pugi::xml_document xmlDoc;
+    auto xmlRes = xmlDoc.load_file(configFileName.c_str());
+    if (xmlRes.status != pugi::status_ok) {
+        std::ifstream file(configFileName);
+        if (!file.is_open()) {
+            VPU_THROW_EXCEPTION << "Can't open network config file " << configFileName;
+        }
+
+        std::string str((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+
+        // Convert the absolute parse-error offset into a line/column pair.
+        size_t line = 1;
+        size_t pos = 0;
+        size_t absOffset = 0;
+        for (auto token : str) {
+            if (absOffset >= static_cast<size_t>(xmlRes.offset))
+                break;
+
+            if (token != '\n') {
+                pos++;
+            } else {
+                line++;
+                pos = 0;
+            }
+
+            ++absOffset;
+        }
+
+        VPU_THROW_EXCEPTION
+            << "Error loading XML file: " << configFileName
+            << ", " << xmlRes.description()
+            << " at line: " << line << " pos: " << pos;
+    }
+
+    auto xmlRoot = xmlDoc.document_element();
+    std::string docName(xmlRoot.name());
+    if (docName != "vpu_net_config") {
+        VPU_THROW_EXCEPTION
+            << "Invalid network config file " << configFileName
+            << " : is is not a VPU network config";
+    }
+
+    auto docVersion = xmlRoot.attribute("version").as_int(0);
+    if (docVersion != 1) {
+        VPU_THROW_EXCEPTION
+            << "Invalid network config file " << configFileName
+            << " : unsupported version " << docVersion;
+    }
+
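+    // An illustrative sketch of the expected file layout, as consumed below
+    // (names and values are made up):
+    //
+    //     <vpu_net_config version="1">
+    //         <data>
+    //             <data name="conv1_out">
+    //                 <scale>64</scale>
+    //             </data>
+    //         </data>
+    //         <layers>
+    //             <layer name="conv1">
+    //                 <hw>
+    //                     <enable>false</enable>
+    //                 </hw>
+    //             </layer>
+    //         </layers>
+    //     </vpu_net_config>
+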
+    auto datas = xmlCollectChilds<std::string>(xmlRoot.child("data"), "data", "name");
+    auto layers = xmlCollectChilds<std::string>(xmlRoot.child("layers"), "layer", "name");
+
+    for (const auto& dataInfo : datas) {
+        const auto& dataName = dataInfo.first;
+        const auto& xmlData = dataInfo.second;
+
+        xmlUpdateMap(xmlData.child("scale"), _dataScale, dataName);
+    }
+
+    for (const auto& layerInfo : layers) {
+        const auto& layerName = layerInfo.first;
+        const auto& xmlLayer = layerInfo.second;
+
+        if (auto xmlHw = xmlLayer.child("hw")) {
+            if (auto xmlEnable = xmlHw.child("enable")) {
+                if (!parseVal<bool>(xmlEnable.child_value()))
+                    _hwBlackList.insert(layerName);
+            }
+        }
+    }
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/parsed_config.cpp b/inference-engine/src/vpu/graph_transformer/src/parsed_config.cpp
new file mode 100644 (file)
index 0000000..4509d85
--- /dev/null
@@ -0,0 +1,296 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/parsed_config.hpp>
+
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+#include <sstream>
+#include <string>
+#include <memory>
+#include <map>
+
+#include <cpp_interfaces/exception2status.hpp>
+#include <details/caseless.hpp>
+#include <ie_plugin_config.hpp>
+
+namespace vpu {
+
+namespace {
+
+template<typename I, typename T, typename C>
+void check_input(const I &input, const T &options, const C &check) {
+    for (auto &&option : options) {
+        auto input_entry = input.find(option.first);
+        if (input_entry == input.end()) {
+            continue;
+        }
+
+        auto input_key = input_entry->first;
+        auto input_val = input_entry->second;
+        auto values = option.second;
+
+        if (!check(values, input_val)) {
+            THROW_IE_EXCEPTION << "Incorrect value " << "\"" << input_val << "\"" << " for key " << input_key;
+        }
+    }
+}
+
+}  // namespace
+
+ParsedConfig::ParsedConfig(ConfigMode configMode): _mode(configMode) {
+    _log = std::make_shared<Logger>("Config", LogLevel::Warning, consoleOutput());
+}
+
+void ParsedConfig::checkSupportedValues(
+    const std::unordered_map<std::string, std::unordered_set<std::string>> &supported,
+    const std::map<std::string, std::string> &config) const {
+
+    auto contains = [](const std::unordered_set<std::string> &supported, const std::string &option) {
+        return supported.find(option) != supported.end();
+    };
+
+    check_input(config, supported, contains);
+}
+
+void ParsedConfig::checkInvalidValues(const std::map<std::string, std::string> &config) const {
+    const std::unordered_map<std::string, std::unordered_set<std::string>> supported_values = {
+        { CONFIG_KEY(LOG_LEVEL),
+          { CONFIG_VALUE(LOG_NONE), CONFIG_VALUE(LOG_WARNING), CONFIG_VALUE(LOG_INFO), CONFIG_VALUE(LOG_DEBUG) }},
+        { VPU_CONFIG_KEY(LOG_LEVEL),
+          { CONFIG_VALUE(LOG_NONE), CONFIG_VALUE(LOG_WARNING), CONFIG_VALUE(LOG_INFO), CONFIG_VALUE(LOG_DEBUG) }},
+        { VPU_CONFIG_KEY(COMPUTE_LAYOUT),
+            { VPU_CONFIG_VALUE(AUTO), VPU_CONFIG_VALUE(NCHW), VPU_CONFIG_VALUE(NHWC) }},
+        { VPU_CONFIG_KEY(COPY_OPTIMIZATION),      { CONFIG_VALUE(YES), CONFIG_VALUE(NO) }},
+        { VPU_CONFIG_KEY(PACK_DATA_IN_CMX),      { CONFIG_VALUE(YES), CONFIG_VALUE(NO) }},
+        { VPU_CONFIG_KEY(IGNORE_UNKNOWN_LAYERS),  { CONFIG_VALUE(YES), CONFIG_VALUE(NO) }},
+        { CONFIG_KEY(PERF_COUNT),                 { CONFIG_VALUE(YES), CONFIG_VALUE(NO) }},
+        { CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS),   { CONFIG_VALUE(YES), CONFIG_VALUE(NO) }},
+        { VPU_CONFIG_KEY(HW_STAGES_OPTIMIZATION), { CONFIG_VALUE(YES), CONFIG_VALUE(NO) }},
+        { VPU_CONFIG_KEY(FORCE_RESET),            { CONFIG_VALUE(YES), CONFIG_VALUE(NO) }},
+        { VPU_CONFIG_KEY(HW_ADAPTIVE_MODE),       { CONFIG_VALUE(YES), CONFIG_VALUE(NO) }},
+        { VPU_CONFIG_KEY(ALLOW_FP32_MODELS),      { CONFIG_VALUE(YES), CONFIG_VALUE(NO) }},
+        { VPU_CONFIG_KEY(HW_INJECT_STAGES),       { CONFIG_VALUE(YES), CONFIG_VALUE(NO) }},
+        { VPU_CONFIG_KEY(HW_POOL_CONV_MERGE),     { CONFIG_VALUE(YES), CONFIG_VALUE(NO) }},
+        { VPU_CONFIG_KEY(PERF_REPORT_MODE),
+            { VPU_CONFIG_VALUE(PER_LAYER), VPU_CONFIG_VALUE(PER_STAGE) }},
+        { VPU_CONFIG_KEY(IGNORE_IR_STATISTIC),    { CONFIG_VALUE(YES), CONFIG_VALUE(NO) }},
+    };
+
+    checkSupportedValues(supported_values, config);
+
+    auto config_norm = config.find(VPU_CONFIG_KEY(INPUT_NORM));
+    if (config_norm != config.end()) {
+        float inputNormValue = 0.0f;
+        try {
+            inputNormValue = std::stof(config_norm->second);
+        } catch (...) {
+            THROW_IE_EXCEPTION << "Invalid config value for VPU_INPUT_NORM, cannot be cast to float";
+        }
+        std::map<std::string, float> configFloat = {{VPU_CONFIG_KEY(INPUT_NORM), inputNormValue}};
+
+        const std::unordered_map<std::string, std::unordered_set<float>> unsupported_values = {
+            { VPU_CONFIG_KEY(INPUT_NORM), { 0.0f } }
+        };
+
+        auto doesNotContain = [](const std::unordered_set<float> &unsupported, float option) {
+            return unsupported.find(option) == unsupported.end();
+        };
+        check_input(configFloat, unsupported_values, doesNotContain);
+    }
+
+    auto number_of_shaves = config.find(VPU_CONFIG_KEY(NUMBER_OF_SHAVES));
+    auto number_of_CMX = config.find(VPU_CONFIG_KEY(NUMBER_OF_CMX_SLICES));
+
+    if (number_of_shaves != config.end()) {
+        try {
+            std::stoi(number_of_shaves->second);
+        }
+        catch(...) {
+            THROW_IE_EXCEPTION << "Invalid config value for VPU_NUMBER_OF_SHAVES, can't cast to unsigned int";
+        }
+    }
+
+    if (number_of_CMX != config.end()) {
+        try {
+            std::stoi(number_of_CMX->second);
+        }
+        catch(...) {
+            THROW_IE_EXCEPTION << "Invalid config value for VPU_NUMBER_OF_CMX_SLICES, can't cast to unsigned int";
+        }
+    }
+
+    if ((number_of_shaves != config.end()) != (number_of_CMX != config.end())) {
+        THROW_IE_EXCEPTION << "You should set both options for resource management: VPU_NUMBER_OF_CMX_SLICES and VPU_NUMBER_OF_SHAVES";
+    }
+}
+
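+// A minimal sketch (assumed usage, not from the original sources) of a config
+// map that passes the checks above:
+//
+//     std::map<std::string, std::string> config = {
+//         { VPU_CONFIG_KEY(NUMBER_OF_SHAVES),     "8" },  // the two resource keys
+//         { VPU_CONFIG_KEY(NUMBER_OF_CMX_SLICES), "8" },  // must be set together
+//         { CONFIG_KEY(PERF_COUNT),               CONFIG_VALUE(YES) },
+//     };
+//     parsedConfig.checkInvalidValues(config);  // throws on unsupported values
+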
+void ParsedConfig::checkUnknownOptions(const std::map<std::string, std::string> &config) const {
+    auto knownOptions = getKnownOptions();
+    for (auto &&entry : config) {
+        if (knownOptions.find(entry.first) == knownOptions.end()) {
+            THROW_IE_EXCEPTION << NOT_FOUND_str << entry.first << " key is not supported for VPU";
+        }
+    }
+}
+
+void ParsedConfig::checkOptionsAccordingToMode(const std::map<std::string, std::string> &config) const {
+    auto compileOptions = getCompileOptions();
+    for (auto &&entry : config) {
+        if (compileOptions.find(entry.first) != compileOptions.end() && _mode == ConfigMode::RUNTIME_MODE) {
+            _log->warning("%s option will be ignored. It seems you are using a compiled graph", entry.first);
+        }
+    }
+}
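+
+// For instance (illustrative): passing VPU_CONFIG_KEY(COMPUTE_LAYOUT) to a
+// ParsedConfig created with ConfigMode::RUNTIME_MODE only triggers the warning
+// above, since the layout decision was already baked into the compiled graph.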
+
+std::unordered_set<std::string> ParsedConfig::getCompileOptions() const {
+    return {
+        VPU_CONFIG_KEY(COMPUTE_LAYOUT),
+        VPU_CONFIG_KEY(NETWORK_CONFIG),
+        VPU_CONFIG_KEY(HW_ADAPTIVE_MODE),
+        VPU_CONFIG_KEY(ALLOW_FP32_MODELS),
+        VPU_CONFIG_KEY(COPY_OPTIMIZATION),
+        VPU_CONFIG_KEY(PACK_DATA_IN_CMX),
+        VPU_CONFIG_KEY(DETECT_NETWORK_BATCH),
+        VPU_CONFIG_KEY(INPUT_NORM),
+        VPU_CONFIG_KEY(INPUT_BIAS),
+        VPU_CONFIG_KEY(IGNORE_UNKNOWN_LAYERS),
+        VPU_CONFIG_KEY(NONE_LAYERS),
+        VPU_CONFIG_KEY(HW_STAGES_OPTIMIZATION),
+        VPU_CONFIG_KEY(HW_WHITE_LIST),
+        VPU_CONFIG_KEY(HW_BLACK_LIST),
+        VPU_CONFIG_KEY(CUSTOM_LAYERS),
+        VPU_CONFIG_KEY(NUMBER_OF_SHAVES),
+        VPU_CONFIG_KEY(NUMBER_OF_CMX_SLICES),
+        VPU_CONFIG_KEY(HW_INJECT_STAGES),
+        VPU_CONFIG_KEY(HW_POOL_CONV_MERGE),
+        VPU_CONFIG_KEY(IGNORE_IR_STATISTIC),
+    };
+}
+
+std::unordered_set<std::string> ParsedConfig::getRuntimeOptions() const {
+    return {
+        CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS),
+        CONFIG_KEY(LOG_LEVEL),
+        VPU_CONFIG_KEY(LOG_LEVEL),
+        CONFIG_KEY(PERF_COUNT),
+        VPU_CONFIG_KEY(PRINT_RECEIVE_TENSOR_TIME),
+        CONFIG_KEY(CONFIG_FILE),
+        VPU_CONFIG_KEY(FORCE_RESET),
+        VPU_CONFIG_KEY(PERF_REPORT_MODE),
+    };
+}
+
+std::unordered_set<std::string> ParsedConfig::getKnownOptions() const {
+    std::unordered_set<std::string> knownOptions;
+    auto compileOptions = getCompileOptions();
+    knownOptions.insert(compileOptions.begin(), compileOptions.end());
+
+    auto runtimeOptions = getRuntimeOptions();
+    knownOptions.insert(runtimeOptions.begin(), runtimeOptions.end());
+
+    return knownOptions;
+}
+
+std::map<std::string, std::string> ParsedConfig::getDefaultConfig() const {
+    return {{VPU_CONFIG_KEY(COMPUTE_LAYOUT),            VPU_CONFIG_VALUE(AUTO)},
+            {VPU_CONFIG_KEY(HW_ADAPTIVE_MODE),          CONFIG_VALUE(YES)},
+            {VPU_CONFIG_KEY(ALLOW_FP32_MODELS),         CONFIG_VALUE(NO)},
+            {CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS),      CONFIG_VALUE(NO)},
+     //     {VPU_CONFIG_KEY(COPY_OPTIMIZATION),         CONFIG_VALUE(YES)},
+     //     {VPU_CONFIG_KEY(PACK_DATA_IN_CMX),         CONFIG_VALUE(YES)},
+            {CONFIG_KEY(LOG_LEVEL),                     CONFIG_VALUE(LOG_NONE)},
+            {VPU_CONFIG_KEY(LOG_LEVEL),                 CONFIG_VALUE(LOG_NONE)},
+            {VPU_CONFIG_KEY(DETECT_NETWORK_BATCH),      CONFIG_VALUE(YES)},
+     //     {VPU_CONFIG_KEY(HW_INJECT_STAGES),          CONFIG_VALUE(YES)},
+            {VPU_CONFIG_KEY(HW_POOL_CONV_MERGE),        CONFIG_VALUE(YES)},
+            // the Myriad plugin ignores this key and always measures performance;
+            // it is kept only to pass the behavior tests
+            {CONFIG_KEY(PERF_COUNT),                    CONFIG_VALUE(NO)},
+            {VPU_CONFIG_KEY(INPUT_NORM),                "1.0"},
+            {VPU_CONFIG_KEY(INPUT_BIAS),                "0.0"},
+            {VPU_CONFIG_KEY(IGNORE_UNKNOWN_LAYERS),     CONFIG_VALUE(NO)},
+            {VPU_CONFIG_KEY(HW_STAGES_OPTIMIZATION),    CONFIG_VALUE(YES)},
+            {VPU_CONFIG_KEY(PRINT_RECEIVE_TENSOR_TIME), CONFIG_VALUE(NO)},
+            // either VPU_CUSTOM_LAYERS or CONFIG_FILE may supply the custom layers XML
+            // TODO: this option is useless in the case of the HDDL plugin
+            {VPU_CONFIG_KEY(FORCE_RESET),               CONFIG_VALUE(YES)},
+            // TODO: switch to PER_LAYER?
+            {VPU_CONFIG_KEY(PERF_REPORT_MODE),          VPU_CONFIG_VALUE(PER_STAGE)},
+            {VPU_CONFIG_KEY(IGNORE_IR_STATISTIC),       CONFIG_VALUE(NO)},
+    };
+}
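+
+// How these defaults are consumed is outside this file; a typical merge
+// (an assumption, shown only for illustration) lets user-supplied keys win:
+//
+//     auto merged = parsedConfig.getDefaultConfig();
+//     for (const auto& entry : userConfig) {
+//         merged[entry.first] = entry.second;  // user value overrides default
+//     }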
+
+void ParsedConfig::configure(const std::map<std::string, std::string> &config) {
+    static const std::unordered_map<std::string, ComputeLayout> layouts {
+        { VPU_CONFIG_VALUE(AUTO), ComputeLayout::AUTO },
+        { VPU_CONFIG_VALUE(NCHW), ComputeLayout::NCHW },
+        { VPU_CONFIG_VALUE(NHWC), ComputeLayout::NHWC },
+    };
+
+    setOption(compileConfig.forceLayout, layouts, config, VPU_CONFIG_KEY(COMPUTE_LAYOUT));
+
+    static const std::unordered_map<std::string, bool> switches = {
+        { CONFIG_VALUE(YES), true },
+        { CONFIG_VALUE(NO), false }
+    };
+
+    setOption(compileConfig.detectBatch,         switches, config, VPU_CONFIG_KEY(DETECT_NETWORK_BATCH));
+    setOption(compileConfig.copyOptimization,    switches, config, VPU_CONFIG_KEY(COPY_OPTIMIZATION));
+    setOption(compileConfig.packDataInCmx,       switches, config, VPU_CONFIG_KEY(PACK_DATA_IN_CMX));
+    setOption(compileConfig.ignoreUnknownLayers, switches, config, VPU_CONFIG_KEY(IGNORE_UNKNOWN_LAYERS));
+    setOption(compileConfig.hwOptimization,      switches, config, VPU_CONFIG_KEY(HW_STAGES_OPTIMIZATION));
+    setOption(compileConfig.hwAdaptiveMode,      switches, config, VPU_CONFIG_KEY(HW_ADAPTIVE_MODE));
+    setOption(compileConfig.allowFP32Models,     switches, config, VPU_CONFIG_KEY(ALLOW_FP32_MODELS));
+    setOption(compileConfig.injectSwOps,         switches, config, VPU_CONFIG_KEY(HW_INJECT_STAGES));
+    setOption(compileConfig.mergeHwPoolToConv,   switches, config, VPU_CONFIG_KEY(HW_POOL_CONV_MERGE));
+    setOption(compileConfig.ignoreIRStatistic,   switches, config, VPU_CONFIG_KEY(IGNORE_IR_STATISTIC));
+
+    setOption(compileConfig.noneLayers,    config, VPU_CONFIG_KEY(NONE_LAYERS));
+    setOption(compileConfig.hwWhiteList,   config, VPU_CONFIG_KEY(HW_WHITE_LIST));
+    setOption(compileConfig.hwBlackList,   config, VPU_CONFIG_KEY(HW_BLACK_LIST));
+    setOption(compileConfig.networkConfig, config, VPU_CONFIG_KEY(NETWORK_CONFIG));
+
+    /* the VPU-specific CUSTOM_LAYERS key takes priority over the generic plug-in CONFIG_FILE key */
+    setOption(compileConfig.customLayers, config, VPU_CONFIG_KEY(CUSTOM_LAYERS));
+    if (compileConfig.customLayers.empty()) {
+        setOption(compileConfig.customLayers, config, CONFIG_KEY(CONFIG_FILE));
+    }
+
+    setOption(compileConfig.inputScale, config, VPU_CONFIG_KEY(INPUT_NORM),
+              [](const std::string &src) { return 1.f / std::stof(src); });
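+    // Note (illustrative): INPUT_NORM is stored inverted -- e.g. "255.0" yields
+    // inputScale = 1/255, the factor actually multiplied into the input data.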
+
+    setOption(compileConfig.inputBias, config, VPU_CONFIG_KEY(INPUT_BIAS),
+              [](const std::string &src) { return std::stof(src); });
+
+    setOption(compileConfig.numSHAVEs, config, VPU_CONFIG_KEY(NUMBER_OF_SHAVES),
+              [](const std::string &src) { return std::stoi(src); });
+
+    setOption(compileConfig.numCMXSlices, config, VPU_CONFIG_KEY(NUMBER_OF_CMX_SLICES),
+              [](const std::string &src) { return std::stoi(src); });
+
+    setOption(exclusiveAsyncRequests, switches, config, CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS));
+    setOption(printReceiveTensorTime, switches, config, VPU_CONFIG_KEY(PRINT_RECEIVE_TENSOR_TIME));
+    setOption(perfCount,              switches, config, CONFIG_KEY(PERF_COUNT));
+    setOption(forceReset,             switches, config, VPU_CONFIG_KEY(FORCE_RESET));
+
+    static const std::unordered_map<std::string, LogLevel> logLevels = {
+        { CONFIG_VALUE(LOG_NONE), LogLevel::None },
+        { CONFIG_VALUE(LOG_WARNING), LogLevel::Warning },
+        { CONFIG_VALUE(LOG_INFO), LogLevel::Info },
+        { CONFIG_VALUE(LOG_DEBUG), LogLevel::Debug }
+    };
+
+    setOption(logLevel,    logLevels, config, CONFIG_KEY(LOG_LEVEL));
+    setOption(vpuLogLevel, logLevels, config, VPU_CONFIG_KEY(LOG_LEVEL));
+
+    static const std::unordered_map<std::string, PerfReport> perfReports {
+        { VPU_CONFIG_VALUE(PER_LAYER), PerfReport::PerLayer },
+        { VPU_CONFIG_VALUE(PER_STAGE), PerfReport::PerStage },
+    };
+
+    setOption(perfReport, perfReports, config, VPU_CONFIG_KEY(PERF_REPORT_MODE));
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/pass_manager.cpp b/inference-engine/src/vpu/graph_transformer/src/pass_manager.cpp
new file mode 100644 (file)
index 0000000..1df00d8
--- /dev/null
@@ -0,0 +1,263 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <chrono>
+#include <sstream>
+#include <iomanip>
+#include <memory>
+#include <string>
+
+#include <vpu/compile_env.hpp>
+
+namespace vpu {
+
+//
+// PerStagePass
+//
+
+void PerStagePass::run(const Model::Ptr& model) {
+    for (const auto& stage : model->getStages()) {
+        if (_types.count(stage->type()) == 0) {
+            continue;
+        }
+
+        runForStage(model, stage);
+    }
+}
+
+//
+// PassSet
+//
+
+void PassSet::run(const Model::Ptr& model) const {
+    using MilliSecondsFP64 = std::chrono::duration<double, std::milli>;
+
+    const auto& env = CompileEnv::get();
+
+    env.log->debug("Run passes");
+    VPU_LOGGER_SECTION(env.log);
+
+    for (const auto& pass : _passes) {
+        auto pass_ind = &pass - &_passes.front();
+        auto pass_start_time = std::chrono::high_resolution_clock::now();
+
+        model->cleanUpDatas();
+        pass->run(model);
+
+        auto pass_end_time = std::chrono::high_resolution_clock::now();
+
+        env.log->debug(
+            "[PASS %m%d / %d] duration : %f ms",
+            std::setw(2), pass_ind + 1, _passes.size(),
+            std::chrono::duration_cast<MilliSecondsFP64>(pass_end_time - pass_start_time).count());
+    }
+
+    model->cleanUpDatas();
+}
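+
+// With LOG_LEVEL set to LOG_DEBUG the loop above emits one timing line per
+// pass, of the form (illustrative): "[PASS  7 / 42] duration : 1.234567 ms".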
+
+//
+// PassManager
+//
+
+PassSet::Ptr PassManager::buildMiddleEnd() {
+    const auto& env = CompileEnv::get();
+
+    auto passes = std::make_shared<PassSet>();
+
+    //
+    // Initial state
+    //
+
+    _dumpInd = 0;
+    passes->addPass(dumpModel("initial"));
+
+    //
+    // To overcome fp16 limitations
+    //
+
+    if (env.config.hwOptimization) {
+        if (env.config.hwAdaptiveMode) {
+            passes->addPass(analyzeWeightableLayers());
+        } else {
+            if (!env.netConfig.hasManualDataScale()) {
+                passes->addPass(estimateSingleNetworkScale());
+            }
+
+            passes->addPass(propagateDataScale());
+        }
+
+        passes->addPass(dumpModel("dataScaling"));
+    }
+
+    passes->addPass(findSubGraphs());
+    passes->addPass(dumpModel("findSubGraphs"));
+
+    //
+    // Model common adaptation
+    //
+
+    passes->addPass(splitGroupedConv());
+    passes->addPass(dumpModel("splitGroupedConv"));
+
+    //
+    // Model HW-specific optimizations
+    //
+
+    if (env.config.hwOptimization) {
+        passes->addPass(replaceFCbyConv());
+        passes->addPass(dumpModel("replaceFCbyConv"));
+
+        passes->addPass(replaceDeconvByConv());
+        passes->addPass(dumpModel("replaceDeconvByConv"));
+
+        passes->addPass(swapConcatAndHwOps());
+        passes->addPass(dumpModel("swapConcatAndHwOps"));
+
+        passes->addPass(mergeHwStages());
+        passes->addPass(dumpModel("mergeHwStages"));
+
+        passes->addPass(splitHwDepthConv());
+        passes->addPass(dumpModel("splitHwDepthConv"));
+
+        passes->addPass(splitHwConvAndPool());
+        passes->addPass(dumpModel("splitHwConvAndPool"));
+    }
+
+    passes->addPass(hwPadding());
+    passes->addPass(dumpModel("hwPadding"));
+
+    //
+    // Batch support
+    //
+
+    passes->addPass(adjustDataBatch());
+    passes->addPass(dumpModel("adjustDataBatch"));
+
+    //
+    // HW stages tiling
+    //
+
+    if (env.config.hwOptimization) {
+        passes->addPass(hwConvTiling());
+        passes->addPass(hwPoolTiling());
+        passes->addPass(hwFullyConnectedTiling());
+        passes->addPass(dumpModel("hwTiling"));
+    }
+
+    //
+    // Model SW-specific adaptation
+    //
+
+    passes->addPass(swConvAdaptation());
+    passes->addPass(swDeconvAdaptation());
+    passes->addPass(swPoolAdaptation());
+    passes->addPass(swFullyConnectedAdaptation());
+    passes->addPass(dumpModel("swAdaptation"));
+
+    //
+    // Model SW-specific optimizations
+    //
+
+    passes->addPass(mergeReLUAndBias());
+    passes->addPass(dumpModel("mergeReLUAndBias"));
+
+    //
+    // Data layout adjustment
+    //
+
+    passes->addPass(adjustDataLayout());
+    passes->addPass(dumpModel("adjustDataLayout"));
+
+    //
+    // Model special stages processing
+    //
+
+    passes->addPass(processSpecialStages());
+    passes->addPass(dumpModel("processSpecialStages"));
+
+    //
+    // Data location adjustment
+    //
+
+    passes->addPass(adjustDataLocation());
+    passes->addPass(dumpModel("adjustDataLocation"));
+
+    //
+    // Model common optimizations
+    //
+
+    if (env.config.copyOptimization.getOrDefault(true)) {
+        passes->addPass(eliminateCopyStages());
+        passes->addPass(dumpModel("eliminateCopyStages"));
+    }
+
+    //
+    // HW/SW injection
+    //
+
+    if (env.config.hwOptimization && env.config.injectSwOps.getOrDefault(true)) {
+        passes->addPass(injectSw());
+        passes->addPass(dumpModel("injectSw"));
+    }
+
+    //
+    // Final resource allocation
+    //
+
+    passes->addPass(allocateResources());
+    passes->addPass(dumpModel("allocateResources"));
+
+    //
+    // HW stages finalization
+    //
+
+    if (env.config.hwOptimization) {
+        passes->addPass(finalizeHwOps());
+        passes->addPass(dumpModel("hwFinalization"));
+    }
+
+    //
+    // Final check
+    //
+
+    passes->addPass(finalCheck());
+
+    return passes;
+}
+
+//
+// DumpPass
+//
+
+namespace {
+
+class DumpPass final : public Pass {
+public:
+    DumpPass(const std::string& postfix,
+             const BackEnd::Ptr& backEnd) :
+            _postfix(postfix), _backEnd(backEnd) {
+    }
+
+    void run(const Model::Ptr& model) override {
+        _backEnd->dumpModel(model, _postfix);
+    }
+
+private:
+    std::string _postfix;
+    BackEnd::Ptr _backEnd;
+};
+
+}  // namespace
+
+Pass::Ptr PassManager::dumpModel(const std::string& postfix) {
+    std::ostringstream ostr;
+    ostr << std::setw(2) << std::setfill('0') << _dumpInd << "-" << postfix;
+
+    ++_dumpInd;
+
+    return std::make_shared<DumpPass>(ostr.str(), _backEnd);
+}
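+
+// Example (illustrative): the first dump produced by buildMiddleEnd() is named
+// "00-initial", followed by "01-dataScaling" when HW optimization is enabled;
+// the zero-padded _dumpInd keeps dump files sorted in execution order.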
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/adjust_data_batch.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/adjust_data_batch.cpp
new file mode 100644 (file)
index 0000000..10b6f62
--- /dev/null
@@ -0,0 +1,241 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <tuple>
+#include <vector>
+#include <algorithm>
+#include <limits>
+#include <string>
+#include <utility>
+#include <cmath>
+#include <list>
+#include <set>
+#include <unordered_map>
+#include <memory>
+
+namespace vpu {
+
+namespace {
+
+using ReplicatedDataMap = std::unordered_map<int, Data>;
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(adjustDataBatch);
+
+    for (const auto& stage : model->getStages()) {
+        //
+        // Get stage information
+        //
+
+        auto stageInfo = stage->getBatchSupportInfo();
+
+        if (stageInfo.empty()) {
+            continue;
+        }
+
+        //
+        // Get batch size
+        //
+
+        int batchSize = -1;
+
+        for (const auto& input : stage->inputs()) {
+            auto it = stageInfo.find(input);
+            if (it == stageInfo.end()) {
+                continue;
+            }
+
+            auto curReq = it->second;
+
+            if (curReq == BatchSupport::Split) {
+                if (batchSize < 0) {
+                    batchSize = input->desc().dim(Dim::N, 1);
+                } else {
+                    IE_ASSERT(batchSize == input->desc().dim(Dim::N, 1));
+                }
+            }
+        }
+
+        IE_ASSERT(batchSize > 0);
+
+        for (const auto& output : stage->outputs()) {
+            IE_ASSERT(stageInfo.at(output) == BatchSupport::Split);
+            IE_ASSERT(batchSize == output->desc().dim(Dim::N, 1));
+        }
+
+        if (batchSize == 1) {
+            continue;
+        }
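+
+        // Illustrative example: a stage whose datas carry N=4 is replicated
+        // below into 4 single-batch stages; Split stages feed each replica one
+        // batch slice and Concat stages reassemble the per-batch outputs.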
+
+        //
+        // Create tiles and replicate input
+        //
+
+        DataMap<DataVector> inputTiles;
+        DataMap<DataVector> outputTiles;
+
+        for (const auto& input : stage->inputs()) {
+            auto it = stageInfo.find(input);
+            if (it == stageInfo.end()) {
+                continue;
+            }
+
+            auto curReq = it->second;
+            if (curReq == BatchSupport::Split) {
+                auto newDesc = input->desc();
+                newDesc.setDim(Dim::N, 1);
+
+                inputTiles[input].reserve(batchSize);
+                for (int batchInd = 0; batchInd < batchSize; ++batchInd) {
+                    auto postfix = formatString("@batch=%d/%d", batchInd + 1, batchSize);
+
+                    auto inputTile = model->duplicateData(
+                        input,
+                        postfix,
+                        newDesc);
+
+                    inputTiles[input].emplace_back(std::move(inputTile));
+                }
+            } else if (curReq == BatchSupport::ReplicateConstContent) {
+                IE_ASSERT(input->usage() == DataUsage::Const);
+
+                auto& replicatedDatas = input->attrs().getOrSet<ReplicatedDataMap>("replicatedDatas", ReplicatedDataMap());
+                if (replicatedDatas.count(batchSize) == 0) {
+                    auto content = input->content();
+                    IE_ASSERT(content != nullptr);
+
+                    auto perm = input->desc().dimsOrder().toPermutation();
+                    auto dims = input->desc().dims();
+
+                    int maxDimDigit = -1;
+                    for (auto d : perm) {
+                        maxDimDigit = std::max(maxDimDigit, static_cast<int>(d));
+                    }
+                    IE_ASSERT(maxDimDigit >= 0);
+
+                    perm.emplace_back(static_cast<Dim>(maxDimDigit + 1));
+                    dims.set(perm.back(), batchSize);
+
+                    DataDesc newDesc(input->desc().type(), DimsOrder::fromPermutation(perm), dims);
+
+                    replicatedDatas[batchSize] = model->duplicateData(
+                        input,
+                        formatString("@replicated=%d", batchSize),
+                        newDesc,
+                        replicateContent(content, batchSize));
+                }
+            }
+        }
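+
+        // Descriptive note: replicated constant blobs are cached per batch size
+        // in the "replicatedDatas" attribute, so stages sharing a constant reuse
+        // one replica instead of duplicating its content again.
+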
+        for (const auto& output : stage->outputs()) {
+            auto newDesc = output->desc();
+            newDesc.setDim(Dim::N, 1);
+
+            outputTiles[output].reserve(batchSize);
+            for (int batchInd = 0; batchInd < batchSize; ++batchInd) {
+                auto postfix = formatString("@batch=%d/%d", batchInd + 1, batchSize);
+
+                auto outputTile = model->duplicateData(
+                    output,
+                    postfix,
+                    newDesc);
+
+                outputTiles[output].emplace_back(std::move(outputTile));
+            }
+        }
+
+        //
+        // Replicate stage
+        //
+
+        for (int batchInd = 0; batchInd < batchSize; ++batchInd) {
+            auto postfix = formatString("@batch=%d/%d", batchInd + 1, batchSize);
+
+            DataVector newInputs;
+            for (const auto& inEdge : stage->inputEdges()) {
+                if (stageInfo.count(inEdge->input()) == 0) {
+                    newInputs.emplace_back(inEdge->input());
+                    continue;
+                }
+
+                auto curReq = stageInfo[inEdge->input()];
+
+                if (curReq == BatchSupport::Split) {
+                    newInputs.emplace_back(inputTiles.at(inEdge->input())[batchInd]);
+                } else if (curReq == BatchSupport::ReplicateConstContent) {
+                    const auto& replicatedDatas = inEdge->input()->attrs().get<ReplicatedDataMap>("replicatedDatas");
+                    newInputs.emplace_back(replicatedDatas.at(batchSize));
+                }
+            }
+
+            DataVector newOutputs;
+            for (const auto& output : stage->outputs()) {
+                newOutputs.emplace_back(outputTiles.at(output)[batchInd]);
+            }
+
+            auto tileStage = model->duplicateStage(
+                stage->name() + postfix,
+                stage,
+                newInputs,
+                newOutputs);
+
+            tileStage->attrs().set<int>("batchInd", batchInd);
+
+            if (stage->type() == StageType::StubConv) {
+                tileStage->attrs().set("origConvOutput", newOutputs[0]->desc());
+            }
+        }
+
+        //
+        // Create split/concat stages
+        //
+
+        model->disconnectStageDatas(stage);
+
+        for (const auto& p : inputTiles) {
+            _stageBuilder->addSplitStage(
+                model,
+                stage->name() + "@split-batch",
+                stage->origLayer(),
+                Dim::N,
+                p.first,
+                p.second);
+        }
+
+        for (const auto& p : outputTiles) {
+            _stageBuilder->addConcatStage(
+                model,
+                stage->name() + "@concat-batch",
+                stage->origLayer(),
+                Dim::N,
+                p.second,
+                p.first);
+        }
+
+        //
+        // Remove original stage
+        //
+
+        model->removeStage(stage);
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::adjustDataBatch() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/adjust_data_layout.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/adjust_data_layout.cpp
new file mode 100644 (file)
index 0000000..9e261bb
--- /dev/null
@@ -0,0 +1,476 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <unordered_set>
+#include <unordered_map>
+#include <list>
+#include <memory>
+#include <string>
+#include <set>
+#include <algorithm>
+
+namespace vpu {
+
+namespace {
+
+class ConvertOrderStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ConvertOrderStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        return DataMap<DimsOrder>();
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        return DataMap<StridesRequirement>();
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override {
+        return StageSHAVEsRequirements::CanBeLimited;
+    }
+
+    void finalCheckImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto inDimsOrder = input->desc().dimsOrder();
+        auto outDimsOrder = output->desc().dimsOrder();
+        IE_ASSERT(inDimsOrder.numDims() == outDimsOrder.numDims());
+        IE_ASSERT(isOrdersCompatible(inDimsOrder, outDimsOrder));
+
+        for (const auto& p : input->desc().dims()) {
+            IE_ASSERT(p.second == output->desc().dim(p.first));
+        }
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto inDimsOrder = input->desc().dimsOrder();
+        auto outDimsOrder = output->desc().dimsOrder();
+
+        IE_ASSERT(inDimsOrder.numDims() == outDimsOrder.numDims());
+        IE_ASSERT(isOrdersCompatible(inDimsOrder, outDimsOrder));
+
+        for (const auto& p : input->desc().dims()) {
+            IE_ASSERT(p.second == output->desc().dim(p.first));
+        }
+
+        auto operm = output->desc().dimsOrder().toPermutation();
+        auto iind = input->desc().dimsOrder().toIndices();
+        IE_ASSERT(operm.size() == iind.size());
+
+        int i = 0;
+        for (; i < input->desc().numDims(); i++) {
+            serializer.append(static_cast<uint32_t>(iind[operm[i]]));
+        }
+        for (; i < MAX_DIMS_32; i++) {
+            serializer.append(static_cast<uint32_t>(-1));
+        }
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeNewBuffer(serializer);
+        output->serializeNewBuffer(serializer);
+    }
+};
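+
+// Descriptive note: ConvertOrderStage is serialized as a Permute operation --
+// the indices written by serializeParamsImpl() map every output dimension to
+// its position in the input order, padded with -1 up to MAX_DIMS_32.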
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    Data addConvertedData(
+            const Model::Ptr& model,
+            const Data& orig,
+            DimsOrder order);
+
+    Data addConvertedData(
+            const Model::Ptr& model,
+            const Data& orig,
+            const StridesRequirement& reqs);
+
+    void convertDataLayout(
+            const Model::Ptr& model,
+            const Stage& baseStage,
+            const std::string& postfix,
+            const Data& input,
+            const Data& output);
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(adjustDataLayout);
+
+    //
+    // Init StridesRequirement for fixed Datas
+    //
+
+    for (const auto& data : model->datas()) {
+        if (data->usage() == DataUsage::Intermediate)
+            continue;
+
+        data->updateRequiredStrides(StridesRequirement::compact());
+    }
+
+    //
+    // Adjust Data DimsOrder.
+    //
+
+    {
+        for (const auto& stage : model->getStages()) {
+            auto curStageInfo = stage->propagateDataOrder();
+
+            //
+            // Check inputs.
+            //
+
+            for (const auto& inEdge : stage->inputEdges()) {
+                auto input = inEdge->input();
+
+                auto orderIt = curStageInfo.find(input);
+                if (orderIt == curStageInfo.end()) {
+                    continue;
+                }
+
+                auto requiredOrder = orderIt->second;
+
+                if (input->desc().dimsOrder() == requiredOrder) {
+                    continue;
+                }
+
+                auto& convertedData = input->attrs().getOrSet<DataVector>("convertedData", DataVector());
+
+                Data newInput;
+
+                for (const auto& data : convertedData) {
+                    if (data->desc().dimsOrder() == requiredOrder) {
+                        newInput = data;
+                        break;
+                    }
+                }
+
+                if (newInput == nullptr) {
+                    newInput = addConvertedData(model, input, requiredOrder);
+                    convertDataLayout(model, stage, formatString("input=%d", inEdge->portInd()), input, newInput);
+                    convertedData.emplace_back(newInput);
+                }
+
+                model->replaceStageInput(inEdge, newInput);
+            }
+
+            //
+            // Check outputs.
+            //
+
+            for (const auto& outEdge : stage->outputEdges()) {
+                auto output = outEdge->output();
+                auto portInd = outEdge->portInd();
+
+                auto requiredOrder = output->desc().dimsOrder();
+
+                auto orderIt = curStageInfo.find(output);
+                if (orderIt != curStageInfo.end()) {
+                    requiredOrder = orderIt->second;
+                } else {
+                    //
+                    // Check consumers.
+                    //
+
+                    for (const auto& consumer : output->consumers()) {
+                        auto consumerInfo = consumer->propagateDataOrder();
+                        auto consumerOrderIt = consumerInfo.find(output);
+                        if (consumerOrderIt != consumerInfo.end()) {
+                            requiredOrder = consumerOrderIt->second;
+                            break;
+                        }
+                    }
+                }
+
+                if (output->desc().dimsOrder() == requiredOrder) {
+                    continue;
+                }
+
+                auto newOutput = addConvertedData(model, output, requiredOrder);
+
+                model->replaceStageOutput(outEdge, newOutput);
+
+                if (output->usage() == DataUsage::Output) {
+                    //
+                    // It is a network output, need to insert convert stage.
+                    //
+
+                    convertDataLayout(model, stage, formatString("output=%d", portInd), newOutput, output);
+                } else {
+                    IE_ASSERT(output->usage() == DataUsage::Intermediate);
+
+                    //
+                    // Just change the order of output, its consumers will convert it if needed.
+                    //
+
+                    for (const auto& consumerEdge : output->consumerEdges()) {
+                        model->replaceStageInput(consumerEdge, newOutput);
+                    }
+                }
+            }
+        }
+    }
+
+    //
+    // Adjust Data strides.
+    //
+
+    {
+        for (const auto& stage : model->getStages()) {
+            auto curStageInfo = stage->getDataStridesRequirements();
+
+            //
+            // Check inputs.
+            //
+
+            for (const auto& inEdge : stage->inputEdges()) {
+                auto input = inEdge->input();
+
+                auto requiredStrides = StridesRequirement();
+
+                auto strideIt = curStageInfo.find(input);
+                if (strideIt != curStageInfo.end()) {
+                    requiredStrides = strideIt->second;
+                }
+
+                if (input->checkStrides(requiredStrides)) {
+                    input->updateRequiredStrides(requiredStrides);
+                    continue;
+                }
+
+                auto& convertedData = input->attrs().getOrSet<DataVector>("convertedData", DataVector());
+
+                Data newInput;
+
+                for (const auto& data : convertedData) {
+                    if (data->desc().dimsOrder() == input->desc().dimsOrder() &&
+                        data->checkStrides(requiredStrides)) {
+                        newInput = data;
+                        break;
+                    }
+                }
+
+                if (newInput == nullptr) {
+                    newInput = addConvertedData(model, input, requiredStrides);
+
+                    _stageBuilder->addCopyStage(
+                        model,
+                        formatString("%s@input=%d@align-strides", stage->name(), inEdge->portInd()),
+                        stage->origLayer(),
+                        input,
+                        newInput);
+
+                    convertedData.emplace_back(newInput);
+                }
+
+                model->replaceStageInput(inEdge, newInput);
+            }
+
+            //
+            // Check outputs.
+            //
+
+            for (const auto& outEdge : stage->outputEdges()) {
+                auto output = outEdge->output();
+                auto portInd = outEdge->portInd();
+
+                auto requiredStrides = StridesRequirement();
+
+                auto strideIt = curStageInfo.find(output);
+
+                if (strideIt != curStageInfo.end()) {
+                    requiredStrides = strideIt->second;
+                }
+
+                //
+                // Check consumers.
+                //
+
+                for (const auto& consumer : output->consumers()) {
+                    auto consumerInfo = consumer->getDataStridesRequirements();
+                    auto consumerStrideIt = consumerInfo.find(output);
+                    if (consumerStrideIt != consumerInfo.end()) {
+                        auto consumerRequiredStrides = consumerStrideIt->second;
+
+                        for (int i = 0; i < output->desc().numDims(); ++i) {
+                            if (requiredStrides.get(i) == DimStride::Any) {
+                                if (consumerRequiredStrides.get(i) != DimStride::Any) {
+                                    requiredStrides.add(i, consumerRequiredStrides.get(i));
+                                }
+                            }
+                        }
+                    }
+                }
+
+                if (output->checkStrides(requiredStrides)) {
+                    output->updateRequiredStrides(requiredStrides);
+                    continue;
+                }
+
+                auto newOutput = addConvertedData(model, output, requiredStrides);
+
+                model->replaceStageOutput(outEdge, newOutput);
+
+                if (output->usage() == DataUsage::Output) {
+                    //
+                    // It is a network output, need to insert convert stage.
+                    //
+
+                    _stageBuilder->addCopyStage(
+                        model,
+                        formatString("%s@input=%d@align-strides", stage->name(), portInd),
+                        stage->origLayer(),
+                        newOutput,
+                        output);
+                } else {
+                    IE_ASSERT(output->usage() == DataUsage::Intermediate);
+
+                    //
+                    // Just change the order of output, its consumers will convert it if needed.
+                    //
+
+                    for (const auto& consumerEdge : output->consumerEdges()) {
+                        model->replaceStageInput(consumerEdge, newOutput);
+                    }
+                }
+            }
+        }
+    }
+
+    //
+    // Final adjustment and check.
+    //
+
+    {
+        for (const auto& stage : model->getStages()) {
+            stage->finalizeDataLayout();
+
+            auto requiredOrder = stage->propagateDataOrder();
+            auto requiredStrides = stage->getDataStridesRequirements();
+
+            for (const auto& input : stage->inputs()) {
+                auto orderIt = requiredOrder.find(input);
+                if (orderIt != requiredOrder.end()) {
+                    auto requiredOrder = orderIt->second;
+                    IE_ASSERT(input->desc().dimsOrder() == requiredOrder);
+                }
+
+                auto strideIt = requiredStrides.find(input);
+                if (strideIt != requiredStrides.end()) {
+                    auto requiredStrides = strideIt->second;
+                    IE_ASSERT(input->checkStrides(requiredStrides));
+                }
+
+                if (input->usage() == DataUsage::Const) {
+                    IE_ASSERT(input->checkStrides(StridesRequirement::compact()));
+                }
+            }
+
+            for (const auto& output : stage->outputs()) {
+                auto orderIt = requiredOrder.find(output);
+                if (orderIt != requiredOrder.end()) {
+                    auto requiredOrder = orderIt->second;
+                    IE_ASSERT(output->desc().dimsOrder() == requiredOrder);
+                }
+
+                auto strideIt = requiredStrides.find(output);
+                if (strideIt != requiredStrides.end()) {
+                    auto requiredStrides = strideIt->second;
+                    IE_ASSERT(output->checkStrides(requiredStrides));
+                }
+            }
+        }
+    }
+}
+
+Data PassImpl::addConvertedData(
+        const Model::Ptr& model,
+        const Data& orig,
+        DimsOrder order) {
+    auto newDesc = orig->desc();
+    newDesc.reorder(order);
+
+    return model->duplicateData(
+        orig,
+        formatString("@order=%s", order),
+        newDesc);
+}
+
+Data PassImpl::addConvertedData(
+        const Model::Ptr& model,
+        const Data& orig,
+        const StridesRequirement& reqs) {
+    auto data = model->duplicateData(
+        orig,
+        "@adjust-strides");
+    data->resetRequiredStrides();
+    data->updateRequiredStrides(reqs);
+
+    return data;
+}
+
+void PassImpl::convertDataLayout(
+        const Model::Ptr& model,
+        const Stage& baseStage,
+        const std::string& postfix,
+        const Data& input,
+        const Data& output) {
+    IE_ASSERT(input->desc().dims() == output->desc().dims());
+
+    model->addNewStage<ConvertOrderStage>(
+        formatString("%s@%s@reorder=%s", baseStage->name(), postfix, output->desc().dimsOrder()),
+        StageType::Permute,
+        baseStage->origLayer(),
+        {input},
+        {output});
+}
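+
+// Example (illustrative): converting the first input of a hypothetical stage
+// "conv1" to NHWC inserts a stage named roughly "conv1@input=0@reorder=NHWC",
+// which makes the generated permutes easy to spot in dumped models.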
+
+}  // namespace
+
+Pass::Ptr PassManager::adjustDataLayout() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/adjust_data_location.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/adjust_data_location.cpp
new file mode 100644 (file)
index 0000000..8f6ed78
--- /dev/null
@@ -0,0 +1,393 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <algorithm>
+#include <queue>
+#include <set>
+#include <memory>
+
+#include <vpu/allocator.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/utils/extra.hpp>
+#include <vpu/hw/utility.hpp>
+
+namespace vpu {
+
+namespace {
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    void copyHwNetOutputs(const Model::Ptr& model);
+    void collectMemReqs(const Model::Ptr& model);
+    void allocNonIntermediateData(const Model::Ptr& model);
+    void adjustModelForMemReqs(const Model::Ptr& model);
+    void copyHwMisalignedInput(const Model::Ptr& model);
+    void packDataInCmx(const Model::Ptr& model);
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(adjustDataLocation);
+
+    const auto& env = CompileEnv::get();
+
+    copyHwNetOutputs(model);
+    collectMemReqs(model);
+    allocNonIntermediateData(model);
+    adjustModelForMemReqs(model);
+    copyHwMisalignedInput(model);
+    if (env.config.packDataInCmx.getOrDefault(true)) {
+        packDataInCmx(model);
+    }
+}
+
+//
+// Add Copy if HW operation writes to network outputs
+//
+
+void PassImpl::copyHwNetOutputs(const Model::Ptr& model) {
+    VPU_PROFILE(copyHwNetOutputs);
+
+    for (const auto& stage : model->getStages()) {
+        if (stage->category() != StageCategory::HW) {
+            continue;
+        }
+
+        auto output = stage->output(0);
+
+        if (output->usage() == DataUsage::Output) {
+            auto newOutput = model->duplicateData(
+                output,
+                "@intermediate");
+
+            model->replaceStageOutput(stage->outputEdge(0), newOutput);
+
+            _stageBuilder->addCopyStage(
+                model,
+                stage->name() + "@flush-output",
+                stage->origLayer(),
+                newOutput,
+                output);
+        }
+    }
+}
+
+//
+// Collect memory requirements for datas
+//
+
+namespace {
+
+inline void setDataMemReqs(const Data& data, MemoryType memType) {
+    loopOverData(data->getTopParentData(), [memType](const Data& subData) {
+        subData->setMemReqs(memType);
+        return DataLoopStatus::NextChild;
+    });
+}
+
+}  // namespace
+
+void PassImpl::collectMemReqs(const Model::Ptr& model) {
+    VPU_PROFILE(collectMemReqs);
+
+    for (const auto& stage : model->getStages()) {
+        if (stage->category() != StageCategory::HW) {
+            continue;
+        }
+
+        auto hwInput = stage->input(0);
+        auto hwOutput = stage->output(0);
+
+        //
+        // Pattern matching
+        //
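+        // Note: the single special case below -- a 3x3, stride-1, padded HW
+        // convolution taking a 13x13x352 input to 1024 output channels -- pins
+        // its input to CMX; every other HW stage pins its output instead.
+        //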
+
+        if (stage->attrs().get<HwOpType>("hwOpType") == HwOpType::CONV &&
+            stage->attrs().get<int>("kernelSizeX") == 3 && stage->attrs().get<int>("kernelSizeY") == 3 &&
+            stage->attrs().get<int>("kernelStride") == 1 &&
+            stage->attrs().get<HwPaddingInfo>("pad").enable &&
+            hwInput->desc().dim(Dim::W) == 13 && hwInput->desc().dim(Dim::H) == 13 && hwInput->desc().dim(Dim::C) == 352 &&
+            hwOutput->desc().dim(Dim::C) == 1024) {
+            setDataMemReqs(hwInput, MemoryType::CMX);
+        } else {
+            setDataMemReqs(hwOutput, MemoryType::CMX);
+        }
+    }
+}
+
+//
+// Allocate Const/Input/Output datas
+//
+
+void PassImpl::allocNonIntermediateData(const Model::Ptr& model) {
+    VPU_PROFILE(allocNonIntermediateData);
+
+    auto& allocator = model->getAllocator();
+
+    auto preprocessRes = allocator.preprocess(model);
+    IE_ASSERT(preprocessRes.status == AllocationStatus::OK);
+}
+
+//
+// Analyse the network from the beginning several times,
+// until we satisfy all requirements or find that they cannot be met
+//
+
+void PassImpl::adjustModelForMemReqs(const Model::Ptr& model) {
+    VPU_PROFILE(adjustModelForMemReqs);
+
+    auto& allocator = model->getAllocator();
+
+    for (;;) {
+        auto allocRes = runAllocator(model);
+        if (allocRes.status == AllocationStatus::OK)
+            break;
+
+        auto failedStage = allocRes.failedStage;
+        IE_ASSERT(failedStage != nullptr);
+
+        auto failedStageInd = failedStage->index();
+        IE_ASSERT(failedStageInd >= 0);
+
+        //
+        // Try to flush Data allocated in CMX
+        //
+
+        auto allCmxDatas = allocator.getAllocatedDatas(MemoryType::CMX);
+
+        if (allCmxDatas.empty()) {
+            if (allocRes.status == AllocationStatus::SHAVES_FAILED) {
+                VPU_THROW_EXCEPTION
+                    << "Can't allocate SHAVEs for stage " << failedStage->name();
+            } else {
+                VPU_THROW_EXCEPTION
+                    << "Can't satisfy data location requirements for stage " << failedStage->name();
+            }
+        }
+
+        StageInputVector cmxConsumerEdges;
+        cmxConsumerEdges.reserve(allCmxDatas.size() * 4);
+
+        for (const auto& cmxData : allCmxDatas) {
+            IE_ASSERT(cmxData->usage() == DataUsage::Intermediate);
+            IE_ASSERT(cmxData->parentDataEdge() == nullptr);
+
+            auto cmxDataProducer = cmxData->producer();
+            IE_ASSERT(cmxDataProducer != nullptr);
+
+            auto cmxDataProducerInd = cmxDataProducer->index();
+            IE_ASSERT(cmxDataProducerInd >= 0);
+            IE_ASSERT(cmxDataProducerInd < failedStageInd);
+
+            IE_ASSERT(cmxData->numConsumers() > 0);
+
+            for (const auto& consumerEdge : cmxData->consumerEdges()) {
+                if (consumerEdge->consumer()->attrs().getOrDefault<bool>("CMX-to-DDR", false)) {
+                    continue;
+                }
+
+                cmxConsumerEdges.emplace_back(consumerEdge);
+            }
+        }
+
+        for (const auto& cmxConsumerEdge : cmxConsumerEdges) {
+            auto cmxData = cmxConsumerEdge->input();
+            auto cmxConsumer = cmxConsumerEdge->consumer();
+
+            auto& ddrCopies = cmxData->attrs().getOrSet<DataVector>("ddrCopies", DataVector());
+            ddrCopies.reserve(1);
+
+            const auto& strideReqsInfo = cmxConsumer->getDataStridesRequirements();
+
+            Data ddrCopy;
+            for (const auto& ddrCandidate : ddrCopies) {
+                if (strideReqsInfo.count(cmxConsumerEdge->input()) != 0) {
+                    const auto& strideReqs = strideReqsInfo.at(cmxConsumerEdge->input());
+
+                    if (!ddrCandidate->checkStrides(strideReqs)) {
+                        continue;
+                    }
+                }
+
+                ddrCopy = ddrCandidate;
+                break;
+            }
+
+            if (ddrCopy == nullptr) {
+                ddrCopy = model->duplicateData(
+                    cmxData,
+                    "@DDR-copy");
+
+                if (strideReqsInfo.count(cmxConsumerEdge->input()) != 0) {
+                    const auto& strideReqs = strideReqsInfo.at(cmxConsumerEdge->input());
+                    ddrCopy->updateRequiredStrides(strideReqs);
+                }
+                ddrCopy->setMemReqs(MemoryType::DDR);
+
+                auto copyStage = _stageBuilder->addCopyStage(
+                    model,
+                    formatString("%s@move-to-DDR", cmxData->name()),
+                    failedStage->origLayer(),
+                    cmxData,
+                    ddrCopy);
+
+                copyStage->attrs().set<bool>("CMX-to-DDR", true);
+
+                ddrCopies.emplace_back(ddrCopy);
+            }
+
+            model->replaceStageInput(cmxConsumerEdge, ddrCopy);
+
+            for (const auto& childDataEdge : cmxData->childDataEdges()) {
+                auto order = childDataEdge->order();
+
+                if (order == SharedDataOrder::ParentWritesToChild &&
+                    childDataEdge->connection() == cmxConsumer) {
+                    auto childData = childDataEdge->child();
+
+                    model->connectDatas()
+                        .parent(ddrCopy)
+                        .child(childData)
+                        .mode(childDataEdge->mode())
+                        .order(order)
+                        .offset(childDataEdge->attrs().getOrDefault<DimValues>("offset", DimValues()))
+                        .done();
+
+                    loopOverData(childData, [](const Data& subData) {
+                        subData->setMemReqs(MemoryType::DDR);
+                        return DataLoopStatus::NextChild;
+                    });
+                }
+            }
+        }
+    }
+}
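+
+// Descriptive note: on every failed attempt the loop above redirects consumers
+// of CMX-resident datas to DDR copies (Copy stages tagged "CMX-to-DDR") and
+// retries allocation, so it ends once allocation succeeds or nothing is left
+// in CMX to spill.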
+
+//
+// Add Copy if HW operation reads misaligned input
+//
+
+void PassImpl::copyHwMisalignedInput(const Model::Ptr& model) {
+    VPU_PROFILE(copyHwMisalignedInput);
+
+    for (const auto& stage : model->getStages()) {
+        if (stage->category() != StageCategory::HW) {
+            continue;
+        }
+
+        auto input = stage->input(0);
+        IE_ASSERT(input->location() != DataLocation::None);
+
+        if (input->memoryOffset() % 16 != 0) {
+            auto newInput = model->duplicateData(
+                input,
+                "@aligned-ptr");
+            newInput->setMemReqs(MemoryType::DDR);
+
+            _stageBuilder->addCopyStage(
+                model,
+                stage->name() + "@align-input-ptr",
+                stage->origLayer(),
+                input,
+                newInput);
+
+            model->replaceStageInput(stage->inputEdge(0), newInput);
+        }
+    }
+}
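+
+// Descriptive note: HW stages require 16-byte-aligned input pointers, so any
+// input whose memory offset breaks that alignment is routed through the DDR
+// copy ("@aligned-ptr") inserted above.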
+
+//
+// Try to put HW inputs to CMX if possible
+//
+
+void PassImpl::packDataInCmx(const Model::Ptr& model) {
+    VPU_PROFILE(packDataInCmx);
+
+    auto& allocator = model->getAllocator();
+
+    //
+    // Collect candidates
+    //
+
+    std::queue<Data> candidatesForCMX;
+
+    auto& visitedDatas = allocator.getCandidatesForCMX();
+    visitedDatas.clear();
+
+    for (const auto& stage : model->getStages()) {
+        if (stage->category() != StageCategory::HW)
+            continue;
+
+        for (const auto& input : stage->inputs()) {
+            auto topParent = input->getTopParentData();
+
+            if (topParent->usage() != DataUsage::Intermediate)
+                continue;
+
+            auto topParentMemType = topParent->memReqs();
+            if (topParentMemType == MemoryType::CMX)
+                continue;
+
+            auto producer = input->producer();
+            IE_ASSERT(producer != nullptr);
+
+            if (producer->type() == StageType::Copy &&
+                producer->attrs().getOrDefault<bool>("CMX-to-DDR", false)) {
+                continue;
+            }
+
+            if (producer->getSHAVEsRequirements() != StageSHAVEsRequirements::NeedMax) {
+                if (visitedDatas.count(topParent) == 0) {
+                    candidatesForCMX.push(topParent);
+                    visitedDatas.insert(topParent);
+                }
+            }
+        }
+    }
+
+    //
+    // Try candidates one by one -> if the allocation cycle is successful, leave the data in CMX
+    //
+
+    while (!candidatesForCMX.empty()) {
+        auto curCandidate = candidatesForCMX.front();
+        candidatesForCMX.pop();
+
+        IE_ASSERT(curCandidate->parentDataEdge() == nullptr);
+        IE_ASSERT(curCandidate->usage() == DataUsage::Intermediate);
+
+        auto curMemoryType = curCandidate->memReqs();
+        IE_ASSERT(curMemoryType == MemoryType::DDR);
+
+        loopOverData(curCandidate, [](const Data& subData) {
+            subData->setMemReqs(MemoryType::CMX);
+            return DataLoopStatus::NextChild;
+        });
+
+        auto allocRes = runAllocator(model, true);
+        if (allocRes.status != AllocationStatus::OK) {
+            loopOverData(curCandidate, [](const Data& subData) {
+                subData->setMemReqs(MemoryType::DDR);
+                return DataLoopStatus::NextChild;
+            });
+        }
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::adjustDataLocation() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/allocate_resources.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/allocate_resources.cpp
new file mode 100644 (file)
index 0000000..af6c5f7
--- /dev/null
@@ -0,0 +1,212 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <unordered_map>
+#include <list>
+#include <unordered_set>
+#include <algorithm>
+#include <limits>
+#include <vector>
+#include <string>
+#include <set>
+#include <queue>
+#include <memory>
+
+#include <vpu/allocator.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/utils/auto_scope.hpp>
+
+namespace vpu {
+
+//
+// runAllocator
+//
+
+AllocationResult runAllocator(const Model::Ptr& model, bool onlyCheckCMX) {
+    VPU_PROFILE(runAllocator);
+
+    auto& allocator = model->getAllocator();
+
+    //
+    // Clear previous allocation.
+    //
+
+    allocator.reset();
+
+    //
+    // Allocate Const/Input/Output datas.
+    //
+
+    if (!onlyCheckCMX) {
+        auto result = allocator.preprocess(model);
+        if (result.status != vpu::AllocationStatus::OK) {
+            return result;
+        }
+    }
+
+    //
+    // Allocate resources per stage.
+    //
+
+    for (const auto& stage : model->getStages()) {
+        //
+        // Release SHAVEs in any case at the end of iteration.
+        //
+
+        stage->setNumSHAVEs(0);
+        AutoScope scope([&allocator]() {
+            allocator.getAllocatorOfShaves().freeSHAVEs();
+        });
+
+        //
+        // Get stage SHAVE requirements.
+        //
+
+        auto reqs = stage->getSHAVEsRequirements();
+
+        //
+        // Allocate SHAVEs for NeedMax before the Data allocation.
+        //
+
+        if (reqs == StageSHAVEsRequirements::NeedMax) {
+            if (!allocator.getAllocatorOfShaves().allocateSHAVEs(stage, reqs)) {
+                allocator.setNeedToAllocNonIntermData();
+
+                AllocationResult res;
+                res.status = AllocationStatus::SHAVES_FAILED;
+                res.failedStage = stage;
+                return res;
+            }
+        }
+
+        //
+        // Allocate stage outputs.
+        //
+
+        for (const auto& output : stage->outputs()) {
+            if (onlyCheckCMX && output->memReqs() != MemoryType::CMX) {
+                continue;
+            }
+
+            if (!allocator.allocateData(output)) {
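+                // Allocation failed: try to evict other CMX candidates and retry once.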
+                if (output->memReqs() == MemoryType::CMX && !onlyCheckCMX) {
+                    if (allocator.removeCMXCandidates(output)) {
+                        if (allocator.allocateData(output)) {
+                            continue;
+                        }
+                    }
+
+                    allocator.setNeedToAllocNonIntermData();
+                }
+
+                AllocationResult res;
+                res.status = AllocationStatus::DATA_FAILED;
+                res.failedStage = stage;
+                return res;
+            }
+        }
+
+        //
+        // Allocate stage temporary buffers.
+        //
+
+        if (!onlyCheckCMX) {
+            for (const auto& tempBufferEdge : stage->tempBufferEdges()) {
+                if (!allocator.allocateData(tempBufferEdge->tempBuffer())) {
+                    allocator.setNeedToAllocNonIntermData();
+
+                    AllocationResult res;
+                    res.status = AllocationStatus::DATA_FAILED;
+                    res.failedStage = stage;
+                    return res;
+                }
+            }
+        }
+
+        //
+        // Allocate limited SHAVEs after the Data allocation.
+        //
+
+        if (reqs != StageSHAVEsRequirements::NeedMax) {
+            if (!allocator.getAllocatorOfShaves().allocateSHAVEs(stage, reqs)) {
+                allocator.setNeedToAllocNonIntermData();
+
+                AllocationResult res;
+                res.status = AllocationStatus::SHAVES_FAILED;
+                res.failedStage = stage;
+                return res;
+            }
+        }
+
+        //
+        // Release stage inputs.
+        //
+
+        for (const auto& input : stage->inputs()) {
+            if (onlyCheckCMX && input->memReqs() != MemoryType::CMX) {
+                continue;
+            }
+
+            allocator.freeData(input);
+        }
+
+        //
+        // Release stage temporary buffers.
+        //
+
+        if (!onlyCheckCMX) {
+            for (const auto& tempBufferEdge : stage->tempBufferEdges()) {
+                allocator.freeData(tempBufferEdge->tempBuffer());
+            }
+        }
+    }
+
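+    // Success: a default-constructed result carries AllocationStatus::OK.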
+    return AllocationResult();
+}
+
+//
+// allocateResources
+//
+
+namespace {
+
+class PassImpl final : public Pass {
+public:
+    void run(const Model::Ptr& model) override;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(allocateResources);
+
+    auto& allocator = model->getAllocator();
+
+    //
+    // Allocate all resources
+    //
+
+    auto allocRes = runAllocator(model);
+    IE_ASSERT(allocRes.status == AllocationStatus::OK);
+
+    //
+    // Allocator self-check
+    //
+
+    allocator.selfCheck();
+
+    //
+    // Allocation statistics
+    //
+
+    model->attrs().set<UsedMemory>("usedMemory", allocator.usedMemory());
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::allocateResources() {
+    return std::make_shared<PassImpl>();
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/eliminate_copy.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/eliminate_copy.cpp
new file mode 100644 (file)
index 0000000..6a97c96
--- /dev/null
@@ -0,0 +1,187 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <unordered_set>
+#include <algorithm>
+#include <memory>
+#include <set>
+#include <utility>
+#include <vector>
+#include <queue>
+
+#include <vpu/allocator.hpp>
+#include <vpu/compile_env.hpp>
+
+namespace vpu {
+
+namespace {
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) :
+            _stageBuilder(stageBuilder) {
+    }
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    static bool isApplicable(const Stage& copyStage);
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+bool PassImpl::isApplicable(const Stage& copyStage) {
+    auto copyInput = copyStage->input(0);
+    auto copyOutput = copyStage->output(0);
+
+    IE_ASSERT(copyInput->usage() == DataUsage::Intermediate);
+    IE_ASSERT(copyOutput->usage() == DataUsage::Intermediate);
+    IE_ASSERT(copyInput->producerEdge() != nullptr);
+    IE_ASSERT(copyInput->desc().dimsOrder() == copyOutput->desc().dimsOrder());
+
+    if (copyInput->parentDataEdge() != nullptr) {
+        return false;
+    }
+    if (copyInput->numChildDatas() > 0) {
+        return false;
+    }
+
+    if (!checkStrides(copyInput->desc(), copyOutput->strides(), copyInput->requiredStrides())) {
+        return false;
+    }
+    if (!checkStrides(copyOutput->desc(), copyInput->strides(), copyOutput->requiredStrides())) {
+        return false;
+    }
+
+    auto copyOutputTopParent = copyOutput->getTopParentData();
+    if (copyOutputTopParent->usage() != DataUsage::Intermediate) {
+        return false;
+    }
+
+    IE_ASSERT(copyOutput->numConsumers() == 1);
+
+    auto specialConsumer = copyOutput->singleConsumer();
+    IE_ASSERT(specialConsumer->category() == StageCategory::Special);
+
+    return true;
+}
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(eliminateCopyStages);
+
+    const int nMaxCopyStages = 23000;
+    const auto& env = CompileEnv::get();
+
+    std::queue<Stage> copyToRemove;
+
+    if (!env.config.copyOptimization.hasValue()) {
+        int nCopyStages = 0;
+        for (const auto& stage : model->getStages()) {
+            if (stage->type() == StageType::Copy) {
+                ++nCopyStages;
+            }
+        }
+
+        // Copy elimination would take more than an hour in that case
+        if (nCopyStages > nMaxCopyStages) {
+            env.log->warning(
+                "Pass [eliminateCopyStages] SKIPPED : number of copy stages (%d) is larger than threshold %d",
+                nCopyStages, nMaxCopyStages);
+            return;
+        }
+    }
+
+    for (const auto& copyStage : model->getStages()) {
+        if (copyStage->type() != StageType::Copy) {
+            continue;
+        }
+
+        auto isOptional = copyStage->attrs().getOrDefault<bool>("optional", false);
+        if (!isOptional) {
+            continue;
+        }
+
+        if (isApplicable(copyStage)) {
+            copyToRemove.push(copyStage);
+        }
+    }
+
+    while (!copyToRemove.empty()) {
+        auto copyStage = copyToRemove.front();
+        copyToRemove.pop();
+
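+        // Earlier removals may have changed the graph, so re-validate the candidate.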
+        if (!isApplicable(copyStage)) {
+            continue;
+        }
+
+        auto copyInput = copyStage->input(0);
+        auto copyOutput = copyStage->output(0);
+
+        auto copyOutputTopParent = copyOutput->getTopParentData();
+
+        auto copyStageName = copyStage->name();
+        auto copyOrigLayer = copyStage->origLayer();
+
+        auto copyProducer = copyInput->producer();
+        auto specialConsumer = copyOutput->singleConsumer();
+
+        //
+        // Try to remove the Copy stage: redirect (copyProducer) to [copyOutput] and request CMX location for it.
+        // Run allocation; if it fails -> revert the changes in the Model.
+        //
+
+        model->removeStage(copyStage);
+
+        auto oldMemoryType = copyOutputTopParent->memReqs();
+#ifndef NDEBUG
+        loopOverData(copyOutputTopParent, [oldMemoryType](const Data& subData) {
+            auto subMemType = subData->memReqs();
+            IE_ASSERT(subMemType == oldMemoryType);
+            return DataLoopStatus::NextChild;
+        });
+#endif
+        if (oldMemoryType != MemoryType::CMX) {
+            loopOverData(copyOutputTopParent, [](const Data& subData) {
+                subData->setMemReqs(MemoryType::CMX);
+                return DataLoopStatus::NextChild;
+            });
+        }
+
+        model->replaceStageOutput(copyProducer->outputEdge(0), copyOutput);
+
+        StageInputVector prevEdges;
+        prevEdges.reserve(copyInput->numConsumers());
+        for (const auto& consumerEdge : copyInput->consumerEdges()) {
+            prevEdges.emplace_back(consumerEdge);
+            model->replaceStageInput(consumerEdge, copyOutput);
+        }
+
+        auto allocRes = runAllocator(model, true);
+        if (allocRes.status != AllocationStatus::OK) {
+            model->replaceStageOutput(copyProducer->outputEdge(0), copyInput);
+
+            for (const auto& p : prevEdges) {
+                model->replaceStageInput(p, copyInput);
+            }
+
+            _stageBuilder->addCopyStage(model, copyStageName, copyOrigLayer, copyInput, copyOutput);
+
+            loopOverData(copyOutputTopParent, [oldMemoryType](const Data& subData) {
+                subData->setMemReqs(oldMemoryType);
+                return DataLoopStatus::NextChild;
+            });
+        }
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::eliminateCopyStages() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/final_check.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/final_check.cpp
new file mode 100644 (file)
index 0000000..ffcf73c
--- /dev/null
@@ -0,0 +1,340 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <memory>
+
+#include <vpu/allocator.hpp>
+#include <vpu/compile_env.hpp>
+
+namespace vpu {
+
+namespace {
+
+class PassImpl final : public Pass {
+public:
+    void run(const Model::Ptr& model) override;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    const auto& env = CompileEnv::get();
+
+    //
+    // Check Data requirements.
+    //
+
+    for (const auto& data : model->datas()) {
+        auto topParent = data->getTopParentData();
+
+        //
+        // Memory type.
+        //
+
+        auto memoryType = topParent->memReqs();
+
+        loopOverData(topParent, [memoryType](const Data& subData) {
+            auto subMemType = subData->memReqs();
+            IE_ASSERT(subMemType == memoryType);
+            return DataLoopStatus::NextChild;
+        });
+
+        if (memoryType == MemoryType::CMX) {
+            IE_ASSERT(topParent->location() == DataLocation::CMX);
+        }
+
+        //
+        // Data <-> Data Edges.
+        //
+
+        if (auto dataEdge = data->parentDataEdge()) {
+            auto parent = dataEdge->parent();
+            auto child = dataEdge->child();
+
+            Data producer, consumer;
+            if (dataEdge->order() == SharedDataOrder::ChildWritesToParent) {
+                producer = child;
+                consumer = parent;
+            } else if (dataEdge->order() == SharedDataOrder::ParentWritesToChild) {
+                producer = parent;
+                consumer = child;
+            } else {
+                VPU_THROW_EXCEPTION << "Invalid data order " << dataEdge->order();
+            }
+
+            //
+            // Child must be Intermediate.
+            //
+
+            IE_ASSERT(child->usage() == DataUsage::Intermediate);
+
+            //
+            // Parent can't be Temp or Fake.
+            //
+
+            IE_ASSERT(parent->usage() != DataUsage::Temp && parent->usage() != DataUsage::Fake);
+
+            //
+            // Consumer must be accessible from the producer.
+            //
+
+            Stage connectionStage;
+
+            for (const auto& consumerEdge : producer->consumerEdges()) {
+                for (const auto& outEdge : consumerEdge->consumer()->outputEdges()) {
+                    if (outEdge->output() == consumer) {
+                        connectionStage = consumerEdge->consumer();
+                        break;
+                    }
+                }
+
+                if (connectionStage != nullptr) {
+                    break;
+                }
+            }
+
+            IE_ASSERT(connectionStage != nullptr);
+
+            //
+            // Connection stage must be special.
+            //
+
+            IE_ASSERT(connectionStage->category() == StageCategory::Special);
+
+            //
+            // Special checks for each mode.
+            //
+
+            if (dataEdge->mode() == SharedDataMode::ROI) {
+                //
+                // Check connection stage type and that parent has the largest buffer.
+                //
+
+                if (connectionStage->type() == StageType::Concat ||
+                    connectionStage->type() == StageType::Expand) {
+                    IE_ASSERT(producer == child);
+                    IE_ASSERT(consumer == parent);
+                } else if (connectionStage->type() == StageType::Split ||
+                           connectionStage->type() == StageType::Shrink) {
+                    IE_ASSERT(producer == parent);
+                    IE_ASSERT(consumer == child);
+                } else {
+                    VPU_THROW_EXCEPTION
+                            << "Stage type " << connectionStage->type()
+                            << " can't be used for ROI data connection";
+                }
+
+                //
+                // Parent and child must have the same order.
+                //
+
+                IE_ASSERT(parent->desc().dimsOrder() == child->desc().dimsOrder());
+
+                //
+                // Offset must be valid.
+                //
+
+                for (const auto& p : dataEdge->attrs().getOrDefault<DimValues>("offset", DimValues())) {
+                    IE_ASSERT(parent->desc().dimsOrder().hasDim(p.first));
+
+                    IE_ASSERT(child->desc().dim(p.first) + p.second <= parent->desc().dim(p.first));
+                }
+
+                //
+                // Check strides requirements
+                //
+
+                IE_ASSERT(checkStrides(child->desc(), parent->strides(), child->requiredStrides()));
+            } else if (dataEdge->mode() == SharedDataMode::Reshape) {
+                //
+                // Check connection stage type.
+                //
+
+                IE_ASSERT(connectionStage->type() == StageType::Reshape);
+
+                //
+                // Parent and child must have the same data type.
+                //
+
+                IE_ASSERT(parent->desc().type() == child->desc().type());
+
+                //
+                // Parent and child must have the same number of elements.
+                //
+
+                IE_ASSERT(parent->desc().totalDimSize() == child->desc().totalDimSize());
+
+                //
+                // Parent and child must be compact.
+                //
+
+                // TODO: can we weaken this restriction?
+                IE_ASSERT(parent->checkStrides(StridesRequirement::compact()));
+                IE_ASSERT(child->checkStrides(StridesRequirement::compact()));
+            } else {
+                VPU_THROW_EXCEPTION << "Invalid shared data mode " << dataEdge->mode();
+            }
+        }
+    }
+
+    //
+    // Check Stages requirements.
+    //
+
+    StageMap<int> stageExecIndMap;
+
+    int stageExecInd = 0;
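+    // Non-special stages must be numbered in a valid execution order:
+    // every predecessor must have a strictly smaller index.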
+    for (const auto& stage : model->getStages()) {
+        //
+        // Check that dependencies were calculated
+        //
+
+        auto curStageInd = stage->index();
+        IE_ASSERT(curStageInd >= 0);
+
+        if (stage->category() != StageCategory::Special) {
+            IE_ASSERT(curStageInd >= stageExecInd);
+            stageExecIndMap[stage] = stageExecInd;
+        }
+
+        for (const auto& prevStage : stage->prevStages()) {
+            auto prevStageInd = prevStage->index();
+            IE_ASSERT(prevStageInd >= 0);
+            IE_ASSERT(prevStageInd < curStageInd);
+
+            if (stage->category() != StageCategory::Special && prevStage->category() != StageCategory::Special) {
+                auto prevStageExecInd = stageExecIndMap.at(prevStage);
+                IE_ASSERT(prevStageExecInd < stageExecInd);
+            }
+        }
+
+        if (stage->category() != StageCategory::Special) {
+            ++stageExecInd;
+        }
+
+        //
+        // Check Data DimsOrder requirements
+        //
+
+        auto stageDataDimsOrderMap = stage->propagateDataOrder();
+
+        auto inputs = stage->inputs();
+        auto outputs = stage->outputs();
+
+        for (const auto& input : inputs) {
+            auto it = stageDataDimsOrderMap.find(input);
+            if (it != stageDataDimsOrderMap.end()) {
+                auto requiredOrder = it->second;
+                IE_ASSERT(input->desc().dimsOrder() == requiredOrder);
+            }
+        }
+        for (const auto& output : outputs) {
+            auto it = stageDataDimsOrderMap.find(output);
+            if (it != stageDataDimsOrderMap.end()) {
+                auto requiredOrder = it->second;
+                IE_ASSERT(output->desc().dimsOrder() == requiredOrder);
+            }
+        }
+
+        //
+        // Check Data Strides requirements
+        //
+
+        auto stageDataStridesMap = stage->getDataStridesRequirements();
+
+        for (const auto& input : inputs) {
+            auto it = stageDataStridesMap.find(input);
+            if (it != stageDataStridesMap.end()) {
+                auto requiredStrides = it->second;
+                IE_ASSERT(input->checkStrides(requiredStrides));
+            }
+        }
+        for (const auto& output : outputs) {
+            auto it = stageDataStridesMap.find(output);
+            if (it != stageDataStridesMap.end()) {
+                auto requiredStrides = it->second;
+                IE_ASSERT(output->checkStrides(requiredStrides));
+            }
+        }
+
+        //
+        // Check Data Batch support
+        //
+
+        auto stageBatchSupport = stage->getBatchSupportInfo();
+
+        for (const auto& input : inputs) {
+            auto it = stageBatchSupport.find(input);
+            if (it != stageBatchSupport.end()) {
+                auto requiredBatch = it->second;
+
+                if (requiredBatch == BatchSupport::Split) {
+                    IE_ASSERT(input->desc().dim(Dim::N, 1) == 1);
+                }
+            }
+        }
+        for (const auto& output : outputs) {
+            auto it = stageBatchSupport.find(output);
+            if (it != stageBatchSupport.end()) {
+                auto requiredBatch = it->second;
+
+                if (requiredBatch == BatchSupport::Split) {
+                    IE_ASSERT(output->desc().dim(Dim::N, 1) == 1);
+                }
+            }
+        }
+
+        //
+        // Check SHAVEs requirements
+        //
+
+        auto stageSHAVEsRequirements = stage->getSHAVEsRequirements();
+
+        if (stageSHAVEsRequirements == StageSHAVEsRequirements::NeedMax) {
+            IE_ASSERT(stage->numSHAVEs() == env.resources.numSHAVEs);
+        } else if (stageSHAVEsRequirements == StageSHAVEsRequirements::CanBeLimited) {
+            IE_ASSERT(stage->numSHAVEs() > 0);
+        } else if (stageSHAVEsRequirements == StageSHAVEsRequirements::TwoOrOne) {
+            IE_ASSERT(stage->numSHAVEs() == 1 || stage->numSHAVEs() == 2);
+        } else if (stageSHAVEsRequirements == StageSHAVEsRequirements::OnlyOne) {
+            IE_ASSERT(stage->numSHAVEs() == 1);
+        } else if (stageSHAVEsRequirements == StageSHAVEsRequirements::NotNeeded) {
+            IE_ASSERT(stage->numSHAVEs() == 0);
+        }
+
+        for (const auto& injectedStageEdge : stage->injectedStageEdges()) {
+            auto childStage = injectedStageEdge->child();
+
+            IE_ASSERT(childStage->numSHAVEs() == stage->numSHAVEs());
+
+            auto injectedReqs = childStage->getSHAVEsRequirements();
+
+            if (injectedReqs == StageSHAVEsRequirements::NeedMax) {
+                IE_ASSERT(childStage->numSHAVEs() == env.resources.numSHAVEs);
+            } else if (injectedReqs == StageSHAVEsRequirements::CanBeLimited) {
+                IE_ASSERT(childStage->numSHAVEs() > 0);
+            } else if (injectedReqs == StageSHAVEsRequirements::TwoOrOne) {
+                IE_ASSERT(childStage->numSHAVEs() == 1 || childStage->numSHAVEs() == 2);
+            } else if (injectedReqs == StageSHAVEsRequirements::OnlyOne) {
+                IE_ASSERT(childStage->numSHAVEs() == 1);
+            } else if (injectedReqs == StageSHAVEsRequirements::NotNeeded) {
+                IE_ASSERT(childStage->numSHAVEs() == 0);
+            }
+        }
+
+        //
+        // Stage specific checks
+        //
+
+        stage->finalCheck();
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::finalCheck() {
+    return std::make_shared<PassImpl>();
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/finalize_hw_ops.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/finalize_hw_ops.cpp
new file mode 100644 (file)
index 0000000..4905bb7
--- /dev/null
@@ -0,0 +1,267 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <algorithm>
+#include <set>
+#include <memory>
+
+#include <precision_utils.h>
+
+#include <vpu/hw/tiling.hpp>
+#include <vpu/hw/utility.hpp>
+
+namespace vpu {
+
+namespace {
+
+class PassImpl final : public Pass {
+public:
+    void run(const Model::Ptr& model) override;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(finalizeHwOps);
+
+    for (const auto& stage : model->getStages()) {
+        if (stage->category() != StageCategory::HW)
+            continue;
+
+        HwOpList hwOps;
+
+        auto opType = stage->attrs().get<HwOpType>("hwOpType");
+
+        if (opType == HwOpType::CONV || opType == HwOpType::CONV_POOL) {
+            auto input = stage->input(0);
+            auto biases = stage->input(2);
+            auto scales = stage->input(3);
+            auto output = stage->output(0);
+
+            auto kernelSizeX = stage->attrs().get<int>("kernelSizeX");
+            auto kernelSizeY = stage->attrs().get<int>("kernelSizeY");
+            auto kernelStride = stage->attrs().get<int>("kernelStride");
+
+            auto pad = stage->attrs().get<HwPaddingInfo>("pad");
+
+            auto tiling = stage->attrs().get<HwConvTileInfo>("tiling");
+
+            auto withReLU = stage->attrs().getOrDefault<bool>("withReLU", false);
+            auto a0 = stage->attrs().getOrDefault<uint32_t>("a0", 0);
+            auto a1 = stage->attrs().getOrDefault<uint32_t>("a1", 0);
+
+            auto withClamp = stage->attrs().getOrDefault<bool>("withClamp", false);
+            auto clampMax = stage->attrs().getOrDefault<float>("clampMax", 6.0f);
+
+            auto poolKernelSizeX = stage->attrs().getOrDefault<int>("poolKernelSizeX", 0);
+            auto poolKernelSizeY = stage->attrs().getOrDefault<int>("poolKernelSizeY", 0);
+
+            IE_ASSERT(tiling.numDescr > 0);
+
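+            // Emit one HW descriptor per output-channel tile; only the last
+            // descriptor may cover fewer channels (lastOutChans).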
+            int outChanOffset = 0;
+            for (int outTileIndex = 0; outTileIndex < tiling.numDescr; ++outTileIndex) {
+                auto outNumChans = outTileIndex == tiling.numDescr - 1 ? tiling.lastOutChans : tiling.outChansPerDescr;
+
+                HwOpParams hwOpParams;
+
+                hwOpParams.opType = opType;
+                hwOpParams.opMode = tiling.mode;
+
+                if (pad.enable) {
+                    hwOpParams.withPad = true;
+                    hwOpParams.padMode = HwPadMode::PAD_WITH_ZEROS;
+                }
+
+                int bufInd = 0;
+                hwOpParams.inputInd = bufInd++;
+                hwOpParams.coeffsInd = bufInd++;
+                if (biases->usage() != DataUsage::Fake) {
+                    hwOpParams.biasesInd = bufInd++;
+                }
+                if (scales->usage() != DataUsage::Fake) {
+                    hwOpParams.scalesInd = bufInd++;
+                }
+                hwOpParams.outputInd = bufInd++;
+
+                hwOpParams.outChanOffset = outChanOffset;
+                hwOpParams.outNumChans = outNumChans;
+
+                hwOpParams.kernelWidth = kernelSizeX;
+                hwOpParams.kernelHeight = kernelSizeY;
+                hwOpParams.kernelStride = kernelStride;
+
+                if (opType == HwOpType::CONV_POOL) {
+                    hwOpParams.poolKernelWidth = poolKernelSizeX;
+                    hwOpParams.poolKernelHeight = poolKernelSizeY;
+                }
+
+                if (withReLU) {
+                    hwOpParams.withReLU = true;
+                    hwOpParams.t0 = 0;
+                    hwOpParams.a0 = a0;
+                    hwOpParams.a1 = a1;
+                }
+                if (withClamp) {
+                    hwOpParams.withClamp = true;
+                    hwOpParams.clampMaxVal = clampMax;
+                }
+
+                hwOps.vec.emplace_back(hwOpParams);
+
+                outChanOffset += outNumChans;
+            }
+        } else if (opType == HwOpType::POOL) {
+            auto input = stage->input(0);
+            auto output = stage->output(0);
+
+            auto kernelSizeX = stage->attrs().get<int>("kernelSizeX");
+            auto kernelSizeY = stage->attrs().get<int>("kernelSizeY");
+            auto kernelStride = stage->attrs().get<int>("kernelStride");
+
+            auto poolType = stage->attrs().get<HwPoolType>("poolType");
+
+            auto pad = stage->attrs().get<HwPaddingInfo>("pad");
+
+            auto tiling = stage->attrs().get<HwPoolTileInfo>("tiling");
+
+            auto withReLU = stage->attrs().get<bool>("withReLU");
+
+            auto origDimC = output->desc().dim(Dim::C);
+            auto origDimN = output->desc().dim(Dim::N, 1);
+
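+            // Fold the batch dimension into channels: HW pooling processes N * C planes.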
+            auto hwDimC = origDimN * origDimC;
+
+            IE_ASSERT(tiling.numDescr > 0);
+
+            int chanOffset = 0;
+            for (int outTileIndex = 0; outTileIndex < tiling.numDescr; ++outTileIndex) {
+                auto numChans =
+                    outTileIndex == tiling.numDescr - 1 ?
+                        hwDimC - outTileIndex * tiling.chansPerDescr :
+                        tiling.chansPerDescr;
+
+                HwOpParams hwOpParams;
+
+                hwOpParams.opType = opType;
+                hwOpParams.opMode = tiling.mode;
+
+                hwOpParams.poolType = poolType;
+
+                if (pad.enable) {
+                    HwPadMode padType = HwPadMode::PAD_WITH_ZEROS;
+
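+                    // For MAX pooling, padded regions repeat the edge values
+                    // instead of zeros, so padding never changes the maximum.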
+                    if (poolType == HwPoolType::MAX) {
+                        if (pad.left > 0) {
+                            padType = padType | HwPadMode::PAD_REPEAT_LEFT_EDGE;
+                        }
+                        if (pad.right > 0) {
+                            padType = padType | HwPadMode::PAD_REPEAT_RIGHT_EDGE;
+                        }
+                        if (pad.top > 0) {
+                            padType = padType | HwPadMode::PAD_REPEAT_TOP_EDGE;
+                        }
+                        if (pad.bottom > 0) {
+                            padType = padType | HwPadMode::PAD_REPEAT_BOTTOM_EDGE;
+                        }
+                    }
+
+                    hwOpParams.withPad = true;
+                    hwOpParams.padMode = padType;
+                }
+
+                int bufInd = 0;
+                hwOpParams.inputInd = bufInd++;
+                hwOpParams.outputInd = bufInd++;
+
+                hwOpParams.outChanOffset = chanOffset;
+                hwOpParams.outNumChans = numChans;
+
+                hwOpParams.kernelWidth = kernelSizeX;
+                hwOpParams.kernelHeight = kernelSizeY;
+                hwOpParams.kernelStride = kernelStride;
+
+                if (withReLU) {
+                    hwOpParams.withReLU = true;
+                    hwOpParams.t0 = 0;
+                    hwOpParams.a0 = 0;
+                    hwOpParams.a1 = 1;
+                }
+
+                hwOps.vec.emplace_back(hwOpParams);
+
+                chanOffset += tiling.chansPerDescr;
+            }
+        } else if (opType == HwOpType::FC) {
+            auto input = stage->input(0);
+            auto biases = stage->input(2);
+            auto scales = stage->input(3);
+            auto output = stage->output(0);
+
+            auto tiling = stage->attrs().get<HwFullyConnectedTileInfo>("tiling");
+
+            auto withReLU = stage->attrs().get<bool>("withReLU");
+
+            IE_ASSERT(tiling.numOutTiles > 0);
+            IE_ASSERT(tiling.numInSubTiles > 0);
+
+            int outputOffset = 0;
+            for (int outTileIndex = 0; outTileIndex < tiling.numOutTiles; ++outTileIndex) {
+                int inputOffset = 0;
+                for (int subInTileIndex = 0; subInTileIndex < tiling.numInSubTiles; ++subInTileIndex) {
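+                    // Every input sub-tile except the last accumulates its
+                    // partial sums into the same output descriptor.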
+                    auto lastSubTile = (subInTileIndex == tiling.numInSubTiles - 1);
+                    auto accum = !lastSubTile;
+
+                    HwOpParams hwOpParams;
+
+                    hwOpParams.opType = opType;
+                    hwOpParams.opMode = tiling.mode;
+
+                    int bufInd = 0;
+                    hwOpParams.inputInd = bufInd++;
+                    hwOpParams.coeffsInd = bufInd++;
+                    if (biases->usage() != DataUsage::Fake) {
+                        hwOpParams.biasesInd = bufInd++;
+                    }
+                    if (scales->usage() != DataUsage::Fake) {
+                        hwOpParams.scalesInd = bufInd++;
+                    }
+                    hwOpParams.outputInd = bufInd++;
+
+                    hwOpParams.fcInputOffset = inputOffset;
+                    hwOpParams.fcInputNum = tiling.workInN;
+                    hwOpParams.fcOutputOffset = outputOffset;
+                    hwOpParams.fcOutputNum = tiling.workOutN;
+                    hwOpParams.fcAccum = accum;
+
+                    if (lastSubTile && withReLU) {
+                        hwOpParams.withReLU = true;
+                        hwOpParams.t0 = 0;
+                        hwOpParams.a0 = 0;
+                        hwOpParams.a1 = 1;
+                    }
+
+                    hwOps.vec.emplace_back(hwOpParams);
+
+                    inputOffset += tiling.workInN;
+                }
+
+                outputOffset += tiling.workOutN;
+            }
+        }
+
+        IE_ASSERT(!hwOps.vec.empty());
+
+        stage->attrs().set("hwOps", hwOps);
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::finalizeHwOps() {
+    return std::make_shared<PassImpl>();
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/find_subgraphs.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/find_subgraphs.cpp
new file mode 100644 (file)
index 0000000..cc4be98
--- /dev/null
@@ -0,0 +1,61 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <tuple>
+#include <vector>
+#include <algorithm>
+#include <limits>
+#include <string>
+#include <utility>
+#include <cmath>
+#include <list>
+#include <set>
+#include <unordered_map>
+#include <memory>
+
+#include <vpu/stub_stage.hpp>
+#include <vpu/sw/utility.hpp>
+#include <vpu/compile_env.hpp>
+
+namespace vpu {
+
+namespace {
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(findSubGraphs);
+
+    const auto& env = CompileEnv::get();
+    auto stages = model->getStages();
+    int maxClasses = 0;
+    int currentCount = 0;
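+    // Walk the stages in order, packing at most
+    // env.config.numberOfNodesInOneSubGraph stages into each sub-graph.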
+    for (const auto& stage : stages) {
+        if (currentCount >= env.config.numberOfNodesInOneSubGraph) {
+            currentCount = 0;
+            maxClasses++;
+        }
+        stage->setSubGraphNumber(maxClasses);
+        currentCount++;
+    }
+    model->setNumberOfSubGraphs(maxClasses + 1);
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::findSubGraphs() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/hw_conv_tiling.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/hw_conv_tiling.cpp
new file mode 100644 (file)
index 0000000..35d6fea
--- /dev/null
@@ -0,0 +1,1372 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <tuple>
+#include <utility>
+#include <memory>
+#include <list>
+#include <string>
+#include <limits>
+#include <algorithm>
+#include <vector>
+#include <unordered_map>
+#include <set>
+
+#include <precision_utils.h>
+
+#include <vpu/compile_env.hpp>
+#include <vpu/stub_stage.hpp>
+#include <vpu/hw/mx_stage.hpp>
+#include <vpu/hw/tiling.hpp>
+#include <vpu/hw/utility.hpp>
+
+namespace vpu {
+
+namespace {
+
+class Optimizer final {
+public:
+    Optimizer(const std::string& stageName,
+              const DimValues& inputDims, const DimValues& outputDims,
+              const DimValues& origOutputDims,
+              bool withPool,
+              int kernelSizeX, int kernelSizeY,
+              int kernelStride,
+              int paddingX, int paddingY)
+        : _stageName(stageName),
+          _inputDims(inputDims), _outputDims(outputDims),
+          _origOutputDims(origOutputDims),
+          _withPool(withPool),
+          _kernelSizeX(kernelSizeX), _kernelSizeY(kernelSizeY),
+          _kernelStride(kernelStride),
+          _paddingX(paddingX), _paddingY(paddingY) {
+    }
+
+    bool optimize() {
+        initTileSizes();
+
+        if (!selectBestTile()) {
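+            // No valid tiling exists with the merged pooling: split the
+            // pooling back out and retry the whole optimization.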
+            if (_withPool) {
+                removePool();
+                return optimize();
+            }
+
+            return false;
+        }
+
+        patternMatching();
+
+        // Merged Pooling and SoC (split over channels) can't be used together.
+        if (_withPool) {
+            IE_ASSERT(!hasSoC());
+        }
+
+        if (!createTiles()) {
+            if (_withPool) {
+                removePool();
+                return optimize();
+            }
+
+            return false;
+        }
+
+        return true;
+    }
+
+    bool withPool() const {
+        return _withPool;
+    }
+
+    const HwConvTilingPtr& getTiling() const {
+        return _tiling;
+    }
+
+private:
+    void initTileSizes() {
+        int tempX = _inputDims[Dim::W] + 2 * _paddingX - _kernelSizeX;
+        int tempY = _inputDims[Dim::H] + 2 * _paddingY - _kernelSizeY;
+
+        int outWidthWithOutCeil = (tempX + _kernelStride) / _kernelStride;
+        int outHeightWithOutCeil = (tempY + _kernelStride) / _kernelStride;
+
+        int outWidthWithCeil =  static_cast<int>(std::ceil(static_cast<double>(tempX) / _kernelStride + 1));
+        int outHeightWithCeil = static_cast<int>(std::ceil(static_cast<double>(tempY) / _kernelStride + 1));
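+        // Worked example (W = 224, padding = 1, kernel = 3, stride = 2):
+        //   tempX = 224 + 2 * 1 - 3 = 223
+        //   without ceil: (223 + 2) / 2 = 112
+        //   with ceil:    ceil(223 / 2.0 + 1) = 113
+        // The original output size must match one of these two rounding modes.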
+
+        if ((_origOutputDims[Dim::W] != outWidthWithCeil) && (_origOutputDims[Dim::W] != outWidthWithOutCeil)) {
+            VPU_THROW_EXCEPTION
+                    << "Internal error: Output in " << _stageName << " has incorrect width dimension. Expected: "
+                    << outWidthWithCeil << " or " << outWidthWithOutCeil << ". Actual: " << _origOutputDims[Dim::W];
+        }
+
+        if ((_origOutputDims[Dim::H] != outHeightWithCeil) && (_origOutputDims[Dim::H] != outHeightWithOutCeil)) {
+            VPU_THROW_EXCEPTION
+                    << "Internal error: Output in " << _stageName << " has incorrect height dimension. Expected: "
+                    << outHeightWithCeil << " or " << outHeightWithOutCeil << ". Actual: " << _origOutputDims[Dim::H];
+        }
+
+        if ((_origOutputDims[Dim::W] == outWidthWithCeil) && (_origOutputDims[Dim::H] == outHeightWithCeil)) {
+            _useCeil = true;
+        } else {
+            IE_ASSERT((_origOutputDims[Dim::W] == outWidthWithOutCeil) && (_origOutputDims[Dim::H] == outHeightWithOutCeil));
+        }
+
+        _inputTileDims.set(Dim::W, std::min(CNN_MAX_INPUT_WIDTH, _inputDims[Dim::W]));
+        _inputTileDims.set(Dim::H, std::min(CNN_MAX_INPUT_HEIGHT, _inputDims[Dim::H]));
+        _inputTileDims.set(Dim::C, std::min(CNN_MAX_INPUT_CHANNELS, _inputDims[Dim::C]));
+
+        _outputTileDims.set(Dim::W, _outputDims[Dim::W]);
+        _outputTileDims.set(Dim::H, _outputDims[Dim::H]);
+        _outputTileDims.set(Dim::C, _outputDims[Dim::C]);
+
+        correctOutputPlaneSize();
+    }
+
+    void patternMatching() {
+        if (!_withPool &&
+            _kernelSizeX == 3 && _kernelSizeY == 3 && _paddingX == 1 && _paddingY == 1 && _kernelStride == 1 &&
+            _inputDims[Dim::C] == 512 && _inputDims[Dim::H] == 28 && _inputDims[Dim::W] == 28 &&
+            _outputDims[Dim::C] == 512) {
+            _inputTileDims.set(Dim::H, 28);
+            _inputTileDims.set(Dim::C, 172);
+            _outputTileDims.set(Dim::H, _outputDims[Dim::H]);
+            _outputTileDims.set(Dim::W, _outputDims[Dim::W]);
+            correctOutputPlaneSize();
+            return;
+        }
+
+        if (!_withPool &&
+            _kernelSizeX == 3 && _kernelSizeY == 3 && _paddingX == 1 && _paddingY == 1 && _kernelStride == 1 &&
+            _inputDims[Dim::C] == 256 && _inputDims[Dim::H] == 56 && _inputDims[Dim::W] == 56 &&
+            _outputDims[Dim::C] == 256) {
+            _inputTileDims.set(Dim::H, 30);
+            _inputTileDims.set(Dim::C, 128);
+            _outputTileDims.set(Dim::H, _outputDims[Dim::H]);
+            _outputTileDims.set(Dim::W, _outputDims[Dim::W]);
+            correctOutputPlaneSize();
+            return;
+        }
+
+        if (!_withPool &&
+            _kernelSizeX == 3 && _kernelSizeY == 3 && _paddingX == 1 && _paddingY == 1 && _kernelStride == 1 &&
+            _inputDims[Dim::C] == 64 && _inputDims[Dim::H] == 224 && _inputDims[Dim::W] == 224 &&
+            _outputDims[Dim::C] == 64) {
+            _inputTileDims.set(Dim::H, 82);
+            _inputTileDims.set(Dim::W, 82);
+            _outputTileDims.set(Dim::H, _outputDims[Dim::H]);
+            _outputTileDims.set(Dim::W, _outputDims[Dim::W]);
+            correctOutputPlaneSize();
+            return;
+        }
+
+        if (_inputDims[Dim::C] == 512 &&
+                _inputDims[Dim::H] == 7 &&
+                _inputDims[Dim::W] == 7 &&
+                _outputDims[Dim::C] == 4096) {
+            _inputTileDims.set(Dim::C, 64);
+            correctOutputPlaneSize();
+            return;
+        }
+
+        if (!_withPool &&
+            _kernelSizeX == 3 && _kernelSizeY == 3 && _paddingX == 1 && _paddingY == 1 && _kernelStride == 1 &&
+            _inputDims[Dim::C] == 128 && _inputDims[Dim::H] == 112 && _inputDims[Dim::W] == 112 &&
+            _outputDims[Dim::C] == 128) {
+            _inputTileDims.set(Dim::H, 32);
+            _inputTileDims.set(Dim::W, 112);
+            _inputTileDims.set(Dim::C, 32);
+            _outputTileDims.set(Dim::H, _outputDims[Dim::H]);
+            _outputTileDims.set(Dim::W, _outputDims[Dim::W]);
+            correctOutputPlaneSize();
+            return;
+        }
+
+        if (_inputDims[Dim::C] == 1088 &&
+            _inputDims[Dim::H] == 17 &&
+            _inputDims[Dim::W] == 17 &&
+            (_outputDims[Dim::C] == 128 || _outputDims[Dim::C] == 192)) {
+            _inputTileDims.set(Dim::H, 17);
+            _inputTileDims.set(Dim::C, 544);
+            _outputTileDims.set(Dim::H, _outputDims[Dim::H]);
+            _outputTileDims.set(Dim::W, _outputDims[Dim::W]);
+            correctOutputPlaneSize();
+            return;
+        }
+
+        if (_inputDims[Dim::C] == 1024 &&
+                _inputDims[Dim::H] == 17 &&
+                _inputDims[Dim::W] == 17 &&
+                _outputDims[Dim::C] == 384) {
+            _inputTileDims.set(Dim::H, 17);
+            _inputTileDims.set(Dim::C, 512);
+            _outputTileDims.set(Dim::H, _outputDims[Dim::H]);
+            _outputTileDims.set(Dim::W, _outputDims[Dim::W]);
+            correctOutputPlaneSize();
+            return;
+        }
+
+        if (!_withPool &&
+            _kernelSizeX == 3 && _kernelSizeY == 3 && _paddingX == 0 && _paddingY == 0 && _kernelStride == 2 &&
+            _inputDims[Dim::C] == 384 && _inputDims[Dim::H] == 35 && _inputDims[Dim::W] == 35 &&
+            _outputDims[Dim::C] == 384) {
+            _inputTileDims.set(Dim::C, 194);
+            _inputTileDims.set(Dim::H, 35);
+            _inputTileDims.set(Dim::W, 35);
+            _outputTileDims.set(Dim::H, _outputDims[Dim::H]);
+            _outputTileDims.set(Dim::W, _outputDims[Dim::W]);
+            correctOutputPlaneSize();
+            return;
+        }
+
+        if (_inputDims[Dim::C] == 192 &&
+                _inputDims[Dim::H] == 71 &&
+                _inputDims[Dim::W] == 71 &&
+                _outputDims[Dim::H] == 35) {
+            _inputTileDims.set(Dim::W, 71);
+            _inputTileDims.set(Dim::C, 96);
+            _outputTileDims.set(Dim::H, _outputDims[Dim::H]);
+            _outputTileDims.set(Dim::W, _outputDims[Dim::W]);
+            correctOutputPlaneSize();
+            return;
+        }
+
+        if (!_withPool &&
+                _inputDims[Dim::C] == 256 &&
+                _inputDims[Dim::H] == 128 &&
+                _inputDims[Dim::W] == 128 &&
+                _outputDims[Dim::C] == 256) {
+            _inputTileDims.set(Dim::W, 128);
+            _inputTileDims.set(Dim::H, 15);
+            _inputTileDims.set(Dim::C, 64);
+            _outputTileDims.set(Dim::H, _outputDims[Dim::H]);
+            _outputTileDims.set(Dim::W, _outputDims[Dim::W]);
+            correctOutputPlaneSize();
+            return;
+        }
+
+        if (!_withPool &&
+                _inputDims[Dim::C] == 512 &&
+                _inputDims[Dim::H] == 64 &&
+                _inputDims[Dim::W] == 64 &&
+                _outputDims[Dim::C] == 512) {
+            _inputTileDims.set(Dim::W, 64);
+            _inputTileDims.set(Dim::H, 10);
+            _inputTileDims.set(Dim::C, 128);
+            _outputTileDims.set(Dim::H, _outputDims[Dim::H]);
+            _outputTileDims.set(Dim::W, _outputDims[Dim::W]);
+            correctOutputPlaneSize();
+            return;
+        }
+
+        if (!_withPool &&
+            _kernelSizeX == 1 && _kernelSizeY == 1 && _paddingX == 0 && _paddingY == 0 && _kernelStride == 1 &&
+            _inputDims[Dim::C] == 384 &&
+            _inputDims[Dim::H] == 56 &&
+            _inputDims[Dim::W] == 56 &&
+            _outputDims[Dim::C] == 64) {
+            _inputTileDims.set(Dim::C, 384);
+            _inputTileDims.set(Dim::H, 56);
+            _inputTileDims.set(Dim::W, 20);
+            _outputTileDims.set(Dim::H, _outputDims[Dim::H]);
+            _outputTileDims.set(Dim::W, _outputDims[Dim::W]);
+            correctOutputPlaneSize();
+            return;
+        }
+
+        if (!_withPool &&
+            _kernelSizeX == 1 && _kernelSizeY == 1 && _paddingX == 0 && _paddingY == 0 && _kernelStride == 1 &&
+            _inputDims[Dim::C] == 2112 &&
+            _inputDims[Dim::H] == 14 &&
+            _inputDims[Dim::W] == 14 &&
+            _outputDims[Dim::C] == 1056) {
+            _inputTileDims.set(Dim::C, 556);
+            _inputTileDims.set(Dim::H, 14);
+            _inputTileDims.set(Dim::W, 14);
+            _outputTileDims.set(Dim::H, _outputDims[Dim::H]);
+            _outputTileDims.set(Dim::W, _outputDims[Dim::W]);
+            correctOutputPlaneSize();
+            return;
+        }
+
+        if (!_withPool &&
+            _kernelSizeX == 3 && _kernelSizeY == 3 && _paddingX == 1 && _paddingY == 1 && _kernelStride == 2 &&
+            _inputDims[Dim::C] == 256 &&
+            _inputDims[Dim::H] == 52 &&
+            _inputDims[Dim::W] == 52 &&
+            _outputDims[Dim::C] == 512) {
+            _inputTileDims.set(Dim::C, 128);
+            _inputTileDims.set(Dim::H, 52);
+            _inputTileDims.set(Dim::W, 52);
+            _outputTileDims.set(Dim::H, _outputDims[Dim::H]);
+            _outputTileDims.set(Dim::W, _outputDims[Dim::W]);
+            correctOutputPlaneSize();
+            return;
+        }
+
+        if (!_withPool &&
+            _kernelSizeX == 3 && _kernelSizeY == 3 && _paddingX == 1 && _paddingY == 1 && _kernelStride == 1 &&
+            _inputDims[Dim::C] == 256 &&
+            _inputDims[Dim::H] == 23 &&
+            _inputDims[Dim::W] == 23 &&
+            _outputDims[Dim::C] == 640) {
+            _inputTileDims.set(Dim::C, 256);
+            _inputTileDims.set(Dim::H, 14);
+            _inputTileDims.set(Dim::W, 23);
+            _outputTileDims.set(Dim::H, _outputDims[Dim::H]);
+            _outputTileDims.set(Dim::W, _outputDims[Dim::W]);
+            correctOutputPlaneSize();
+            return;
+        }
+    }
+
+    bool selectBestTile() {
+        struct Solution final {
+            int numWidthTiles = 0;
+            int numHeightTiles = 0;
+            int numChannelTiles = 0;
+            int totalNumTiles = 0;
+            double cost = std::numeric_limits<double>::max();
+        };
+
+        const auto& env = CompileEnv::get();
+
+        // TODO: estimate these numbers
+        const int maxNumWidthTiles = 15;
+        const int maxNumHeightTiles = 15;
+        const int maxNumChannelTiles = _withPool ? 1 : 15;
+
+        Solution bestSol;
+
+        auto outputTileCopy = _outputTileDims;
+
+        auto minInputTileDimW = 64;
+        auto minInputTileDimH = _kernelSizeY;
+        if (_withPool) {
+            minInputTileDimW *= 2;
+            minInputTileDimH *= 2;
+        }
+
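+        // Brute-force search over (channel, width, height) tile counts. Each
+        // candidate splits a dimension into divUp(dim, numTiles)-sized tiles
+        // (as even as possible, rounded up) and is costed per HW descriptor;
+        // the cheapest valid solution wins, ties broken by fewer total tiles.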
+        for (int numChannelTiles = 1; numChannelTiles <= maxNumChannelTiles; numChannelTiles++) {
+            int inputTileDimC = divUp(_inputDims[Dim::C], numChannelTiles);
+
+            for (int numWidthTiles = 1; numWidthTiles <= maxNumWidthTiles; numWidthTiles++) {
+                int inputTileDimW = divUp(_inputDims[Dim::W], numWidthTiles);
+
+                //
+                // Filter out too-small SoW (split-over-width) tiles.
+                //
+
+                if (numWidthTiles > 1 && inputTileDimW < minInputTileDimW) {
+                    break;
+                }
+
+                for (int numHeightTiles = 1; numHeightTiles <= maxNumHeightTiles; numHeightTiles++) {
+                    int inputTileDimH = divUp(_inputDims[Dim::H], numHeightTiles);
+
+                    //
+                    // Filter out too-small SoH (split-over-height) tiles.
+                    //
+
+                    if (numHeightTiles > 1 && inputTileDimH < minInputTileDimH) {
+                        break;
+                    }
+
+                    //
+                    // Try current tile size.
+                    //
+
+                    _inputTileDims.set(Dim::W, inputTileDimW);
+                    _inputTileDims.set(Dim::H, inputTileDimH);
+                    _inputTileDims.set(Dim::C, inputTileDimC);
+
+                    _outputTileDims = outputTileCopy;
+                    correctOutputPlaneSize();
+
+                    //
+                    // Limitations for Conv+Pool case.
+                    //
+
+                    if (_withPool) {
+                        if (_outputTileDims[Dim::W] <= 2 ||
+                            _outputTileDims[Dim::H] <= 2) {
+                            break;
+                        }
+                    }
+
+                    //
+                    // Check that tiling is valid.
+                    //
+
+                    auto heightTiles = calcHeightTiles();
+                    auto widthTiles = calcWidthTiles();
+
+                    if (heightTiles.empty()) {
+                        continue;
+                    }
+                    if (widthTiles.empty()) {
+                        break;
+                    }
+
+                    bool isOK = true;
+                    double solutionCost = 0.0;
+
+                    for (const auto& heightTile : heightTiles) {
+                        for (const auto& widthTile : widthTiles) {
+                            //
+                            // Limitations for Conv+Pool case.
+                            //
+
+                            if (_withPool) {
+                                if (widthTile.inputWithJunk % 2 != 0 ||
+                                    heightTile.inputWithJunk % 2 != 0 ||
+                                    widthTile.outputWithJunk % 2 != 0 ||
+                                    widthTile.outputWithJunk <= 2 ||
+                                    heightTile.outputWithJunk <= 2) {
+                                    isOK = false;
+                                    break;
+                                }
+                            }
+
+                            //
+                            // Can use this tile.
+                            //
+
+                            auto tileInfo = splitHwConvIntoOutChannelsTiles(
+                                widthTile.inputWithJunk, heightTile.inputWithJunk, inputTileDimC,
+                                outputTileCopy[Dim::C],
+                                _kernelSizeX, _kernelSizeY, _kernelStride);
+
+                            if (tileInfo.numDescr == 0) {
+                                isOK = false;
+                                break;
+                            }
+
+                            //
+                            // Output tile fits to CMX limitation.
+                            //
+
+                            DimValues fullOutputTileDims;
+                            fullOutputTileDims.set(Dim::W, widthTile.outputWithJunk);
+                            fullOutputTileDims.set(Dim::H, heightTile.outputWithJunk);
+                            fullOutputTileDims.set(Dim::C, outputTileCopy[Dim::C]);
+
+                            // TODO: support HCW
+                            if (calculateHwBufferSize(fullOutputTileDims) > env.resources.cmxLimit) {
+                                isOK = false;
+                                break;
+                            }
+
+                            //
+                            // Calc tile cost.
+                            //
+
+                            solutionCost += tileInfo.cost * numChannelTiles;
+
+                            // Alignment for output
+                            if ((widthTile.outputStartIndex * sizeof(fp16_t)) % 16 != 0) {
+                                solutionCost += 1.0
+                                      * widthTile.outputWithJunk
+                                      * heightTile.outputWithJunk
+                                      * outputTileCopy[Dim::C];
+                            }
+
+                            // Alignment for input
+                            if ((widthTile.inputStartIndex * sizeof(fp16_t)) % 16 != 0) {
+                                solutionCost += 1.0
+                                      * widthTile.inputWithJunk
+                                      * heightTile.inputWithJunk
+                                      * tileInfo.extendedInputDimC;
+                            }
+
+                            // SoC overhead
+                            solutionCost += 1.0
+                                  * (numChannelTiles - 1)
+                                  * widthTile.outputWithJunk
+                                  * heightTile.outputWithJunk
+                                  * outputTileCopy[Dim::C];
+                        }
+
+                        if (!isOK) {
+                            break;
+                        }
+                    }
+
+                    if (!isOK) {
+                        continue;
+                    }
+
+                    //
+                    // Compare with current best solution.
+                    //
+
+                    Solution curSol;
+                    curSol.numWidthTiles = numWidthTiles;
+                    curSol.numHeightTiles = numHeightTiles;
+                    curSol.numChannelTiles = numChannelTiles;
+                    curSol.totalNumTiles = numWidthTiles * numHeightTiles * numChannelTiles;
+                    curSol.cost = solutionCost;
+
+                    if (curSol.cost < bestSol.cost || (isDoubleEqual(curSol.cost, bestSol.cost) && curSol.totalNumTiles < bestSol.totalNumTiles)) {
+                        bestSol = curSol;
+                    }
+
+                    // Skip smaller SoC tiling.
+                    break;
+                }
+            }
+        }
+
+        if (bestSol.totalNumTiles == 0) {
+            return false;
+        }
+
+        int inputTileDimW = divUp(_inputDims[Dim::W], bestSol.numWidthTiles);
+        int inputTileDimH = divUp(_inputDims[Dim::H], bestSol.numHeightTiles);
+        int inputTileDimC = divUp(_inputDims[Dim::C], bestSol.numChannelTiles);
+
+        _inputTileDims.set(Dim::W, inputTileDimW);
+        _inputTileDims.set(Dim::H, inputTileDimH);
+        _inputTileDims.set(Dim::C, inputTileDimC);
+
+        _outputTileDims = outputTileCopy;
+        correctOutputPlaneSize();
+
+        return true;
+    }
+
+    bool createTiles() {
+        auto heightTiles = calcHeightTiles();
+        IE_ASSERT(!heightTiles.empty());
+
+        auto widthTiles = calcWidthTiles();
+        IE_ASSERT(!widthTiles.empty());
+
+        _tiling = std::make_shared<HwConvTiling>();
+        _tiling->sohTiles = heightTiles.size();
+        _tiling->sowTiles = widthTiles.size();
+        _tiling->socTiles = divUp(_inputDims[Dim::C], _inputTileDims[Dim::C]);
+
+        for (int sohInd = 0; sohInd < _tiling->sohTiles; ++sohInd) {
+            const auto& heightTileInfo = heightTiles[sohInd];
+
+            for (int sowInd = 0; sowInd < _tiling->sowTiles; ++sowInd) {
+                const auto& widthTileInfo = widthTiles[sowInd];
+
+                auto planeTile = std::make_shared<HwConvPlaneTile>();
+                planeTile->parent = _tiling;
+
+                planeTile->sohInd = sohInd;
+                planeTile->sowInd = sowInd;
+
+                planeTile->heightInfo = heightTileInfo;
+                planeTile->widthInfo = widthTileInfo;
+
+                for (int socInd = 0; socInd < _tiling->socTiles; ++socInd) {
+                    auto channelTile = std::make_shared<HwConvChannelTile>();
+                    channelTile->parent = planeTile;
+
+                    channelTile->socInd = socInd;
+
+                    channelTile->finalTiles = splitHwConvIntoOutChannelsTiles(
+                            widthTileInfo.inputWithJunk, heightTileInfo.inputWithJunk, _inputTileDims[Dim::C],
+                            _outputTileDims[Dim::C],
+                            _kernelSizeX, _kernelSizeY, _kernelStride);
+
+                    if (channelTile->finalTiles.numDescr == 0) {
+                        return false;
+                    }
+
+                    channelTile->extendedInputDimC = channelTile->finalTiles.extendedInputDimC;
+                    channelTile->extendedOutputDimC = channelTile->finalTiles.extendedOutputDimC;
+
+                    channelTile->channelStartIndex = socInd * _inputTileDims[Dim::C];
+                    channelTile->numInputChannels = _inputTileDims[Dim::C];
+
+                    planeTile->channelTiles.emplace_back(channelTile);
+                }
+
+                _tiling->planeTiles.emplace_back(planeTile);
+            }
+        }
+
+        return true;
+    }
+
+private:
+    void correctOutputPlaneSize() {
+        int maxOutputWidth = calcOutputSize(_inputTileDims[Dim::W], _kernelSizeX, _kernelStride, _paddingX, _paddingX, _useCeil);
+        if (_withPool) {
+            maxOutputWidth /= 2;
+        }
+        _outputTileDims.set(Dim::W, std::min(_outputTileDims[Dim::W], maxOutputWidth));
+
+        int maxOutputHeight = calcOutputSize(_inputTileDims[Dim::H], _kernelSizeY, _kernelStride, _paddingY, _paddingY, _useCeil);
+        if (_withPool) {
+            maxOutputHeight /= 2;
+        }
+        _outputTileDims.set(Dim::H, std::min(_outputTileDims[Dim::H], maxOutputHeight));
+    }
+
+    bool hasSoC() const {
+        return _inputTileDims[Dim::C] != _inputDims[Dim::C];
+    }
+
+    void removePool() {
+        _withPool = false;
+        _outputDims = _origOutputDims;
+    }
+
+    std::vector<HwPlaneTileInfo> calcHeightTiles() {
+        std::vector<HwPlaneTileInfo> heightTiles;
+
+        if (_outputTileDims[Dim::H] == _outputDims[Dim::H]) {
+            HwPlaneTileInfo info;
+            info.inputWithJunk = _inputDims[Dim::H];
+            info.outputWithJunk = _outputDims[Dim::H];
+            info.outputJunkBefore = 0;
+            info.outputJunkAfter = 0;
+            info.inputStartIndex = 0;
+            info.inputEndIndex = _inputDims[Dim::H];
+            info.outputStartIndex = 0;
+            info.outputEndIndex = _outputDims[Dim::H];
+
+            heightTiles.emplace_back(info);
+        } else {
+            if (_withPool) {
+                heightTiles = splitIntoPlaneTilesWithPool(
+                    _inputDims[Dim::H],
+                    _kernelSizeY,
+                    _kernelStride,
+                    _paddingY,
+                    _outputTileDims[Dim::H]);
+            } else {
+                heightTiles = splitIntoPlaneTiles(
+                    _inputDims[Dim::H],
+                    _outputDims[Dim::H],
+                    _kernelSizeY,
+                    _kernelStride,
+                    _paddingY, _paddingY,
+                    _outputTileDims[Dim::H],
+                    false,
+                    _useCeil);
+            }
+        }
+
+        return heightTiles;
+    }
+
+    std::vector<HwPlaneTileInfo> calcWidthTiles() {
+        std::vector<HwPlaneTileInfo> widthTiles;
+
+        if (_outputTileDims[Dim::W] == _outputDims[Dim::W]) {
+            HwPlaneTileInfo info;
+            info.inputWithJunk = _inputDims[Dim::W];
+            info.outputWithJunk = _outputDims[Dim::W];
+            info.outputJunkBefore = 0;
+            info.outputJunkAfter = 0;
+            info.inputStartIndex = 0;
+            info.inputEndIndex = _inputDims[Dim::W];
+            info.outputStartIndex = 0;
+            info.outputEndIndex = _outputDims[Dim::W];
+
+            widthTiles.emplace_back(info);
+        } else {
+            if (_withPool) {
+                widthTiles = splitIntoPlaneTilesWithPool(
+                    _inputDims[Dim::W],
+                    _kernelSizeX,
+                    _kernelStride,
+                    _paddingX,
+                    _outputTileDims[Dim::W]);
+            } else {
+                widthTiles = splitIntoPlaneTiles(
+                    _inputDims[Dim::W],
+                    _outputDims[Dim::W],
+                    _kernelSizeX,
+                    _kernelStride,
+                    _paddingX, _paddingX,
+                    _outputTileDims[Dim::W],
+                    true,
+                    _useCeil);
+            }
+        }
+
+        return widthTiles;
+    }
+
+private:
+    std::string _stageName;
+
+    DimValues _inputDims;
+    DimValues _outputDims;
+    DimValues _origOutputDims;
+
+    bool _withPool = false;
+
+    int _kernelSizeX = 0;
+    int _kernelSizeY = 0;
+    int _kernelStride = 0;
+    int _paddingX = 0;
+    int _paddingY = 0;
+
+    DimValues _inputTileDims;
+    DimValues _outputTileDims;
+
+    HwConvTilingPtr _tiling;
+
+    bool _useCeil = false;
+};
+
+using TileWeightsMap = std::unordered_map<int, Data>;
+
+const int BIASES_IND = -1;
+const int SCALES_IND = -2;
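+// The per-layer weights map is keyed by the SoC tile index; biases and scales
+// are cached in the same map under reserved negative keys.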
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(hwConvTiling);
+
+    for (const auto& origStage : model->getStages()) {
+        if (origStage->type() != StageType::StubConv) {
+            continue;
+        }
+
+        auto tryHW = origStage->attrs().getOrDefault<bool>("tryHW", false);
+        if (!tryHW) {
+            continue;
+        }
+
+        auto origInput = origStage->input(0);
+        auto origWeights = origStage->input(1);
+        auto origBiases = origStage->input(2);
+        auto origOutput = origStage->output(0);
+
+        auto kernelSizeX = origStage->attrs().get<int>("kernelSizeX");
+        auto kernelSizeY = origStage->attrs().get<int>("kernelSizeY");
+        auto kernelStride = origStage->attrs().get<int>("kernelStrideX");
+        auto padLeft = origStage->attrs().get<int>("padLeft");
+        auto padRight = origStage->attrs().get<int>("padRight");
+        auto padTop = origStage->attrs().get<int>("padTop");
+        auto padBottom = origStage->attrs().get<int>("padBottom");
+
+        auto withReLU = origStage->attrs().getOrDefault<bool>("withReLU", false);
+        auto negativeSlope = origStage->attrs().getOrDefault<float>("negativeSlope", 0.0f);
+        auto a0 = origStage->attrs().getOrDefault<uint32_t>("a0", 0);
+        auto a1 = origStage->attrs().getOrDefault<uint32_t>("a1", 0);
+        auto reluScale = origStage->attrs().getOrDefault<float>("reluScale", 1.0f);
+
+        auto withClamp = origStage->attrs().getOrDefault<bool>("withClamp", false);
+        auto clampMax = origStage->attrs().getOrDefault<float>("clampMax", 6.0f);
+
+        auto withPool = origStage->attrs().getOrDefault<bool>("withPool", false);
+        auto poolKernelSizeX = origStage->attrs().getOrDefault<int>("poolKernelSizeX", 0);
+        auto poolKernelSizeY = origStage->attrs().getOrDefault<int>("poolKernelSizeY", 0);
+        auto poolKernelStride = origStage->attrs().getOrDefault<int>("poolKernelStride", 0);
+        auto poolPadLeft = origStage->attrs().getOrDefault<int>("poolPadLeft", 0);
+        auto poolPadRight = origStage->attrs().getOrDefault<int>("poolPadRight", 0);
+        auto poolPadTop = origStage->attrs().getOrDefault<int>("poolPadTop", 0);
+        auto poolPadBottom = origStage->attrs().getOrDefault<int>("poolPadBottom", 0);
+
+        auto origOutputDesc = origStage->attrs().getOrDefault<DataDesc>("origConvOutput", origOutput->desc());
+
+        auto scaleFactor = origStage->attrs().getOrDefault<float>("scaleFactor", 1.0f);
+
+        auto& tileWeightsMap = origWeights->attrs().getOrSet<TileWeightsMap>("weightsPerTile", TileWeightsMap());
+
+        //
+        // Unsupported paddings are expected to be materialized by the separate
+        // hwPadding pass (hw_padding.cpp), so only HW-friendly paddings remain
+        //
+
+        auto hwInput = origInput;
+        auto hwOutput = origOutput;
+
+        //
+        // Try to find "best" tiling
+        //
+
+        Optimizer opt(origStage->name(),
+                      hwInput->desc().dims(), hwOutput->desc().dims(),
+                      origOutputDesc.dims(),
+                      withPool,
+                      kernelSizeX, kernelSizeY,
+                      kernelStride,
+                      padLeft, padTop);
+
+        //
+        // Use SW stage if tiling optimization failed
+        //
+
+        if (!opt.optimize()) {
+            origStage->attrs().set<bool>("tryHW", false);
+
+            auto swConvOutput = origOutput;
+            if (withReLU || withPool || withClamp) {
+                swConvOutput = model->addNewData(
+                    origStage->name(),
+                    origOutputDesc);
+                swConvOutput->attrs().copyFrom(origOutput->attrs());
+
+                model->replaceStageOutput(origStage->outputEdge(0), swConvOutput);
+            }
+
+            auto hwPoolInput = swConvOutput;
+            if (withReLU) {
+                auto swReluOutput = origOutput;
+                if (withPool) {
+                    swReluOutput = model->addNewData(
+                        origStage->name() + "@ReLU",
+                        origOutputDesc);
+                    swReluOutput->attrs().copyFrom(origOutput->attrs());
+                }
+
+                _stageBuilder->addReLUStage(
+                    model,
+                    origStage->name() + "@ReLU",
+                    origStage->origLayer(),
+                    negativeSlope,
+                    swConvOutput,
+                    swReluOutput);
+
+                hwPoolInput = swReluOutput;
+            }
+
+            if (withClamp) {
+                auto swClampOutput = origOutput;
+                if (withPool) {
+                    swClampOutput = model->addNewData(
+                            origStage->name() + "@Clamp",
+                            origOutputDesc);
+                    swClampOutput->attrs().copyFrom(origOutput->attrs());
+                }
+
+                _stageBuilder->addClampStage(
+                        model,
+                        origStage->name() + "@Clamp",
+                        origStage->origLayer(),
+                        0.0,
+                        clampMax,
+                        swConvOutput,
+                        swClampOutput);
+
+                hwPoolInput = swClampOutput;
+            }
+
+            if (withPool) {
+                auto hwPoolStage = model->addNewStage<StubStage>(
+                    origStage->name() + "@Pool",
+                    StageType::StubMaxPool,
+                    origStage->origLayer(),
+                    {hwPoolInput},
+                    {origOutput});
+
+                hwPoolStage->attrs().set<int>("kernelSizeX", poolKernelSizeX);
+                hwPoolStage->attrs().set<int>("kernelSizeY", poolKernelSizeY);
+
+                hwPoolStage->attrs().set<int>("kernelStrideX", poolKernelStride);
+                hwPoolStage->attrs().set<int>("kernelStrideY", poolKernelStride);
+
+                hwPoolStage->attrs().set<int>("padLeft", poolPadLeft);
+                hwPoolStage->attrs().set<int>("padRight", poolPadRight);
+                hwPoolStage->attrs().set<int>("padTop", poolPadTop);
+                hwPoolStage->attrs().set<int>("padBottom", poolPadBottom);
+
+                hwPoolStage->attrs().set<bool>("excludePad", false);
+
+                hwPoolStage->attrs().set<bool>("tryHW", true);
+            }
+
+            continue;
+        }
+
+        model->disconnectStageDatas(origStage);
+
+        //
+        // Remove the merged pool if tiling only succeeded without it
+        //
+
+        if (withPool && !opt.withPool()) {
+            auto hwPoolInput = model->addNewData(
+                origStage->name(),
+                origOutputDesc);
+            hwPoolInput->attrs().copyFrom(origOutput->attrs());
+
+            auto hwPoolStage = model->addNewStage<StubStage>(
+                origStage->name() + "@Pool",
+                StageType::StubMaxPool,
+                origStage->origLayer(),
+                {hwPoolInput},
+                {hwOutput});
+
+            hwPoolStage->attrs().set<int>("kernelSizeX", poolKernelSizeX);
+            hwPoolStage->attrs().set<int>("kernelSizeY", poolKernelSizeY);
+
+            hwPoolStage->attrs().set<int>("kernelStrideX", poolKernelStride);
+            hwPoolStage->attrs().set<int>("kernelStrideY", poolKernelStride);
+
+            hwPoolStage->attrs().set<int>("padLeft", poolPadLeft);
+            hwPoolStage->attrs().set<int>("padRight", poolPadRight);
+            hwPoolStage->attrs().set<int>("padTop", poolPadTop);
+            hwPoolStage->attrs().set<int>("padBottom", poolPadBottom);
+
+            hwPoolStage->attrs().set<bool>("excludePad", false);
+
+            hwPoolStage->attrs().set<bool>("tryHW", true);
+
+            hwOutput = hwPoolInput;
+
+            withPool = false;
+        }
+
+        //
+        // Expand input/output if needed
+        //
+
+        const auto& tiling = opt.getTiling();
+
+        int totalExtendedInputDimC = 0;
+        int maxExtendedOutputDimC = 0;
+        for (const auto& planeTile : tiling->planeTiles) {
+            for (const auto& channelTile : planeTile->channelTiles) {
+                totalExtendedInputDimC = std::max(totalExtendedInputDimC, channelTile->channelStartIndex + channelTile->extendedInputDimC);
+                maxExtendedOutputDimC = std::max(maxExtendedOutputDimC, channelTile->extendedOutputDimC);
+            }
+        }
+
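+        // Channel counts may be "extended" past the real ones to satisfy HW
+        // alignment (output channels are processed in groups of 8, see the
+        // weights repacking below); physically expand the input when the tiles
+        // address more channels than exist.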
+        auto origOutputDimC = hwOutput->desc().dim(Dim::C);
+
+        if (totalExtendedInputDimC > hwInput->desc().dim(Dim::C)) {
+            auto newDesc = hwInput->desc();
+            newDesc.setDim(Dim::C, totalExtendedInputDimC);
+
+            auto hwInputExtended = model->duplicateData(
+                hwInput,
+                "@extended",
+                newDesc);
+
+            _stageBuilder->addExpandStage(
+                model,
+                origStage->name() + "@expand-input",
+                origStage->origLayer(),
+                hwInput,
+                hwInputExtended);
+
+            hwInput = hwInputExtended;
+        }
+
+        //
+        // Create HW biases
+        //
+
+        auto hwBiases = tileWeightsMap[BIASES_IND];
+        if (hwBiases == nullptr) {
+            if (origBiases->usage() == DataUsage::Fake) {
+                hwBiases = model->addFakeData();
+            } else {
+                auto origBiasesContent = origBiases->content();
+                IE_ASSERT(origBiasesContent != nullptr);
+
+                auto origBiasesPtr = origBiasesContent->get<fp16_t>();
+                IE_ASSERT(origBiasesPtr != nullptr);
+
+                auto hwTileBiasesBlob = ie::make_shared_blob<fp16_t>(
+                    ie::Precision::FP16,
+                    ie::Layout::C,
+                    {static_cast<size_t>(maxExtendedOutputDimC)});
+                hwTileBiasesBlob->allocate();
+
+                auto hwTileBiasesBlobPtr = hwTileBiasesBlob->buffer().as<fp16_t*>();
+                IE_ASSERT(hwTileBiasesBlobPtr != nullptr);
+
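+                // Zero-fill the extended tail, then copy the real biases over
+                // the first origOutputDimC entries.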
+                std::fill_n(hwTileBiasesBlobPtr, maxExtendedOutputDimC, ie::PrecisionUtils::f32tof16(0.0f));
+                std::copy_n(origBiasesPtr, origOutputDimC, hwTileBiasesBlobPtr);
+
+                hwBiases = model->duplicateData(
+                    origBiases,
+                    "@HW",
+                    DataDesc({maxExtendedOutputDimC}),
+                    ieBlobContent(hwTileBiasesBlob));
+
+                if (scaleFactor != 1.0f) {
+                    auto hwBiasesScaled = model->duplicateData(
+                        hwBiases,
+                        formatString("@SCALE=%f", scaleFactor),
+                        hwBiases->desc(),
+                        scaleContent(hwBiases->content(), scaleFactor));
+                    hwBiasesScaled->attrs().getOrSet<float>("scaleFactor", 1.0f) *= scaleFactor;
+
+                    hwBiases = hwBiasesScaled;
+                }
+            }
+
+            tileWeightsMap[BIASES_IND] = hwBiases;
+        }
+
+        //
+        // Create HW scales
+        //
+
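+        // A per-channel output scale of 1/scaleFactor undoes the weights/bias
+        // pre-scaling; reluScale is folded in only when there is a single SoC
+        // tile.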
+        auto hwScales = tileWeightsMap[SCALES_IND];
+        if (hwScales == nullptr) {
+            float fullScale = 1.0f / scaleFactor;
+            if (tiling->socTiles == 1 && reluScale != 1.0f) {
+                fullScale *= reluScale;
+            }
+
+            if (fullScale == 1.0f) {
+                hwScales = model->addFakeData();
+            } else {
+                hwScales = model->addConstData(
+                    origStage->name() + "@scales",
+                    DataDesc({maxExtendedOutputDimC}),
+                    replicateContent(fullScale, maxExtendedOutputDimC));
+            }
+
+            tileWeightsMap[SCALES_IND] = hwScales;
+        }
+
+        //
+        // Create HW tiles
+        //
+
+        DataVector hwInputTiles;
+        std::vector<DimValues> hwInputTilesOffsets;
+
+        DataVector hwOutputTiles;
+        std::vector<DimValues> hwOutputTilesOffsets;
+
+        for (const auto& planeTile : tiling->planeTiles) {
+            auto planeTilePostfix = getPlaneTilePostfix(planeTile);
+
+            //
+            // Create output tile
+            //
+
+            Data hwOutputPlaneTile;
+
+            if (tiling->sohTiles == 1 && tiling->sowTiles == 1) {
+                hwOutputPlaneTile = hwOutput;
+            } else {
+                auto newDesc = hwOutput->desc();
+                newDesc.setDim(Dim::W, planeTile->widthInfo.outputEndIndex - planeTile->widthInfo.outputStartIndex);
+                newDesc.setDim(Dim::H, planeTile->heightInfo.outputEndIndex - planeTile->heightInfo.outputStartIndex);
+
+                hwOutputPlaneTile = model->duplicateData(
+                    hwOutput,
+                    planeTilePostfix,
+                    newDesc);
+
+                hwOutputTiles.emplace_back(hwOutputPlaneTile);
+                hwOutputTilesOffsets.emplace_back(
+                    DimValues({
+                        {Dim::W, planeTile->widthInfo.outputStartIndex},
+                        {Dim::H, planeTile->heightInfo.outputStartIndex}
+                    }));
+            }
+
+            //
+            // Add alignment to output tile if needed
+            //
+
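+            // Tile line starts must be 16-byte aligned; with 2-byte fp16
+            // elements a width offset that is not a multiple of 8 goes through
+            // a copy into an aligned buffer.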
+            if ((planeTile->widthInfo.outputStartIndex * sizeof(fp16_t)) % 16 != 0) {
+                auto hwOutputPlaneTileAligned = model->duplicateData(
+                    hwOutputPlaneTile,
+                    "@aligned");
+
+                _stageBuilder->addCopyStage(
+                    model,
+                    origStage->name() + planeTilePostfix + "@align-output-ptr",
+                    origStage->origLayer(),
+                    hwOutputPlaneTileAligned,
+                    hwOutputPlaneTile);
+
+                hwOutputPlaneTile = hwOutputPlaneTileAligned;
+            }
+
+            Data prevPartialSum;
+
+            for (const auto& channelTile : planeTile->channelTiles) {
+                auto channelTilePostfix = getChannelTilePostfix(channelTile);
+
+                auto tilePostfix = planeTilePostfix + channelTilePostfix;
+
+                auto hwOutputTile = hwOutputPlaneTile;
+
+                //
+                // Create input tile
+                //
+
+                Data hwInputTile;
+
+                if (tiling->sohTiles == 1 && tiling->sowTiles == 1 && tiling->socTiles == 1) {
+                    hwInputTile = hwInput;
+                } else {
+                    auto newDesc = hwInput->desc();
+                    newDesc.setDim(Dim::W, planeTile->widthInfo.inputWithJunk);
+                    newDesc.setDim(Dim::H, planeTile->heightInfo.inputWithJunk);
+                    newDesc.setDim(Dim::C, channelTile->extendedInputDimC);
+
+                    hwInputTile = model->duplicateData(
+                        hwInput,
+                        tilePostfix,
+                        newDesc);
+
+                    hwInputTiles.emplace_back(hwInputTile);
+                    hwInputTilesOffsets.emplace_back(
+                        DimValues({
+                            {Dim::W, planeTile->widthInfo.inputStartIndex},
+                            {Dim::H, planeTile->heightInfo.inputStartIndex},
+                            {Dim::C, channelTile->channelStartIndex}
+                        }));
+                }
+
+                //
+                // Add alignment to input tile if needed
+                //
+
+                if ((planeTile->widthInfo.inputStartIndex * sizeof(fp16_t)) % 16 != 0) {
+                    auto hwInputTileAligned = model->duplicateData(
+                        hwInputTile,
+                        "@aligned");
+
+                    _stageBuilder->addCopyStage(
+                        model,
+                        origStage->name() + tilePostfix + "@align-input-ptr",
+                        origStage->origLayer(),
+                        hwInputTile,
+                        hwInputTileAligned);
+
+                    hwInputTile = hwInputTileAligned;
+                }
+
+                //
+                // Process partial output for split-over-channels
+                //
+
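+                // Each SoC tile convolves only its slice of input channels, so
+                // its HW output is a partial sum. Partials are accumulated
+                // pairwise with Sum stages, and ReLU/Clamp runs once on the
+                // final sum, which is why the HW stage below disables them
+                // when socTiles > 1.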
+                if (tiling->socTiles > 1) {
+                    auto hwConvPartialOutput = model->duplicateData(
+                        hwOutputTile,
+                        channelTilePostfix + "@partial");
+
+                    if (channelTile->socInd == 0) {
+                        prevPartialSum = hwConvPartialOutput;
+                    } else {
+                        auto sumPartialOutput = hwOutputTile;
+                        if (channelTile->socInd < tiling->socTiles - 1 || withReLU || withClamp) {
+                            sumPartialOutput = model->duplicateData(
+                                hwOutputTile,
+                                channelTilePostfix + "@accum");
+                        }
+
+                        _stageBuilder->addSumStage(
+                            model,
+                            origStage->name() + tilePostfix + "@accum",
+                            origStage->origLayer(),
+                            prevPartialSum, hwConvPartialOutput,
+                            sumPartialOutput);
+
+                        if (channelTile->socInd == tiling->socTiles - 1 && withReLU) {
+                            _stageBuilder->addReLUStage(
+                                model,
+                                origStage->name() + tilePostfix + "@ReLU",
+                                origStage->origLayer(),
+                                negativeSlope,
+                                sumPartialOutput,
+                                hwOutputTile);
+                        }
+
+                        if (channelTile->socInd == tiling->socTiles - 1 && withClamp) {
+                            _stageBuilder->addClampStage(
+                                    model,
+                                    origStage->name() + tilePostfix + "@Clamp",
+                                    origStage->origLayer(),
+                                    0.0,
+                                    clampMax,
+                                    sumPartialOutput,
+                                    hwOutputTile);
+                        }
+
+                        prevPartialSum = sumPartialOutput;
+                    }
+
+                    hwOutputTile = hwConvPartialOutput;
+                }
+
+                //
+                // Process output junk if needed
+                //
+
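+                // Tiling can force the HW to produce extra "junk" rows/columns
+                // around the real output: compute into an enlarged buffer and
+                // crop the real region out with a Shrink stage.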
+                if (planeTile->heightInfo.outputJunkBefore != 0 ||
+                    planeTile->heightInfo.outputJunkAfter != 0 ||
+                    planeTile->widthInfo.outputJunkBefore != 0 ||
+                    planeTile->widthInfo.outputJunkAfter != 0) {
+                    auto newDesc = hwOutputTile->desc();
+                    newDesc.setDim(Dim::W, planeTile->widthInfo.outputWithJunk);
+                    newDesc.setDim(Dim::H, planeTile->heightInfo.outputWithJunk);
+
+                    auto hwOutputTileWithJunk = model->duplicateData(
+                        hwOutputTile,
+                        "@with-junk",
+                        newDesc);
+
+                    DimValues innerOffset;
+                    innerOffset.set(Dim::W, planeTile->widthInfo.outputJunkBefore);
+                    innerOffset.set(Dim::H, planeTile->heightInfo.outputJunkBefore);
+
+                    _stageBuilder->addShrinkStage(
+                        model,
+                        origStage->name() + tilePostfix + "@remove-junk",
+                        origStage->origLayer(),
+                        hwOutputTileWithJunk,
+                        hwOutputTile,
+                        innerOffset);
+
+                    hwOutputTile = hwOutputTileWithJunk;
+                }
+
+                //
+                // Create tile weights
+                //
+
+                auto hwTileWeights = tileWeightsMap[channelTile->socInd];
+
+                if (hwTileWeights == nullptr) {
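+                    // Repack the weights into the HW layout: output channels in
+                    // groups of 8 (the leading 8 and trailing outC/8 dims),
+                    // taking only this tile's slice of input channels.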
+                    hwTileWeights = model->duplicateData(
+                        origWeights,
+                        "@HW" + channelTilePostfix,
+                        DataDesc({8, kernelSizeX * kernelSizeY, channelTile->extendedInputDimC, channelTile->extendedOutputDimC / 8}),
+                        std::make_shared<HwWeightsContent>(
+                            origWeights->content(),
+                            origWeights->desc(),
+                            channelTile->numInputChannels,
+                            channelTile->channelStartIndex));
+
+                    if (scaleFactor != 1.0f) {
+                        auto hwTileWeightsScaled = model->duplicateData(
+                            hwTileWeights,
+                            formatString("@SCALE=%f", scaleFactor),
+                            hwTileWeights->desc(),
+                            scaleContent(hwTileWeights->content(), scaleFactor));
+                        hwTileWeightsScaled->attrs().getOrSet<float>("scaleFactor", 1.0f) *= scaleFactor;
+
+                        hwTileWeights = hwTileWeightsScaled;
+                    }
+
+                    tileWeightsMap[channelTile->socInd] = hwTileWeights;
+                }
+
+                //
+                // Create tile biases
+                //
+
+                Data hwTileBiases;
+
+                if (channelTile->socInd > 0) {
+                    hwTileBiases = model->addFakeData();
+                } else {
+                    hwTileBiases = hwBiases;
+                }
+
+                //
+                // Create HW stage for tile
+                //
+
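+                // For fused conv+pool the padding is defined on the conv
+                // output, so reconstruct the pre-pool tile dims from the pool
+                // output before querying the padding info.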
+                auto hwOutputTileDims = hwOutputTile->desc().dims();
+                if (withPool) {
+                    hwOutputTileDims.set(Dim::W, hwOutputTileDims[Dim::W] * poolKernelStride - poolPadLeft - poolPadRight);
+                    hwOutputTileDims.set(Dim::H, hwOutputTileDims[Dim::H] * poolKernelStride - poolPadTop - poolPadBottom);
+                }
+
+                auto hwPad = getHwPaddingInfo(
+                    hwInputTile->desc().dims(), hwOutputTileDims,
+                    kernelSizeX, kernelSizeY,
+                    kernelStride, kernelStride);
+
+                auto hwStage = model->addNewStage<MyriadXHwStage>(
+                    origStage->name() + tilePostfix,
+                    StageType::MyriadXHwOp,
+                    origStage->origLayer(),
+                    {hwInputTile, hwTileWeights, hwTileBiases, hwScales},
+                    {hwOutputTile});
+
+                hwStage->attrs().set<HwOpType>("hwOpType", withPool ? HwOpType::CONV_POOL : HwOpType::CONV);
+
+                hwStage->attrs().set<int>("kernelSizeX", kernelSizeX);
+                hwStage->attrs().set<int>("kernelSizeY", kernelSizeY);
+                hwStage->attrs().set<int>("kernelStride", kernelStride);
+
+                if (withPool) {
+                    hwStage->attrs().set<int>("poolKernelSizeX", poolKernelSizeX);
+                    hwStage->attrs().set<int>("poolKernelSizeY", poolKernelSizeY);
+                }
+
+                hwStage->attrs().set<HwPaddingInfo>("pad", hwPad);
+
+                hwStage->attrs().set<HwConvTileInfo>("tiling", channelTile->finalTiles);
+
+                if (tiling->socTiles > 1) {
+                    hwStage->attrs().set<bool>("withReLU", false);
+                    hwStage->attrs().set<bool>("withClamp", false);
+                } else {
+                    hwStage->attrs().set<bool>("withReLU", withReLU);
+                    hwStage->attrs().set<uint32_t>("a0", a0);
+                    hwStage->attrs().set<uint32_t>("a1", a1);
+                    hwStage->attrs().set<float>("negativeSlope", negativeSlope);
+
+                    hwStage->attrs().set<bool>("withClamp", withClamp);
+                    hwStage->attrs().set<float>("clampMax", clampMax);
+                }
+
+                hwStage->attrs().set<float>("scaleFactor", scaleFactor);
+            }
+        }
+
+        //
+        // Split/concat input/output tiles
+        //
+
+        if (!hwInputTiles.empty()) {
+            _stageBuilder->addSplitStage(
+                model,
+                origStage->name() + "@split-input",
+                origStage->origLayer(),
+                hwInputTilesOffsets,
+                hwInput,
+                hwInputTiles);
+        }
+
+        if (!hwOutputTiles.empty()) {
+            _stageBuilder->addConcatStage(
+                model,
+                origStage->name() + "@concat-output",
+                origStage->origLayer(),
+                hwOutputTilesOffsets,
+                hwOutputTiles,
+                hwOutput);
+        }
+
+        //
+        // Remove original stage
+        //
+
+        model->removeStage(origStage);
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::hwConvTiling() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/hw_fc_tiling.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/hw_fc_tiling.cpp
new file mode 100644 (file)
index 0000000..cc68d6c
--- /dev/null
@@ -0,0 +1,428 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <cmath>
+
+#include <tuple>
+#include <vector>
+#include <limits>
+#include <algorithm>
+#include <list>
+#include <string>
+#include <memory>
+#include <utility>
+#include <set>
+#include <array>
+
+#include <precision_utils.h>
+
+#include <vpu/compile_env.hpp>
+#include <vpu/stub_stage.hpp>
+#include <vpu/hw/mx_stage.hpp>
+#include <vpu/hw/tiling.hpp>
+#include <vpu/hw/utility.hpp>
+
+namespace vpu {
+
+namespace {
+
+std::tuple<int, int, HwFullyConnectedTileInfo> splitFullyConnected(
+        int inN, int outN,
+        const std::vector<HwOpMode>& modes = {HwOpMode::MODE_1_256, HwOpMode::MODE_2_128, HwOpMode::MODE_4_64, HwOpMode::MODE_8_32, HwOpMode::MODE_16_16}) {
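+    // Each HwOpMode trades input width for output width: with 2^mode RAM
+    // blocks the HW sees up to blocks*256 inputs per descriptor but only
+    // 256/blocks outputs. Worked example for MODE_1_256, inN = 1024,
+    // outN = 1000: newInN = 1024, newOutN = 1008 (aligned to 8),
+    // workInN = workOutN = 256, so countIn = countOut = 4 sub-tiles.
+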
+    struct Solution final {
+        HwOpMode mode = HwOpMode::MODE_1_256;
+        int newInN = 0;
+        int newOutN = 0;
+        int workInN = 0;
+        int workOutN = 0;
+        int countIn = 0;
+        int countOut = 0;
+        int cost = std::numeric_limits<int>::max();
+    };
+
+    Solution bestSol;
+
+    for (auto mode : modes) {
+        auto ramBlocks = 1 << static_cast<int>(mode);
+        auto maxInN = ramBlocks * 256;
+        auto maxOutN = 256 / ramBlocks;
+        auto newInN = alignVal(inN, ramBlocks);
+        auto newOutN = alignVal(outN, 8);
+        auto workInN = std::min(newInN, maxInN);
+        auto workOutN = std::min(newOutN, maxOutN);
+
+        if (workInN < ramBlocks) {
+            continue;
+        }
+
+        auto countIn = static_cast<int>(std::ceil(static_cast<double>(newInN) / workInN));
+        auto countOut = static_cast<int>(std::ceil(static_cast<double>(newOutN) / workOutN));
+        auto cost = countIn * countOut * (workInN / ramBlocks + CNN_MODES_COST[static_cast<int>(mode)]);
+
+        Solution curSol;
+        curSol.mode = mode;
+        curSol.newInN = newInN;
+        curSol.newOutN = newOutN;
+        curSol.workInN = workInN;
+        curSol.workOutN = workOutN;
+        curSol.countIn = countIn;
+        curSol.countOut = countOut;
+        curSol.cost = cost;
+
+        if (curSol.cost < bestSol.cost ||
+            (curSol.cost == bestSol.cost && curSol.countIn < bestSol.countIn) ||
+            (curSol.cost == bestSol.cost && curSol.countIn == bestSol.countIn && curSol.countOut < bestSol.countOut)) {
+            bestSol = curSol;
+        }
+    }
+
+    if (bestSol.countOut == 0) {
+        return std::make_tuple(0, 0, HwFullyConnectedTileInfo());
+    }
+
+    HwFullyConnectedTileInfo tiles;
+    tiles.mode = bestSol.mode;
+    tiles.numOutTiles = bestSol.countOut;
+    tiles.numInSubTiles = bestSol.countIn;
+    tiles.workInN = bestSol.workInN;
+    tiles.workOutN = bestSol.workOutN;
+
+    return std::make_tuple(std::max(bestSol.newInN, bestSol.countIn * bestSol.workInN), std::max(bestSol.newOutN, bestSol.countOut * bestSol.workOutN), tiles);
+}
+
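+// Flattens the FC input into a 1x1xC vector (the "@asVec" data below): the HW
+// fully-connected op consumes its input as a plain channel vector, so any
+// spatial input is relaid out first.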
+class HwFcRelayoutStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<HwFcRelayoutStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[input] = input->desc().dimsOrder().createMovedDim(Dim::C, 2);
+        out[output] = output->desc().dimsOrder().createMovedDim(Dim::C, 2);
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto output = _outputEdges[0]->output();
+
+        DataMap<StridesRequirement> out;
+
+        out[output] = StridesRequirement().add(1, DimStride::Aligned);
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override {
+        return StageSHAVEsRequirements::TwoOrOne;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer&) const override {
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeOldBuffer(handle_from_this(), serializer);
+        output->serializeOldBuffer(handle_from_this(), serializer);
+    }
+};
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(hwFullyConnectedTiling);
+
+    const auto& env = CompileEnv::get();
+
+    for (const auto& origStage : model->getStages()) {
+        if (origStage->type() != StageType::StubFullyConnected) {
+            continue;
+        }
+
+        auto tryHW = origStage->attrs().getOrDefault<bool>("tryHW", false);
+        if (!tryHW) {
+            continue;
+        }
+
+        auto origInput = origStage->input(0);
+        auto origWeights = origStage->input(1);
+        auto origBiases = origStage->input(2);
+        auto origOutput = origStage->output(0);
+
+        auto withReLU = origStage->attrs().getOrDefault<bool>("withReLU", false);
+
+        auto scaleFactor = origStage->attrs().getOrDefault<float>("scaleFactor", 1.0f);
+
+        //
+        // Repack input data if needed
+        //
+
+        auto hwInput = origInput;
+        auto hwOutput = origOutput;
+
+        Stage relayoutStage;
+
+        if (hwInput->desc().numDims() > 2 &&
+            (hwInput->desc().dim(Dim::W) != 1 || hwInput->desc().dim(Dim::H) != 1)) {
+            auto newDesc = hwInput->desc();
+            newDesc.setDim(Dim::W, 1);
+            newDesc.setDim(Dim::H, 1);
+            newDesc.setDim(Dim::C, hwInput->desc().totalDimSize());
+
+            auto hwInputAsVec = model->duplicateData(
+                hwInput,
+                "@asVec",
+                newDesc);
+
+            relayoutStage = model->addNewStage<HwFcRelayoutStage>(
+                origStage->name() + "@input-relayout",
+                StageType::HwFcRelayout,
+                origStage->origLayer(),
+                {hwInput},
+                {hwInputAsVec});
+
+            hwInput = hwInputAsVec;
+        }
+
+        //
+        // Try to find "best" tiling (always using MODE_1_256)
+        //
+
+        int extendedInputDimC = 0, extendedOutputDimC = 0;
+        HwFullyConnectedTileInfo tiles;
+        std::tie(extendedInputDimC, extendedOutputDimC, tiles) =
+            splitFullyConnected(
+                hwInput->desc().dim(Dim::C),
+                hwOutput->desc().dim(Dim::C),
+                {HwOpMode::MODE_1_256});
+
+        //
+        // Use SW stage if tiling optimization failed
+        //
+
+        if (tiles.numOutTiles == 0 ||
+            calculateHwBufferSize(hwOutput->desc().dims()) > env.resources.cmxLimit) {
+            origStage->attrs().set<bool>("tryHW", false);
+
+            if (relayoutStage != nullptr) {
+                model->removeStage(relayoutStage);
+            }
+
+            auto swOutput = origOutput;
+            if (withReLU) {
+                swOutput = model->addNewData(
+                    origStage->name(),
+                    origOutput->desc());
+                swOutput->attrs().copyFrom(origOutput->attrs());
+
+                model->replaceStageOutput(origStage->outputEdge(0), swOutput);
+
+                _stageBuilder->addReLUStage(
+                    model,
+                    origStage->name() + "@ReLU",
+                    origStage->origLayer(),
+                    0.0,
+                    swOutput,
+                    origOutput);
+            }
+
+            continue;
+        }
+
+        model->disconnectStageDatas(origStage);
+
+        //
+        // Expand input/output if needed
+        //
+
+        auto origInputDimC = hwInput->desc().dim(Dim::C);
+        auto origOutputDimC = hwOutput->desc().dim(Dim::C);
+
+        if (extendedInputDimC > origInputDimC) {
+            auto newDesc = hwInput->desc();
+            newDesc.setDim(Dim::C, extendedInputDimC);
+
+            auto hwInputExtended = model->duplicateData(
+                hwInput,
+                "@extended",
+                newDesc);
+
+            _stageBuilder->addExpandStage(
+                model,
+                origStage->name() + "@expand-input",
+                origStage->origLayer(),
+                hwInput,
+                hwInputExtended);
+
+            hwInput = hwInputExtended;
+        }
+
+        auto hwWeights = origWeights->attrs().getOrDefault<Data>("hwWeights", nullptr);
+        if (hwWeights == nullptr) {
+            hwWeights = model->duplicateData(
+                origWeights,
+                "@HW",
+                DataDesc({8, 1, extendedInputDimC, extendedOutputDimC / 8}),
+                std::make_shared<HwWeightsContent>(
+                    origWeights->content(),
+                    DataDesc({1, 1, origInputDimC, origOutputDimC}),
+                    hwInput->desc().dim(Dim::C)));
+
+            if (scaleFactor != 1.0f) {
+                auto hwWeightsScaled = model->duplicateData(
+                    hwWeights,
+                    formatString("@SCALE=%f", scaleFactor),
+                    hwWeights->desc(),
+                    scaleContent(hwWeights->content(), scaleFactor));
+                hwWeightsScaled->attrs().getOrSet<float>("scaleFactor", 1.0f) *= scaleFactor;
+
+                hwWeights = hwWeightsScaled;
+            }
+
+            origWeights->attrs().set<Data>("hwWeights", hwWeights);
+        }
+
+        auto hwBiases = origWeights->attrs().getOrDefault<Data>("hwBiases", nullptr);
+        if (hwBiases == nullptr) {
+            if (origBiases->usage() == DataUsage::Fake) {
+                hwBiases = model->addFakeData();
+            } else {
+                auto origBiasesContent = origBiases->content();
+                IE_ASSERT(origBiasesContent != nullptr);
+
+                auto origBiasesPtr = origBiasesContent->get<fp16_t>();
+                IE_ASSERT(origBiasesPtr != nullptr);
+
+                auto hwBiasesBlob = ie::make_shared_blob<fp16_t>(ie::Precision::FP16, ie::Layout::C, {static_cast<size_t>(extendedOutputDimC)});
+                hwBiasesBlob->allocate();
+
+                auto hwBiasesBlobPtr = hwBiasesBlob->buffer().as<fp16_t*>();
+                IE_ASSERT(hwBiasesBlobPtr != nullptr);
+
+                std::fill_n(hwBiasesBlobPtr, extendedOutputDimC, ie::PrecisionUtils::f32tof16(0.0f));
+                std::copy_n(origBiasesPtr, origOutputDimC, hwBiasesBlobPtr);
+
+                hwBiases = model->duplicateData(
+                    origBiases,
+                    "@HW",
+                    DataDesc({extendedOutputDimC}),
+                    ieBlobContent(hwBiasesBlob));
+
+                if (scaleFactor != 1.0f) {
+                    auto hwBiasesScaled = model->duplicateData(
+                        hwBiases,
+                        formatString("@SCALE=%f", scaleFactor),
+                        hwBiases->desc(),
+                        scaleContent(hwBiases->content(), scaleFactor));
+                    hwBiasesScaled->attrs().getOrSet<float>("scaleFactor", 1.0f) *= scaleFactor;
+
+                    hwBiases = hwBiasesScaled;
+                }
+            }
+
+            origWeights->attrs().set<Data>("hwBiases", hwBiases);
+        }
+
+        Data hwScales = model->addFakeData();
+        if (scaleFactor != 1.0f) {
+            hwScales = origWeights->attrs().getOrDefault<Data>("hwScales", nullptr);
+
+            if (hwScales == nullptr) {
+                hwScales = model->addConstData(
+                    origStage->name() + "@scales",
+                    DataDesc({extendedOutputDimC}),
+                    replicateContent(1.0f / scaleFactor, extendedOutputDimC));
+
+                origWeights->attrs().set<Data>("hwScales", hwScales);
+            }
+        }
+
+        auto hwStage = model->addNewStage<MyriadXHwStage>(
+            origStage->name(),
+            StageType::MyriadXHwOp,
+            origStage->origLayer(),
+            {hwInput, hwWeights, hwBiases, hwScales},
+            {hwOutput});
+
+        hwStage->attrs().set<HwOpType>("hwOpType", HwOpType::FC);
+
+        hwStage->attrs().set("tiling", tiles);
+
+        hwStage->attrs().set<bool>("withReLU", withReLU);
+
+        hwStage->attrs().set<float>("scaleFactor", scaleFactor);
+
+        //
+        // Remove SW stage
+        //
+
+        model->removeStage(origStage);
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::hwFullyConnectedTiling() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/hw_padding.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/hw_padding.cpp
new file mode 100644 (file)
index 0000000..f08222c
--- /dev/null
@@ -0,0 +1,206 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <cmath>
+
+#include <tuple>
+#include <list>
+#include <string>
+#include <limits>
+#include <algorithm>
+#include <utility>
+#include <vector>
+#include <memory>
+#include <set>
+
+#include <vpu/compile_env.hpp>
+#include <vpu/stub_stage.hpp>
+#include <vpu/hw/mx_stage.hpp>
+#include <vpu/hw/tiling.hpp>
+#include <vpu/hw/utility.hpp>
+
+namespace vpu {
+
+namespace {
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+bool supportedPaddingPool(const Stage& stage) {
+    IE_ASSERT(StageType::StubMaxPool == stage->type() ||
+              StageType::StubAvgPool == stage->type());
+
+    auto input  = stage->input(0);
+    auto output = stage->output(0);
+
+    auto kernelSizeX  = stage->attrs().get<int>("kernelSizeX");
+    auto kernelSizeY  = stage->attrs().get<int>("kernelSizeY");
+    auto kernelStride = stage->attrs().get<int>("kernelStrideX");
+    auto padLeft      = stage->attrs().get<int>("padLeft");
+    auto padRight     = stage->attrs().get<int>("padRight");
+    auto padTop       = stage->attrs().get<int>("padTop");
+    auto padBottom    = stage->attrs().get<int>("padBottom");
+
+    //
+    // Even kernel size with odd input -> HW bug
+    // Need to add extra border
+    //
+
+    bool forcePaddingStage = false;
+
+    if (kernelSizeX % 2 == 0 && input->desc().dim(Dim::W) % 2 == 1) {
+        if (padRight == 0) {
+            stage->attrs().set<int>("padRight", 1);
+        }
+
+        forcePaddingStage = true;
+    }
+
+    if (kernelSizeY % 2 == 0 && input->desc().dim(Dim::H) % 2 == 1) {
+        if (padBottom == 0) {
+            stage->attrs().set<int>("padBottom", 1);
+        }
+
+        forcePaddingStage = true;
+    }
+
+    auto hwInitialPad = getHwPaddingInfo(
+        input->desc().dims(), output->desc().dims(),
+        kernelSizeX, kernelSizeY,
+        kernelStride, kernelStride);
+
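+    // The HW supports only "SAME"-style padding: each side is either 0 or
+    // kernelSize / 2, with right/bottom allowed to exceed left/top by at most
+    // one. Anything else falls back to an explicit Pad stage.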
+    bool originalUnsupportedPad = (
+        (padRight  != padLeft && padRight  != padLeft + 1)       ||
+        (padBottom != padTop  && padBottom != padTop + 1)        ||
+        (padLeft   != 0       && padLeft   != (kernelSizeX / 2)) ||
+        (padRight  != 0       && padRight  != (kernelSizeX / 2)) ||
+        (padTop    != 0       && padTop    != (kernelSizeY / 2)) ||
+        (padBottom != 0       && padBottom != (kernelSizeY / 2)));
+
+    bool hwUnsupportedPad = (
+        (hwInitialPad.right  != hwInitialPad.left && hwInitialPad.right  != hwInitialPad.left + 1) ||
+        (hwInitialPad.bottom != hwInitialPad.top  && hwInitialPad.bottom != hwInitialPad.top + 1)  ||
+        (hwInitialPad.left   != 0                 && hwInitialPad.left   != (kernelSizeX / 2))     ||
+        (hwInitialPad.right  != 0                 && hwInitialPad.right  != (kernelSizeX / 2))     ||
+        (hwInitialPad.top    != 0                 && hwInitialPad.top    != (kernelSizeY / 2))     ||
+        (hwInitialPad.bottom != 0                 && hwInitialPad.bottom != (kernelSizeY / 2)));
+
+    return !originalUnsupportedPad &&
+           !hwUnsupportedPad       &&
+           !forcePaddingStage;
+}
+
+bool supportedPaddingConv(const Stage& stage) {
+    IE_ASSERT(StageType::StubConv == stage->type());
+
+    auto kernelSizeX = stage->attrs().get<int>("kernelSizeX");
+    auto kernelSizeY = stage->attrs().get<int>("kernelSizeY");
+    auto padLeft     = stage->attrs().get<int>("padLeft");
+    auto padRight    = stage->attrs().get<int>("padRight");
+    auto padTop      = stage->attrs().get<int>("padTop");
+    auto padBottom   = stage->attrs().get<int>("padBottom");
+
+    return (padRight  == padLeft) &&
+           (padBottom == padTop)  &&
+           (padLeft   == 0 || padLeft == (kernelSizeX / 2)) &&
+           (padTop    == 0 || padTop  == (kernelSizeY / 2));
+}
+
+void insertPaddingStageBefore(const Model::Ptr& model, StageBuilder::Ptr& stageBuilder, const Stage& origStage) {
+    auto origInput       = origStage->input(0);
+    auto paddedInputDesc = origInput->desc();
+
+    auto padLeft   = origStage->attrs().get<int>("padLeft");
+    auto padRight  = origStage->attrs().get<int>("padRight");
+    auto padTop    = origStage->attrs().get<int>("padTop");
+    auto padBottom = origStage->attrs().get<int>("padBottom");
+
+    paddedInputDesc.setDim(Dim::W, origInput->desc().dim(Dim::W) + padLeft + padRight);
+    paddedInputDesc.setDim(Dim::H, origInput->desc().dim(Dim::H) + padTop + padBottom);
+
+    auto inputPadded = model->duplicateData(
+        origInput,
+        "@padded",
+        paddedInputDesc);
+
+    model->replaceStageInput(origStage->inputEdge(0), inputPadded);
+
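+    // Edge (replicate) padding for MaxPool, so the padded border can never win
+    // the max; Constant (zero) padding for Conv and AvgPool, matching what
+    // implicit zero padding would produce.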
+    auto paddingStage = stageBuilder->addPadStage(
+        model,
+        origStage->name() + "@padding",
+        origStage->origLayer(),
+        (origStage->type() == StageType::StubMaxPool) ? PadMode::Edge : PadMode::Constant,
+        0.0f,
+        DimValues({
+            {Dim::W, padLeft},
+            {Dim::H, padTop},
+        }),
+        DimValues({
+            {Dim::W, padRight},
+            {Dim::H, padBottom},
+        }),
+        origInput,
+        inputPadded);
+
+    origStage->attrs().set<int>("padLeft",   0);
+    origStage->attrs().set<int>("padRight",  0);
+    origStage->attrs().set<int>("padTop",    0);
+    origStage->attrs().set<int>("padBottom", 0);
+}
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(hwPadding);
+
+    auto isPooling = [](const Stage& stage) {
+        return StageType::StubMaxPool == stage->type() ||
+               StageType::StubAvgPool == stage->type();
+    };
+    auto isConv = [](const Stage& stage) {
+        return StageType::StubConv == stage->type();
+    };
+
+    auto stages = model->getStages();
+
+    for (const auto& origStage : stages) {
+        if (!isPooling(origStage) && !isConv(origStage)) {
+            continue;
+        }
+
+        auto tryHW = origStage->attrs().getOrDefault<bool>("tryHW", false);
+        if (!tryHW) {
+            continue;
+        }
+
+        bool addPaddingStage = false;
+
+        if (isConv(origStage)) {
+            addPaddingStage = !supportedPaddingConv(origStage);
+        } else if (isPooling(origStage)) {
+            addPaddingStage = !supportedPaddingPool(origStage);
+        } else {
+            IE_ASSERT(false);
+        }
+
+        if (addPaddingStage) {
+            insertPaddingStageBefore(model, _stageBuilder, origStage);
+        }
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::hwPadding() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/hw_pooling_tiling.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/hw_pooling_tiling.cpp
new file mode 100644 (file)
index 0000000..28d627f
--- /dev/null
@@ -0,0 +1,762 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <cmath>
+
+#include <tuple>
+#include <list>
+#include <string>
+#include <limits>
+#include <algorithm>
+#include <utility>
+#include <vector>
+#include <memory>
+#include <set>
+
+#include <vpu/compile_env.hpp>
+#include <vpu/stub_stage.hpp>
+#include <vpu/hw/mx_stage.hpp>
+#include <vpu/hw/tiling.hpp>
+#include <vpu/hw/utility.hpp>
+
+namespace vpu {
+
+namespace {
+
+const int CHANS_PER_DESCR = 16;
+
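+// Pooling always runs in MODE_16_16: each HW descriptor handles a fixed 16
+// channels, so outZ channels need ceil(outZ / 16) descriptors.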
+HwPoolTileInfo splitPooling(int outZ) {
+    HwPoolTileInfo tiles;
+    tiles.mode = HwOpMode::MODE_16_16;
+    tiles.numDescr = (outZ + CHANS_PER_DESCR - 1) / CHANS_PER_DESCR;
+    tiles.chansPerDescr = CHANS_PER_DESCR;
+    return tiles;
+}
+
+class Optimizer final {
+public:
+    Optimizer(const std::string& stageName,
+              const DimValues& inputDims, const DimValues& outputDims,
+              int kernelSizeX, int kernelSizeY,
+              int kernelStride,
+              int paddingX, int paddingY)
+        : _stageName(stageName),
+          _inputDims(inputDims), _outputDims(outputDims),
+          _kernelSizeX(kernelSizeX), _kernelSizeY(kernelSizeY),
+          _kernelStride(kernelStride),
+          _paddingX(paddingX), _paddingY(paddingY) {
+    }
+
+    bool optimize() {
+        initTileSizes();
+
+        if (!selectBestTile()) {
+            return false;
+        }
+
+        return createTiles();
+    }
+
+    const HwPoolTilingPtr& getTiling() const {
+        return _tiling;
+    }
+
+private:
+    void initTileSizes() {
+        int tempX = _inputDims[Dim::W] + 2 * _paddingX - _kernelSizeX;
+        int tempY = _inputDims[Dim::H] + 2 * _paddingY - _kernelSizeY;
+
+        int outWidthWithOutCeil = (tempX + _kernelStride) / _kernelStride;
+        int outHeightWithOutCeil = (tempY + _kernelStride) / _kernelStride;
+
+        int outWidthWithCeil = static_cast<int>(std::ceil(static_cast<double>(tempX) / _kernelStride + 1));
+        int outHeightWithCeil = static_cast<int>(std::ceil(static_cast<double>(tempY) / _kernelStride + 1));
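+        // (temp + stride) / stride == floor(temp / stride) + 1, i.e. the floor
+        // variant of out = (in + 2*pad - kernel) / stride + 1; the ceil variant
+        // covers frameworks that round the pooling output up.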
+
+        if ((_outputDims[Dim::W] != outWidthWithCeil) && (_outputDims[Dim::W] != outWidthWithOutCeil)) {
+            VPU_THROW_EXCEPTION
+                    << "Internal error: Output in " << _stageName << " has incorrect width dimension. Expected: "
+                    << outWidthWithCeil << " or " << outWidthWithOutCeil << " Actual: " << _outputDims[Dim::W];
+        }
+
+        if ((_outputDims[Dim::H] != outHeightWithCeil) && (_outputDims[Dim::H] != outHeightWithOutCeil)) {
+            VPU_THROW_EXCEPTION
+                    << "Internal error: Output in " << _stageName << " has incorrect height dimension. Expected: "
+                    << outHeightWithCeil << " or " << outHeightWithOutCeil << " Actual: " << _outputDims[Dim::H];
+        }
+
+        if ((_outputDims[Dim::W] == outWidthWithCeil) && (_outputDims[Dim::H] == outHeightWithCeil)) {
+            _useCeil = true;
+        } else {
+            IE_ASSERT((_outputDims[Dim::W] == outWidthWithOutCeil) && (_outputDims[Dim::H] == outHeightWithOutCeil));
+        }
+
+        _inputTileDims.set(Dim::W, _inputDims[Dim::W]);
+        _inputTileDims.set(Dim::H, _inputDims[Dim::H]);
+        _inputTileDims.set(Dim::C, _inputDims[Dim::C]);
+        _inputTileDims.set(Dim::N, _inputDims.get(Dim::N, 1));
+
+        _outputTileDims.set(Dim::W, _outputDims[Dim::W]);
+        _outputTileDims.set(Dim::H, _outputDims[Dim::H]);
+        _outputTileDims.set(Dim::C, _outputDims[Dim::C]);
+        _outputTileDims.set(Dim::N, _outputDims.get(Dim::N, 1));
+    }
+
+    bool selectBestTile() {
+        struct Solution final {
+            int numWidthTiles = 0;
+            int numHeightTiles = 0;
+            int numBatchTiles = 0;
+            int totalNumTiles = 0;
+            double cost = std::numeric_limits<double>::max();
+        };
+
+        const auto& env = CompileEnv::get();
+
+        // TODO: estimate these numbers
+        const int maxNumWidthTiles = 15;
+        const int maxNumHeightTiles = 15;
+        const int maxNumBatchTiles = _outputDims.get(Dim::N, 1);
+
+        Solution bestSol;
+
+        auto outputTileCopy = _outputTileDims;
+
+        for (int numBatchTiles = 1; numBatchTiles <= maxNumBatchTiles; numBatchTiles++) {
+            //
+            // Filter out misaligned SoN (split-over-batch) tiles.
+            //
+
+            if (outputTileCopy[Dim::N] % numBatchTiles != 0) {
+                continue;
+            }
+
+            auto tileDimN = outputTileCopy[Dim::N] / numBatchTiles;
+
+            for (int numWidthTiles = 1; numWidthTiles <= maxNumWidthTiles; numWidthTiles++) {
+                auto inputTileDimW = divUp(_inputDims[Dim::W], numWidthTiles);
+
+                //
+                // Filter out SoW tiles that are too small.
+                //
+
+                if (numWidthTiles > 1 && (inputTileDimW < 8 || inputTileDimW < _kernelSizeX)) {
+                    break;
+                }
+
+                for (int numHeightTiles = 1; numHeightTiles <= maxNumHeightTiles; numHeightTiles++) {
+                    auto inputTileDimH = divUp(_inputDims[Dim::H], numHeightTiles);
+
+                    //
+                    // Filter out SoH tiles that are too small.
+                    //
+
+                    if (numHeightTiles > 1 && inputTileDimH < _kernelSizeY) {
+                        break;
+                    }
+
+                    //
+                    // Try current tile size.
+                    //
+
+                    _inputTileDims.set(Dim::W, inputTileDimW);
+                    _inputTileDims.set(Dim::H, inputTileDimH);
+                    _inputTileDims.set(Dim::N, tileDimN);
+
+                    _outputTileDims = outputTileCopy;
+                    _outputTileDims.set(Dim::N, tileDimN);
+                    correctOutputPlaneSize();
+
+                    //
+                    // Check that tiling is valid.
+                    //
+
+                    auto heightTiles = calcHeightTiles();
+                    auto widthTiles = calcWidthTiles();
+
+                    if (heightTiles.empty()) {
+                        continue;
+                    }
+                    if (widthTiles.empty()) {
+                        break;
+                    }
+
+                    bool isOK = true;
+                    double solutionCost = 0.0;
+
+                    for (const auto& heightTile : heightTiles) {
+                        for (const auto& widthTile : widthTiles) {
+                            //
+                            // Check that the output tile fits the CMX limit.
+                            //
+
+                            DimValues fullOutputTileDims;
+                            fullOutputTileDims.set(Dim::W, widthTile.outputWithJunk);
+                            fullOutputTileDims.set(Dim::H, heightTile.outputWithJunk);
+                            fullOutputTileDims.set(Dim::C, _outputTileDims[Dim::C]);
+                            fullOutputTileDims.set(Dim::N, _outputTileDims[Dim::N]);
+
+                            // TODO: support HCW
+                            if (calculateHwBufferSize(fullOutputTileDims) > env.resources.cmxLimit) {
+                                isOK = false;
+                                break;
+                            }
+
+                            //
+                            // `linesPerChan` restrictions.
+                            //
+
+                            if (heightTile.inputWithJunk < _kernelSizeY) {
+                                isOK = false;
+                                break;
+                            }
+
+                            const uint32_t LOCAL_RAM_SIZE = 128 * 1024;
+                            const uint32_t CMX_DATA_BIT_WIDTH = 128;
+
+                            uint32_t sizeOfBlock = LOCAL_RAM_SIZE >> static_cast<uint32_t>(HwOpMode::MODE_16_16);
+                            uint32_t bytesPerPixel = 1 << (1 - static_cast<uint32_t>(HwDataMode::FP16));
+                            uint32_t pixelsPerCMXLine = CMX_DATA_BIT_WIDTH / (bytesPerPixel * 8u);
+                            uint32_t localLineStride = (widthTile.inputWithJunk + (pixelsPerCMXLine - 1)) / pixelsPerCMXLine;
+                            uint32_t chanPerBlock = 1;
+                            uint32_t availableBytesPerChan = sizeOfBlock / chanPerBlock;
+                            uint32_t bytesPerLine = localLineStride * pixelsPerCMXLine * bytesPerPixel;
+                            uint32_t linesPerChan = availableBytesPerChan / bytesPerLine;
+                            if (linesPerChan < _kernelSizeY) {
+                                isOK = false;
+                                break;
+                            }
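+                            // Rough worked example, assuming MODE_16_16 == 4 and
+                            // FP16 == 0 in their respective enums:
+                            //   sizeOfBlock      = (128 * 1024) >> 4 = 8192
+                            //   bytesPerPixel    = 1 << (1 - 0)      = 2
+                            //   pixelsPerCMXLine = 128 / 16          = 8
+                            // and, for an input tile width of 56:
+                            //   localLineStride  = (56 + 7) / 8      = 7
+                            //   bytesPerLine     = 7 * 8 * 2         = 112
+                            //   linesPerChan     = 8192 / 112        = 73
+                            // which covers typical pooling kernel heights.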
+
+                            //
+                            // Replicate padding in the case of a large input plane - #-16783.
+                            //
+
+                            DimValues fullInputTileDims;
+                            fullInputTileDims.set(Dim::W, widthTile.inputWithJunk);
+                            fullInputTileDims.set(Dim::H, heightTile.inputWithJunk);
+
+                            auto pad = getHwPaddingInfo(
+                                fullInputTileDims, fullOutputTileDims,
+                                _kernelSizeX, _kernelSizeY,
+                                _kernelStride, _kernelStride);
+
+                            if (pad.enable) {
+                                int memPerPlane = alignVal(
+                                            fullInputTileDims[Dim::W], 8) * sizeof(fp16_t)
+                                          * ((fullInputTileDims[Dim::H] - 1) + (_kernelSizeY - 1));
+                                int memLimit = pad.bottom > 0 ? 0x800 : 0x1000;
+                                if (memPerPlane > memLimit) {
+                                    isOK = false;
+                                    break;
+                                }
+                            }
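+                            // E.g. a 64x16 input tile with a 2x2 kernel needs
+                            // alignVal(64, 8) * 2 * ((16 - 1) + (2 - 1)) = 2048
+                            // bytes per plane - right at the 0x800 limit used
+                            // when bottom padding is present.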
+
+                            //
+                            // Calc tile cost.
+                            //
+
+                            auto noOfBlocks = 1 << static_cast<int>(HwOpMode::MODE_16_16);
+                            solutionCost += 1.0
+                                  * ((_inputTileDims[Dim::C] * _inputTileDims[Dim::N]) / noOfBlocks) * _kernelSizeX * _kernelSizeY
+                                  * numBatchTiles;
+
+                            // Alignment for output
+                            if ((widthTile.outputStartIndex * sizeof(fp16_t)) % 16 != 0) {
+                                solutionCost += 1.0
+                                      * widthTile.outputWithJunk
+                                      * heightTile.outputWithJunk
+                                      * _outputTileDims[Dim::C]
+                                      * _outputTileDims[Dim::N];
+                            }
+
+                            // Alignment for input
+                            if ((widthTile.inputStartIndex * sizeof(fp16_t)) % 16 != 0) {
+                                solutionCost += 1.0
+                                      * widthTile.inputWithJunk
+                                      * heightTile.inputWithJunk
+                                      * _inputTileDims[Dim::C]
+                                      * _inputTileDims[Dim::N];
+                            }
+                        }
+
+                        if (!isOK) {
+                            break;
+                        }
+                    }
+
+                    if (!isOK) {
+                        continue;
+                    }
+
+                    //
+                    // Compare with current best solution.
+                    //
+
+                    Solution curSol;
+                    curSol.numWidthTiles = numWidthTiles;
+                    curSol.numHeightTiles = numHeightTiles;
+                    curSol.numBatchTiles = numBatchTiles;
+                    curSol.totalNumTiles = numWidthTiles * numHeightTiles * numBatchTiles;
+                    curSol.cost = solutionCost;
+
+                    if (curSol.cost < bestSol.cost || (isDoubleEqual(curSol.cost, bestSol.cost) && curSol.totalNumTiles < bestSol.totalNumTiles)) {
+                        bestSol = curSol;
+                    }
+                }
+            }
+        }
+
+        if (bestSol.totalNumTiles == 0) {
+            return false;
+        }
+
+        int inputTileDimW = divUp(_inputDims[Dim::W], bestSol.numWidthTiles);
+        int inputTileDimH = divUp(_inputDims[Dim::H], bestSol.numHeightTiles);
+        auto tileDimN = outputTileCopy[Dim::N] / bestSol.numBatchTiles;
+
+        _inputTileDims.set(Dim::W, inputTileDimW);
+        _inputTileDims.set(Dim::H, inputTileDimH);
+        _inputTileDims.set(Dim::N, tileDimN);
+
+        _outputTileDims = outputTileCopy;
+        _outputTileDims.set(Dim::N, tileDimN);
+        correctOutputPlaneSize();
+
+        return true;
+    }
+
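+    // Materialize the chosen tiling as a three-level hierarchy:
+    // HwPoolTiling -> plane tiles (one per SoH/SoW position) ->
+    // channel tiles (one per SoC slice).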
+    bool createTiles() {
+        auto heightTiles = calcHeightTiles();
+        IE_ASSERT(!heightTiles.empty());
+
+        auto widthTiles = calcWidthTiles();
+        IE_ASSERT(!widthTiles.empty());
+
+        _tiling = std::make_shared<HwPoolTiling>();
+        _tiling->sohTiles = heightTiles.size();
+        _tiling->sowTiles = widthTiles.size();
+        _tiling->socTiles = divUp(_inputDims.get(Dim::N, 1), _inputTileDims[Dim::N]);
+
+        for (int sohInd = 0; sohInd < _tiling->sohTiles; ++sohInd) {
+            const auto& heightTileInfo = heightTiles[sohInd];
+
+            for (int sowInd = 0; sowInd < _tiling->sowTiles; ++sowInd) {
+                const auto& widthTileInfo = widthTiles[sowInd];
+
+                auto planeTile = std::make_shared<HwPoolPlaneTile>();
+                planeTile->parent = _tiling;
+
+                planeTile->sohInd = sohInd;
+                planeTile->sowInd = sowInd;
+
+                planeTile->heightInfo = heightTileInfo;
+                planeTile->widthInfo = widthTileInfo;
+
+                for (int socInd = 0; socInd < _tiling->socTiles; ++socInd) {
+                    auto channelTile = std::make_shared<HwPoolChannelTile>();
+                    channelTile->parent = planeTile;
+
+                    channelTile->socInd = socInd;
+
+                    channelTile->finalTiles = splitPooling(_inputTileDims[Dim::C] * _inputTileDims[Dim::N]);
+
+                    if (channelTile->finalTiles.numDescr == 0) {
+                        return false;
+                    }
+
+                    channelTile->channelStartIndex = socInd * _inputTileDims[Dim::N];
+                    channelTile->numInputChannels = _inputTileDims[Dim::N];
+
+                    planeTile->channelTiles.emplace_back(channelTile);
+                }
+
+                _tiling->planeTiles.emplace_back(planeTile);
+            }
+        }
+
+        return true;
+    }
+
+private:
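+    // Clamp the output tile planes to what the input tile can actually
+    // produce. calcOutputSize presumably implements the usual pooling
+    // arithmetic: out = (in + padBefore + padAfter - kernel) / stride + 1,
+    // rounded up when _useCeil is set.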
+    void correctOutputPlaneSize() {
+        int maxOutputWidth = calcOutputSize(_inputTileDims[Dim::W], _kernelSizeX, _kernelStride, _paddingX, _paddingX, _useCeil);
+        _outputTileDims.set(Dim::W, std::min(_outputTileDims[Dim::W], maxOutputWidth));
+
+        int maxOutputHeight = calcOutputSize(_inputTileDims[Dim::H], _kernelSizeY, _kernelStride, _paddingY, _paddingY, _useCeil);
+        _outputTileDims.set(Dim::H, std::min(_outputTileDims[Dim::H], maxOutputHeight));
+    }
+
+    std::vector<HwPlaneTileInfo> calcHeightTiles() {
+        std::vector<HwPlaneTileInfo> heightTiles;
+
+        if (_outputTileDims[Dim::H] == _outputDims[Dim::H]) {
+            HwPlaneTileInfo info;
+            info.inputWithJunk = _inputDims[Dim::H];
+            info.outputWithJunk = _outputDims[Dim::H];
+            info.outputJunkBefore = 0;
+            info.outputJunkAfter = 0;
+            info.inputStartIndex = 0;
+            info.inputEndIndex = _inputDims[Dim::H];
+            info.outputStartIndex = 0;
+            info.outputEndIndex = _outputDims[Dim::H];
+
+            heightTiles.emplace_back(info);
+        } else {
+            heightTiles = splitIntoPlaneTiles(
+                _inputDims[Dim::H],
+                _outputDims[Dim::H],
+                _kernelSizeY,
+                _kernelStride,
+                _paddingY, _paddingY,
+                _outputTileDims[Dim::H],
+                false,
+                _useCeil);
+        }
+
+        return heightTiles;
+    }
+
+    std::vector<HwPlaneTileInfo> calcWidthTiles() {
+        std::vector<HwPlaneTileInfo> widthTiles;
+
+        if (_outputTileDims[Dim::W] == _outputDims[Dim::W]) {
+            HwPlaneTileInfo info;
+            info.inputWithJunk = _inputDims[Dim::W];
+            info.outputWithJunk = _outputDims[Dim::W];
+            info.outputJunkBefore = 0;
+            info.outputJunkAfter = 0;
+            info.inputStartIndex = 0;
+            info.inputEndIndex = _inputDims[Dim::W];
+            info.outputStartIndex = 0;
+            info.outputEndIndex = _outputDims[Dim::W];
+
+            widthTiles.emplace_back(info);
+        } else {
+            widthTiles = splitIntoPlaneTiles(
+                _inputDims[Dim::W],
+                _outputDims[Dim::W],
+                _kernelSizeX,
+                _kernelStride,
+                _paddingX, _paddingX,
+                _outputTileDims[Dim::W],
+                true,
+                _useCeil);
+        }
+
+        return widthTiles;
+    }
+
+private:
+    std::string _stageName;
+
+    DimValues _inputDims;
+    DimValues _outputDims;
+
+    int _kernelSizeX = 0;
+    int _kernelSizeY = 0;
+    int _kernelStride = 0;
+    int _paddingX = 0;
+    int _paddingY = 0;
+
+    DimValues _inputTileDims;
+    DimValues _outputTileDims;
+
+    HwPoolTilingPtr _tiling;
+
+    bool _useCeil = false;
+};
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(hwPoolTiling);
+
+    for (const auto& origStage : model->getStages()) {
+        if (origStage->type() != StageType::StubMaxPool &&
+            origStage->type() != StageType::StubAvgPool) {
+            continue;
+        }
+
+        auto tryHW = origStage->attrs().getOrDefault<bool>("tryHW", false);
+        if (!tryHW) {
+            continue;
+        }
+
+        auto origInput = origStage->input(0);
+        auto origOutput = origStage->output(0);
+
+        auto kernelSizeX = origStage->attrs().get<int>("kernelSizeX");
+        auto kernelSizeY = origStage->attrs().get<int>("kernelSizeY");
+        auto kernelStride = origStage->attrs().get<int>("kernelStrideX");
+        auto padLeft = origStage->attrs().get<int>("padLeft");
+        auto padRight = origStage->attrs().get<int>("padRight");
+        auto padTop = origStage->attrs().get<int>("padTop");
+        auto padBottom = origStage->attrs().get<int>("padBottom");
+
+        auto withReLU = origStage->attrs().getOrDefault<bool>("withReLU", false);
+
+        auto hwInput  = origInput;
+        auto hwOutput = origOutput;
+
+        //
+        // Try to find "best" tiling
+        //
+
+        Optimizer opt(origStage->name(),
+                      hwInput->desc().dims(), hwOutput->desc().dims(),
+                      kernelSizeX, kernelSizeY,
+                      kernelStride,
+                      padLeft, padTop);
+
+        if (!opt.optimize()) {
+            origStage->attrs().set<bool>("tryHW", false);
+
+            auto swOutput = origOutput;
+            if (withReLU) {
+                swOutput = model->addNewData(
+                    origStage->name(),
+                    origOutput->desc());
+                swOutput->attrs().copyFrom(origOutput->attrs());
+
+                model->replaceStageOutput(origStage->outputEdge(0), swOutput);
+
+                _stageBuilder->addReLUStage(
+                    model,
+                    origStage->name() + "@ReLU",
+                    origStage->origLayer(),
+                    0.0,
+                    swOutput,
+                    origOutput);
+            }
+
+            continue;
+        }
+
+        //
+        // Create HW tiles
+        //
+
+        model->disconnectStageDatas(origStage);
+
+        const auto& tiling = opt.getTiling();
+
+        DataVector hwInputTiles;
+        std::vector<DimValues> hwInputTilesOffsets;
+
+        DataVector hwOutputTiles;
+        std::vector<DimValues> hwOutputTilesOffsets;
+
+        for (const auto& planeTile : tiling->planeTiles) {
+            for (const auto& channelTile : planeTile->channelTiles) {
+                auto tilePostfix = getPlaneTilePostfix(planeTile) + getChannelTilePostfix(channelTile);
+
+                //
+                // Create input tile
+                //
+
+                Data hwInputTile;
+
+                if (tiling->sohTiles == 1 && tiling->sowTiles == 1 && tiling->socTiles == 1) {
+                    hwInputTile = hwInput;
+                } else {
+                    auto newDesc = hwInput->desc();
+                    newDesc.setDim(Dim::W, planeTile->widthInfo.inputWithJunk);
+                    newDesc.setDim(Dim::H, planeTile->heightInfo.inputWithJunk);
+                    newDesc.setDim(Dim::N, channelTile->numInputChannels);
+
+                    hwInputTile = model->duplicateData(
+                        hwInput,
+                        tilePostfix,
+                        newDesc);
+
+                    hwInputTiles.emplace_back(hwInputTile);
+                    hwInputTilesOffsets.emplace_back(
+                        DimValues({
+                            {Dim::W, planeTile->widthInfo.inputStartIndex},
+                            {Dim::H, planeTile->heightInfo.inputStartIndex},
+                            {Dim::N, channelTile->channelStartIndex}
+                        }));
+                }
+
+                //
+                // Add alignment to the input tile if needed
+                //
+
+                if ((planeTile->widthInfo.inputStartIndex * sizeof(fp16_t)) % 16 != 0) {
+                    auto hwInputTileAligned = model->duplicateData(
+                        hwInputTile,
+                        "@aligned");
+
+                    _stageBuilder->addCopyStage(
+                        model,
+                        origStage->name() + tilePostfix + "@align-input-ptr",
+                        origStage->origLayer(),
+                        hwInputTile,
+                        hwInputTileAligned);
+
+                    hwInputTile = hwInputTileAligned;
+                }
+
+                //
+                // Create output tile
+                //
+
+                Data hwOutputTile;
+
+                if (tiling->sohTiles == 1 && tiling->sowTiles == 1 && tiling->socTiles == 1) {
+                    hwOutputTile = hwOutput;
+                } else {
+                    auto newDesc = hwOutput->desc();
+                    newDesc.setDim(Dim::W, planeTile->widthInfo.outputEndIndex - planeTile->widthInfo.outputStartIndex);
+                    newDesc.setDim(Dim::H, planeTile->heightInfo.outputEndIndex - planeTile->heightInfo.outputStartIndex);
+                    newDesc.setDim(Dim::N, channelTile->numInputChannels);
+
+                    hwOutputTile = model->duplicateData(
+                        hwOutput,
+                        tilePostfix,
+                        newDesc);
+
+                    hwOutputTiles.emplace_back(hwOutputTile);
+                    hwOutputTilesOffsets.emplace_back(
+                        DimValues({
+                            {Dim::W, planeTile->widthInfo.outputStartIndex},
+                            {Dim::H, planeTile->heightInfo.outputStartIndex},
+                            {Dim::N, channelTile->channelStartIndex}
+                        }));
+                }
+
+                //
+                // Add alignment to the output tile if needed
+                //
+
+                if ((planeTile->widthInfo.outputStartIndex * sizeof(fp16_t)) % 16 != 0) {
+                    auto hwOutputTileAligned = model->duplicateData(
+                        hwOutputTile,
+                        "@aligned");
+
+                    _stageBuilder->addCopyStage(
+                        model,
+                        origStage->name() + tilePostfix + "@align-output-ptr",
+                        origStage->origLayer(),
+                        hwOutputTileAligned,
+                        hwOutputTile);
+
+                    hwOutputTile = hwOutputTileAligned;
+                }
+
+                //
+                // Process output junk if needed
+                //
+
+                if (planeTile->heightInfo.outputJunkBefore != 0 ||
+                    planeTile->heightInfo.outputJunkAfter != 0 ||
+                    planeTile->widthInfo.outputJunkBefore != 0 ||
+                    planeTile->widthInfo.outputJunkAfter != 0) {
+                    auto newDesc = hwOutputTile->desc();
+                    newDesc.setDim(Dim::W, planeTile->widthInfo.outputWithJunk);
+                    newDesc.setDim(Dim::H, planeTile->heightInfo.outputWithJunk);
+
+                    auto hwOutputTileWithJunk = model->duplicateData(
+                        hwOutputTile,
+                        "@with-junk",
+                        newDesc);
+
+                    DimValues innerOffset;
+                    innerOffset.set(Dim::W, planeTile->widthInfo.outputJunkBefore);
+                    innerOffset.set(Dim::H, planeTile->heightInfo.outputJunkBefore);
+
+                    _stageBuilder->addShrinkStage(
+                        model,
+                        origStage->name() + tilePostfix + "@remove-junk",
+                        origStage->origLayer(),
+                        hwOutputTileWithJunk,
+                        hwOutputTile,
+                        innerOffset);
+
+                    hwOutputTile = hwOutputTileWithJunk;
+                }
+
+                //
+                // Create HW stage for tile
+                //
+
+                auto hwPad = getHwPaddingInfo(
+                    hwInputTile->desc().dims(), hwOutputTile->desc().dims(),
+                    kernelSizeX, kernelSizeY,
+                    kernelStride, kernelStride);
+
+                auto hwTileWeights = model->addFakeData();
+                auto hwTileBiases = model->addFakeData();
+                auto hwTileScales = model->addFakeData();
+
+                auto hwStage = model->addNewStage<MyriadXHwStage>(
+                    origStage->name() + tilePostfix,
+                    StageType::MyriadXHwOp,
+                    origStage->origLayer(),
+                    {hwInputTile, hwTileWeights, hwTileBiases, hwTileScales},
+                    {hwOutputTile});
+
+                hwStage->attrs().set<HwOpType>("hwOpType", HwOpType::POOL);
+                hwStage->attrs().set<HwPoolType>("poolType", origStage->type() == StageType::StubMaxPool ? HwPoolType::MAX : HwPoolType::AVERAGE);
+
+                hwStage->attrs().set<int>("kernelSizeX", kernelSizeX);
+                hwStage->attrs().set<int>("kernelSizeY", kernelSizeY);
+                hwStage->attrs().set<int>("kernelStride", kernelStride);
+
+                hwStage->attrs().set("pad", hwPad);
+
+                hwStage->attrs().set<HwPoolTileInfo>("tiling", channelTile->finalTiles);
+
+                hwStage->attrs().set<bool>("withReLU", withReLU);
+            }
+        }
+
+        //
+        // Split/concat input/output tiles
+        //
+
+        if (!hwInputTiles.empty()) {
+            _stageBuilder->addSplitStage(
+                model,
+                origStage->name() + "@split-input",
+                origStage->origLayer(),
+                hwInputTilesOffsets,
+                hwInput,
+                hwInputTiles);
+        }
+
+        if (!hwOutputTiles.empty()) {
+            _stageBuilder->addConcatStage(
+                model,
+                origStage->name() + "@concat-output",
+                origStage->origLayer(),
+                hwOutputTilesOffsets,
+                hwOutputTiles,
+                hwOutput);
+        }
+
+        //
+        // Remove SW stage
+        //
+
+        model->removeStage(origStage);
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::hwPoolTiling() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/inject_sw.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/inject_sw.cpp
new file mode 100644 (file)
index 0000000..58525dd
--- /dev/null
@@ -0,0 +1,161 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <vector>
+#include <algorithm>
+#include <unordered_set>
+#include <unordered_map>
+#include <limits>
+#include <string>
+#include <memory>
+#include <set>
+#include <list>
+
+#include <vpu/allocator.hpp>
+#include <vpu/compile_env.hpp>
+
+namespace vpu {
+
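+// The stage attribute map requires stored types to be printable; this
+// (intentionally empty) printTo overload lets std::list<Stage>::iterator
+// values be kept in it (see "swStagesPos" below).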
+void printTo(std::ostream&, const std::list<Stage>::iterator&) {
+}
+
+namespace {
+
+class PassImpl final : public Pass {
+public:
+    void run(const Model::Ptr& model) override;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(injectSw);
+
+    const int nMaxStagesForInjectSw = 30000;
+    const auto& env = CompileEnv::get();
+
+    //
+    // Collect HW and SW candidates
+    //
+
+    if (!env.config.injectSwOps.hasValue() &&
+        model->numStages() > nMaxStagesForInjectSw) {
+        env.log->warning(
+            "Pass [injectSw] SKIPPED : number of stages (%d) is larger than threshold %d",
+            model->numStages(), nMaxStagesForInjectSw);
+        return;
+    }
+
+    StageVector hwStages;
+    std::list<Stage> swStages;
+
+    hwStages.reserve(model->numStages());
+    for (const auto& stage : model->getStages()) {
+        if (stage->category() == StageCategory::HW) {
+            hwStages.emplace_back(stage);
+        } else if (stage->category() == StageCategory::DMA || stage->category() == StageCategory::SHAVE) {
+            if (stage->getSHAVEsRequirements() != StageSHAVEsRequirements::NeedMax) {
+                auto it = swStages.emplace(swStages.end(), stage);
+                stage->attrs().set<std::list<Stage>::iterator>("swStagesPos", it);
+            }
+        }
+    }
+
+    //
+    // Try to merge HW and SW stages
+    //
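+    // Injection pairs each HW stage with at most one SW (DMA/SHAVE) stage
+    // so that the SW work is scheduled alongside the HW operation; a pair
+    // is kept only if the allocator still succeeds afterwards.
+    //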
+
+    for (const auto& hwStage : hwStages) {
+        for (const auto& swStage : swStages) {
+            model->buildStageOrder();
+
+            auto hwInd = hwStage->index();
+            IE_ASSERT(hwInd >= 0);
+
+            auto swInd = swStage->index();
+            IE_ASSERT(swInd >= 0);
+            IE_ASSERT(swInd != hwInd);
+
+            //
+            // Check execution order
+            //
+
+            bool isOK = true;
+
+            if (swInd > hwInd) {
+                //
+                // All producers of the SW stage must execute before the HW stage
+                //
+
+                for (const auto& swProducer : swStage->prevStages()) {
+                    auto swProducerInd = swProducer->index();
+                    IE_ASSERT(swProducerInd >= 0);
+                    IE_ASSERT(swProducerInd < swInd);
+
+                    if (swProducerInd >= hwInd) {
+                        isOK = false;
+                        break;
+                    }
+                }
+            } else {
+                //
+                // All producers of the HW stage must execute before the SW stage
+                //
+
+                for (const auto& hwProducer : hwStage->prevStages()) {
+                    auto hwProducerInd = hwProducer->index();
+                    IE_ASSERT(hwProducerInd >= 0);
+                    IE_ASSERT(hwProducerInd < hwInd);
+
+                    if (hwProducerInd >= swInd) {
+                        isOK = false;
+                        break;
+                    }
+                }
+            }
+
+            if (!isOK) {
+                continue;
+            }
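+            // Example: with stage order [a(1), hw(3), sw(5)], the pair is
+            // usable only if every producer of sw has index < 3; a producer
+            // at index 4 would have to run between hw and the injected sw,
+            // which the merged schedule cannot honor.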
+
+            //
+            // Try to inject and check allocation; if it fails, revert the injection
+            //
+
+            auto edge = model->injectStage()
+                    .parentHW(hwStage)
+                    .childSW(swStage)
+                    .done();
+
+            auto allocRes = runAllocator(model, true);
+            if (allocRes.status == AllocationStatus::OK) {
+                // TODO: try to merge more than one SW stage?
+                break;
+            } else {
+                model->revertInjection(edge);
+            }
+        }
+
+        //
+        // Remove injected stages from candidates list
+        //
+
+        for (const auto& injectedStageEdge : hwStage->injectedStageEdges()) {
+            auto it = injectedStageEdge->child()->attrs().get<std::list<Stage>::iterator>("swStagesPos");
+
+            IE_ASSERT(it != swStages.end());
+            swStages.erase(it);
+
+            injectedStageEdge->child()->attrs().erase("swStagesPos");
+        }
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::injectSw() {
+    return std::make_shared<PassImpl>();
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/merge_hw_stages.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/merge_hw_stages.cpp
new file mode 100644 (file)
index 0000000..1ca839d
--- /dev/null
@@ -0,0 +1,212 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <set>
+#include <memory>
+#include <unordered_set>
+
+#include <vpu/compile_env.hpp>
+#include <vpu/hw/utility.hpp>
+#include <vpu/sw/utility.hpp>
+
+namespace vpu {
+
+namespace {
+
+Stage getNextPoolStage(const Stage& stage, const Data& output) {
+    auto input = stage->input(0);
+
+    if (input->desc().dim(Dim::W) % 2 != 0 ||
+        input->desc().dim(Dim::H) % 2 != 0 ||
+        output->desc().dim(Dim::W) % 2 != 0 ||
+        output->desc().dim(Dim::H) % 2 != 0) {
+        return nullptr;
+    }
+
+    auto nextPool = getNextStage(stage, {StageType::StubMaxPool});
+    if (nextPool == nullptr) {
+        return nullptr;
+    }
+
+    if (!nextPool->attrs().getOrDefault<bool>("tryHW", false)) {
+        return nullptr;
+    }
+
+    auto poolOutput = nextPool->output(0);
+
+    if (poolOutput->desc().dim(Dim::W) % 2 != 0 ||
+        poolOutput->desc().dim(Dim::H) % 2 != 0) {
+        return nullptr;
+    }
+
+    auto convKernelSizeX = stage->attrs().get<int>("kernelSizeX");
+    auto convKernelSizeY = stage->attrs().get<int>("kernelSizeY");
+    auto convKernelStride = stage->attrs().get<int>("kernelStrideX");
+    auto convPadLeft = stage->attrs().get<int>("padLeft");
+    auto convPadRight = stage->attrs().get<int>("padRight");
+    auto convPadTop = stage->attrs().get<int>("padTop");
+    auto convPadBottom = stage->attrs().get<int>("padBottom");
+
+    auto poolKernelSizeX = nextPool->attrs().get<int>("kernelSizeX");
+    auto poolKernelSizeY = nextPool->attrs().get<int>("kernelSizeY");
+    auto poolKernelStride = nextPool->attrs().get<int>("kernelStrideX");
+    auto poolPadLeft = nextPool->attrs().get<int>("padLeft");
+    auto poolPadRight = nextPool->attrs().get<int>("padRight");
+    auto poolPadTop = nextPool->attrs().get<int>("padTop");
+    auto poolPadBottom = nextPool->attrs().get<int>("padBottom");
+
+    // TODO: check which convolution and pooling parameters are supported
+
+    if (convKernelSizeX == 3 && convKernelSizeY == 3 &&
+        convKernelStride == 1 &&
+        convPadLeft == 1 && convPadRight == 1 && convPadTop == 1 && convPadBottom == 1 &&
+        poolKernelSizeX == 2 && poolKernelSizeY == 2 &&
+        poolKernelStride == 2 &&
+        poolPadLeft == 0 && poolPadRight == 0 && poolPadTop == 0 && poolPadBottom == 0) {
+        return nextPool;
+    }
+
+    return nullptr;
+}
+
+class PassImpl final : public Pass {
+public:
+    void run(const Model::Ptr& model) override;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(mergeHwStages);
+
+    const auto& env = CompileEnv::get();
+
+    for (const auto& stage : model->getStages()) {
+        if (stage == nullptr) {
+            continue;
+        }
+
+        auto tryHW = stage->attrs().getOrDefault<bool>("tryHW", false);
+        if (!tryHW) {
+            continue;
+        }
+
+        IE_ASSERT(stage->numOutputs() == 1);
+
+        auto output = stage->output(0);
+
+        if (stage->type() == StageType::StubConv) {
+            stage->attrs().set("origConvOutput", output->desc());
+        }
+
+        //
+        // Try to merge the next ReLU or Clamp layer
+        //
+
+        std::unordered_set<StageType, EnumClassHash> supportedPostOps{StageType::Relu};
+        if (stage->type() == StageType::StubConv) {
+            supportedPostOps.insert(StageType::LeakyRelu);
+            supportedPostOps.insert(StageType::Clamp);
+        }
+
+        if (auto nextPostOpStage = getNextStage(stage, supportedPostOps)) {
+            bool isOK = true;
+
+            if (nextPostOpStage->type() == StageType::Clamp) {
+                auto min_value = nextPostOpStage->attrs().get<float>("min_value");
+
+                if (!isFloatEqual(min_value, 0.0f)) {
+                    isOK = false;
+                }
+            }
+
+            if (nextPostOpStage->type() == StageType::LeakyRelu) {
+                auto negativeSlope = nextPostOpStage->attrs().get<float>("negativeSlope");
+
+                if (!isFloatEqual(negativeSlope, 0.0f)) {
+                    // Only integer reverse scales (1 / negativeSlope) are supported
+
+                    auto reverseScale = 1.0f / negativeSlope;
+
+                    if (!isFloatEqual(std::fabs(std::ceil(reverseScale) - reverseScale), 0.0f)) {
+                        isOK = false;
+                    }
+                }
+            }
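+            // E.g. negativeSlope = 0.25 gives reverseScale = 4.0 and can be
+            // merged, while negativeSlope = 0.3 gives reverseScale ~ 3.33
+            // and is rejected.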
+
+            if (isOK) {
+                output = nextPostOpStage->output(0);
+
+                model->disconnectStageDatas(nextPostOpStage);
+
+                model->replaceStageOutput(stage->outputEdge(0), output);
+
+                if (nextPostOpStage->type() == StageType::Clamp) {
+                    auto max_value = nextPostOpStage->attrs().get<float>("max_value");
+                    stage->attrs().set<bool>("withClamp", true);
+                    stage->attrs().set<float>("clampMax", max_value);
+                } else {
+                    auto negativeSlope = nextPostOpStage->attrs().get<float>("negativeSlope");
+
+                    stage->attrs().set<bool>("withReLU", true);
+                    stage->attrs().set<float>("negativeSlope", negativeSlope);
+                    if (nextPostOpStage->type() == StageType::Relu) {
+                        stage->attrs().set<uint32_t>("a0", 0);
+                        stage->attrs().set<uint32_t>("a1", 1);
+                        stage->attrs().set<float>("reluScale", 1.0f);
+                    } else {
+                        stage->attrs().set<uint32_t>("a0", 1);
+                        // reverseScale is expected to be an integer here (checked
+                        // above for non-zero slopes), so make the conversion explicit
+                        stage->attrs().set<uint32_t>("a1", static_cast<uint32_t>(1.0f / negativeSlope));
+                        stage->attrs().set<float>("reluScale", negativeSlope);
+                    }
+                }
+
+                model->removeStage(nextPostOpStage);
+            }
+        }
+
+        //
+        // Try to merge next Pooling layer
+        //
+
+        if (env.config.mergeHwPoolToConv) {
+            if (stage->type() == StageType::StubConv) {
+                if (auto nextPoolStage = getNextPoolStage(stage, output)) {
+                    output = nextPoolStage->output(0);
+
+                    model->disconnectStageDatas(nextPoolStage);
+
+                    model->replaceStageOutput(stage->outputEdge(0), output);
+
+                    auto poolKernelSizeX = nextPoolStage->attrs().get<int>("kernelSizeX");
+                    auto poolKernelSizeY = nextPoolStage->attrs().get<int>("kernelSizeY");
+                    auto poolKernelStride = nextPoolStage->attrs().get<int>("kernelStrideX");
+                    auto poolPadLeft = nextPoolStage->attrs().get<int>("padLeft");
+                    auto poolPadRight = nextPoolStage->attrs().get<int>("padRight");
+                    auto poolPadTop = nextPoolStage->attrs().get<int>("padTop");
+                    auto poolPadBottom = nextPoolStage->attrs().get<int>("padBottom");
+
+                    stage->attrs().set<bool>("withPool", true);
+                    stage->attrs().set<int>("poolKernelSizeX", poolKernelSizeX);
+                    stage->attrs().set<int>("poolKernelSizeY", poolKernelSizeY);
+                    stage->attrs().set<int>("poolKernelStride", poolKernelStride);
+                    stage->attrs().set<int>("poolPadLeft", poolPadLeft);
+                    stage->attrs().set<int>("poolPadRight", poolPadRight);
+                    stage->attrs().set<int>("poolPadTop", poolPadTop);
+                    stage->attrs().set<int>("poolPadBottom", poolPadBottom);
+
+                    model->removeStage(nextPoolStage);
+                }
+            }
+        }
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::mergeHwStages() {
+    return std::make_shared<PassImpl>();
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/merge_relu_and_bias.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/merge_relu_and_bias.cpp
new file mode 100644 (file)
index 0000000..47cdf08
--- /dev/null
@@ -0,0 +1,69 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <set>
+#include <memory>
+
+#include <vpu/sw/utility.hpp>
+
+namespace vpu {
+
+namespace {
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(mergeReLUAndBias);
+
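+    // Fold each Bias stage into a directly following (Leaky)ReLU: the fused
+    // ReLU stage applies the biases itself, so a separate Bias kernel is
+    // not needed.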
+    for (const auto& biasStage : model->getStages()) {
+        if (biasStage == nullptr) {
+            continue;
+        }
+
+        if (biasStage->type() != StageType::Bias) {
+            continue;
+        }
+
+        if (auto reluStage = getNextStage(biasStage, {StageType::Relu, StageType::LeakyRelu})) {
+            auto biasInput = biasStage->input(0);
+            auto biases = biasStage->input(1);
+
+            auto reluOutput = reluStage->output(0);
+
+            auto reluStageName = reluStage->name();
+            auto reluOrigLayer = reluStage->origLayer();
+            auto negativeSlope = reluStage->attrs().get<float>("negativeSlope");
+
+            model->removeStage(biasStage);
+            model->removeStage(reluStage);
+
+            _stageBuilder->addReLUStage(
+                model,
+                reluStageName,
+                reluOrigLayer,
+                negativeSlope,
+                biasInput,
+                reluOutput,
+                biases);
+        }
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::mergeReLUAndBias() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/process_special_stages.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/process_special_stages.cpp
new file mode 100644 (file)
index 0000000..547d906
--- /dev/null
@@ -0,0 +1,674 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <memory>
+#include <string>
+#include <vector>
+#include <set>
+#include <unordered_set>
+
+#include <vpu/allocator.hpp>
+#include <vpu/utils/extra.hpp>
+
+namespace vpu {
+
+namespace {
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    void processConcat(const Model::Ptr& model, const Stage& stage);
+    void processSplit(const Model::Ptr& model, const Stage& stage);
+    void processReshape(const Model::Ptr& model, const Stage& stage);
+    void processExpand(const Model::Ptr& model, const Stage& stage);
+    void processShrink(const Model::Ptr& model, const Stage& stage);
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(processSpecialStages);
+
+    //
+    // Merge multiple Expand stages applied to the same input.
+    //
+
+    for (const auto& curExpandStage : model->getStages()) {
+        if (curExpandStage == nullptr) {
+            continue;
+        }
+
+        if (curExpandStage->type() != StageType::Expand) {
+            continue;
+        }
+
+        auto input = curExpandStage->input(0);
+        auto output = curExpandStage->output(0);
+
+        bool hasDuplicates = false;
+        for (const auto& inputConsumer : input->consumers()) {
+            if (inputConsumer->type() != StageType::Expand) {
+                continue;
+            }
+
+            if (inputConsumer == curExpandStage) {
+                continue;
+            }
+
+            hasDuplicates = true;
+
+            auto otherOutput = inputConsumer->output(0);
+
+            if (otherOutput->desc().dims() != output->desc().dims()) {
+                hasDuplicates = false;
+                break;
+            }
+
+            if (otherOutput->usage() != DataUsage::Intermediate) {
+                hasDuplicates = false;
+                break;
+            }
+        }
+
+        if (!hasDuplicates) {
+            continue;
+        }
+
+        for (const auto& inputConsumer : input->consumers()) {
+            if (inputConsumer->type() != StageType::Expand) {
+                continue;
+            }
+
+            if (inputConsumer == curExpandStage) {
+                continue;
+            }
+
+            auto otherOutput = inputConsumer->output(0);
+
+            for (const auto& outputConsumerEdge : otherOutput->consumerEdges()) {
+                model->replaceStageInput(outputConsumerEdge, output);
+            }
+
+            model->removeStage(inputConsumer);
+        }
+    }
+
+    //
+    // Add Copy stages when needed.
+    //
+
+    for (const auto& stage : model->getStages()) {
+        if (stage == nullptr) {
+            continue;
+        }
+
+        if (stage->type() == StageType::Concat) {
+            processConcat(model, stage);
+        } else if (stage->type() == StageType::Split) {
+            processSplit(model, stage);
+        } else if (stage->type() == StageType::Reshape) {
+            processReshape(model, stage);
+        } else if (stage->type() == StageType::Expand) {
+            processExpand(model, stage);
+        } else if (stage->type() == StageType::Shrink) {
+            processShrink(model, stage);
+        }
+    }
+}
+
+void PassImpl::processConcat(const Model::Ptr& model, const Stage& stage) {
+    auto output = stage->output(0);
+
+    const auto& offsets = stage->attrs().get<std::vector<DimValues>>("offsets");
+    IE_ASSERT(offsets.size() == stage->numInputs());
+
+    for (const auto& inEdge : stage->inputEdges()) {
+        IE_ASSERT(inEdge->portInd() >= 0);
+        IE_ASSERT(inEdge->portInd() < offsets.size());
+
+        auto input = inEdge->input();
+        const auto& offsetFromOutput = offsets[inEdge->portInd()];
+
+        IE_ASSERT(input->desc().dimsOrder() == output->desc().dimsOrder());
+        IE_ASSERT(offsetFromOutput.size() <= output->desc().numDims());
+        for (const auto& p : offsetFromOutput) {
+            IE_ASSERT(output->desc().dimsOrder().hasDim(p.first));
+            IE_ASSERT(p.second + input->desc().dim(p.first) <= output->desc().dim(p.first));
+        }
+
+        //
+        // Check if we need to insert Copy stage
+        //
+
+        bool needCopy = false;
+        bool optionalCopy = false;
+        if (input->usage() != DataUsage::Intermediate) {
+            needCopy = true;
+            optionalCopy = false;
+        } else if (input->parentDataEdge() != nullptr) {
+            needCopy = true;
+            optionalCopy = false;
+        } else {
+            //
+            // Check input StridesRequirement.
+            //
+
+            IE_ASSERT(input->checkStrides(input->requiredStrides()));
+            if (!checkStrides(input->desc(), output->strides(), input->requiredStrides())) {
+                needCopy = true;
+                optionalCopy = false;
+            }
+
+            //
+            // Check consumers StridesRequirement.
+            //
+
+            if (!needCopy) {
+                for (const auto& consumer : input->consumers()) {
+                    auto consumerInfo = consumer->getDataStridesRequirements();
+
+                    auto strideIt = consumerInfo.find(input);
+                    if (strideIt != consumerInfo.end()) {
+                        auto consumerStrideReqs = strideIt->second;
+                        IE_ASSERT(input->checkStrides(consumerStrideReqs));
+
+                        if (!checkStrides(input->desc(), output->strides(), consumerStrideReqs)) {
+                            needCopy = true;
+                            optionalCopy = false;
+                        }
+                    }
+                }
+            }
+
+            //
+            // Check producer StridesRequirement.
+            //
+
+            if (!needCopy) {
+                if (auto producer = input->producer()) {
+                    auto producerInfo = producer->getDataStridesRequirements();
+
+                    auto strideIt = producerInfo.find(input);
+                    if (strideIt != producerInfo.end()) {
+                        auto producerStrideReqs = strideIt->second;
+                        IE_ASSERT(input->checkStrides(producerStrideReqs));
+
+                        if (!checkStrides(input->desc(), output->strides(), producerStrideReqs)) {
+                            needCopy = true;
+                            optionalCopy = false;
+                        }
+                    }
+
+                    if (!needCopy) {
+                        //
+                        // To reduce the size of HW output (still can be optimized).
+                        //
+
+                        if (producer->category() == StageCategory::HW) {
+                            needCopy = true;
+                            optionalCopy = true;
+                        }
+                    }
+                }
+            }
+        }
+
+        //
+        // Insert Copy if needed
+        //
+
+        if (needCopy) {
+            Data inputCopy;
+            if (input->usage() == DataUsage::Const) {
+                inputCopy = model->addNewData(
+                    input->name() + "@copy",
+                    input->desc());
+            } else {
+                inputCopy = model->duplicateData(
+                    input,
+                    "@copy");
+                inputCopy->resetRequiredStrides();
+            }
+
+            auto copyStage = _stageBuilder->addCopyStage(
+                model,
+                formatString("%s@input=%d@copy-for-concat", stage->name(), inEdge->portInd()),
+                stage->origLayer(),
+                input,
+                inputCopy);
+            copyStage->attrs().set<bool>("optional", optionalCopy);
+
+            model->replaceStageInput(inEdge, inputCopy);
+
+            input = inputCopy;
+        }
+
+        //
+        // Add Data<->Data edge
+        //
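+        // With ChildWritesToParent over an ROI the input tile becomes a
+        // view into the concat output's buffer at the given offset, so the
+        // concatenation itself is free at runtime.
+        //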
+
+        model->connectDatas()
+                .parent(output)
+                .child(input)
+                .mode(SharedDataMode::ROI)
+                .order(SharedDataOrder::ChildWritesToParent)
+                .offset(offsetFromOutput)
+                .done();
+    }
+}
+
+void PassImpl::processSplit(const Model::Ptr& model, const Stage& stage) {
+    auto input = stage->input(0);
+
+    const auto& offsets = stage->attrs().get<std::vector<DimValues>>("offsets");
+    IE_ASSERT(offsets.size() == stage->numOutputs());
+
+    for (const auto& outEdge : stage->outputEdges()) {
+        IE_ASSERT(outEdge->portInd() >= 0);
+        IE_ASSERT(outEdge->portInd() < offsets.size());
+
+        auto output = outEdge->output();
+        const auto& offsetFromInput = offsets[outEdge->portInd()];
+
+        IE_ASSERT(input->desc().dimsOrder() == output->desc().dimsOrder());
+        IE_ASSERT(offsetFromInput.size() <= input->desc().numDims());
+        for (const auto& p : offsetFromInput) {
+            IE_ASSERT(input->desc().dimsOrder().hasDim(p.first));
+            IE_ASSERT(p.second + output->desc().dim(p.first) <= input->desc().dim(p.first));
+        }
+
+        //
+        // Check if we need to insert Copy stage
+        //
+
+        bool needCopy = false;
+        if (output->usage() != DataUsage::Intermediate) {
+            needCopy = true;
+        } else if (output->parentDataEdge() != nullptr) {
+            needCopy = true;
+        } else {
+            //
+            // Check output StridesRequirement.
+            //
+
+            IE_ASSERT(output->checkStrides(output->requiredStrides()));
+            if (!checkStrides(output->desc(), input->strides(), output->requiredStrides())) {
+                needCopy = true;
+            }
+
+            //
+            // Check consumers StridesRequirement.
+            //
+
+            if (!needCopy) {
+                for (const auto& consumer : output->consumers()) {
+                    auto consumerInfo = consumer->getDataStridesRequirements();
+
+                    auto strideIt = consumerInfo.find(output);
+                    if (strideIt != consumerInfo.end()) {
+                        auto consumerStrideReqs = strideIt->second;
+                        IE_ASSERT(output->checkStrides(consumerStrideReqs));
+
+                        if (!checkStrides(output->desc(), input->strides(), consumerStrideReqs)) {
+                            needCopy = true;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+
+        //
+        // Insert Copy if needed
+        //
+
+        if (needCopy) {
+            auto outputCopy = model->duplicateData(
+                output,
+                "@copy");
+            outputCopy->resetRequiredStrides();
+
+            auto outPortInd = outEdge->portInd();
+
+            model->replaceStageOutput(outEdge, outputCopy);
+
+            _stageBuilder->addCopyStage(
+                model,
+                formatString("%s@output=%d@copy-for-split", stage->name(), outPortInd),
+                stage->origLayer(),
+                outputCopy,
+                output);
+
+            output = outputCopy;
+        }
+
+        //
+        // Add Data<->Data edge
+        //
+
+        model->connectDatas()
+                .parent(input)
+                .child(output)
+                .mode(SharedDataMode::ROI)
+                .order(SharedDataOrder::ParentWritesToChild)
+                .offset(offsetFromInput)
+                .done();
+    }
+}
+
+void PassImpl::processReshape(const Model::Ptr& model, const Stage& stage) {
+    auto input = stage->input(0);
+    auto output = stage->output(0);
+
+    IE_ASSERT(input->desc().dimsOrder() == DimsOrder::fromNumDims(input->desc().numDims()));
+    IE_ASSERT(input->checkStrides(StridesRequirement::compact()));
+
+    IE_ASSERT(output->desc().dimsOrder() == DimsOrder::fromNumDims(output->desc().numDims()));
+    IE_ASSERT(output->checkStrides(StridesRequirement::compact()));
+
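+    // Reshape is normally zero-copy: one tensor becomes a view of the other
+    // via a SharedDataMode::Reshape edge (hence the compactness asserts
+    // above). A real Copy is only inserted when neither side can act as the
+    // child view, e.g. both are network inputs/outputs or both already have
+    // a parent data edge.
+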
+    //
+    // Check if we need to insert Copy stage
+    //
+
+    bool needCopy = false;
+    if (input->usage() != DataUsage::Intermediate &&
+        output->usage() != DataUsage::Intermediate) {
+        needCopy = true;
+    } else if (input->parentDataEdge() != nullptr &&
+               output->parentDataEdge() != nullptr) {
+        needCopy = true;
+    }
+
+    //
+    // Insert Copy if needed
+    //
+
+    if (needCopy) {
+        Data inputCopy;
+        if (input->usage() == DataUsage::Const) {
+            inputCopy = model->addNewData(
+                input->name() + "@copy",
+                input->desc());
+        } else {
+            inputCopy = model->duplicateData(
+                input,
+                "@copy");
+        }
+        inputCopy->updateRequiredStrides(StridesRequirement::compact());
+
+        _stageBuilder->addCopyStage(
+            model,
+            formatString("%s@copy-for-reshape", stage->name()),
+            stage->origLayer(),
+            input,
+            inputCopy);
+
+        model->replaceStageInput(stage->inputEdge(0), inputCopy);
+
+        input = inputCopy;
+    }
+
+    //
+    // Add Data<->Data edge
+    //
+
+    if (input->usage() == DataUsage::Intermediate &&
+        input->parentDataEdge() == nullptr) {
+        model->connectDatas()
+                .parent(output)
+                .child(input)
+                .mode(SharedDataMode::Reshape)
+                .order(SharedDataOrder::ChildWritesToParent)
+                .done();
+    } else {
+        IE_ASSERT(output->usage() == DataUsage::Intermediate);
+        IE_ASSERT(output->parentDataEdge() == nullptr);
+
+        model->connectDatas()
+                .parent(input)
+                .child(output)
+                .mode(SharedDataMode::Reshape)
+                .order(SharedDataOrder::ParentWritesToChild)
+                .done();
+    }
+}
+
+void PassImpl::processExpand(const Model::Ptr& model, const Stage& stage) {
+    auto input = stage->input(0);
+    auto output = stage->output(0);
+
+    const auto& offset = stage->attrs().get<DimValues>("offset");
+
+    IE_ASSERT(input->desc().dimsOrder() == output->desc().dimsOrder());
+
+    IE_ASSERT(offset.size() <= output->desc().numDims());
+    for (const auto& p : offset) {
+        IE_ASSERT(output->desc().dimsOrder().hasDim(p.first));
+        IE_ASSERT(p.second + input->desc().dim(p.first) <= output->desc().dim(p.first));
+    }
+
+    //
+    // Check if we need to insert Copy stage
+    //
+
+    bool needCopy = false;
+    bool optionalCopy = false;
+    if (input->usage() != DataUsage::Intermediate) {
+        needCopy = true;
+        optionalCopy = false;
+    } else if (input->parentDataEdge() != nullptr) {
+        needCopy = true;
+        optionalCopy = false;
+    } else {
+        //
+        // Check input StridesRequirement.
+        //
+
+        IE_ASSERT(input->checkStrides(input->requiredStrides()));
+        if (!checkStrides(input->desc(), output->strides(), input->requiredStrides())) {
+            needCopy = true;
+            optionalCopy = false;
+        }
+
+        //
+        // Check consumers StridesRequirement.
+        //
+
+        if (!needCopy) {
+            for (const auto& consumer : input->consumers()) {
+                auto consumerInfo = consumer->getDataStridesRequirements();
+
+                auto strideIt = consumerInfo.find(input);
+                if (strideIt != consumerInfo.end()) {
+                    auto consumerStrideReqs = strideIt->second;
+                    IE_ASSERT(input->checkStrides(consumerStrideReqs));
+
+                    if (!checkStrides(input->desc(), output->strides(), consumerStrideReqs)) {
+                        needCopy = true;
+                        optionalCopy = false;
+                    }
+                }
+            }
+        }
+
+        //
+        // Check producer StridesRequirement.
+        //
+
+        if (!needCopy) {
+            if (auto producer = input->producer()) {
+                auto producerInfo = producer->getDataStridesRequirements();
+
+                auto strideIt = producerInfo.find(input);
+                if (strideIt != producerInfo.end()) {
+                    auto producerStrideReqs = strideIt->second;
+                    IE_ASSERT(input->checkStrides(producerStrideReqs));
+
+                    if (!checkStrides(input->desc(), output->strides(), producerStrideReqs)) {
+                        needCopy = true;
+                        optionalCopy = false;
+                    }
+                }
+
+                if (!needCopy) {
+                    //
+                    // To reduce the size of HW output (still can be optimized).
+                    //
+
+                    if (producer->category() == StageCategory::HW) {
+                        needCopy = true;
+                        optionalCopy = true;
+                    }
+                }
+            }
+        }
+    }
+
+    //
+    // Insert Copy if needed
+    //
+
+    if (needCopy) {
+        Data inputCopy;
+        if (input->usage() == DataUsage::Const) {
+            inputCopy = model->addNewData(
+                input->name() + "@copy",
+                input->desc());
+        } else {
+            inputCopy = model->duplicateData(
+                input,
+                "@copy");
+            inputCopy->resetRequiredStrides();
+        }
+
+        auto copyStage = _stageBuilder->addCopyStage(
+            model,
+            formatString("%s@copy-for-expand", stage->name()),
+            stage->origLayer(),
+            input,
+            inputCopy);
+        copyStage->attrs().set<bool>("optional", optionalCopy);
+
+        model->replaceStageInput(stage->inputEdge(0), inputCopy);
+
+        input = inputCopy;
+    }
+
+    //
+    // Add Data<->Data edge
+    //
+
+    model->connectDatas()
+            .parent(output)
+            .child(input)
+            .mode(SharedDataMode::ROI)
+            .order(SharedDataOrder::ChildWritesToParent)
+            .offset(offset)
+            .done();
+}
+
+void PassImpl::processShrink(const Model::Ptr& model, const Stage& stage) {
+    auto input = stage->input(0);
+    auto output = stage->output(0);
+
+    const auto& offset = stage->attrs().get<DimValues>("offset");
+
+    IE_ASSERT(input->desc().dimsOrder() == output->desc().dimsOrder());
+
+    IE_ASSERT(offset.size() <= input->desc().numDims());
+    for (const auto& p : offset) {
+        IE_ASSERT(input->desc().dimsOrder().hasDim(p.first));
+        IE_ASSERT(p.second + output->desc().dim(p.first) <= input->desc().dim(p.first));
+    }
+
+    //
+    // Check if we need to insert Copy for output
+    //
+
+    bool needCopy = false;
+    if (output->usage() != DataUsage::Intermediate) {
+        needCopy = true;
+    } else if (output->parentDataEdge() != nullptr) {
+        needCopy = true;
+    } else {
+        //
+        // Check output StridesRequirement.
+        //
+
+        IE_ASSERT(output->checkStrides(output->requiredStrides()));
+        if (!checkStrides(output->desc(), input->strides(), output->requiredStrides())) {
+            needCopy = true;
+        }
+
+        //
+        // Check consumers StridesRequirement.
+        //
+
+        if (!needCopy) {
+            for (const auto& consumer : output->consumers()) {
+                auto consumerInfo = consumer->getDataStridesRequirements();
+
+                auto strideIt = consumerInfo.find(output);
+                if (strideIt != consumerInfo.end()) {
+                    auto consumerStrideReqs = strideIt->second;
+                    IE_ASSERT(output->checkStrides(consumerStrideReqs));
+
+                    if (!checkStrides(output->desc(), input->strides(), consumerStrideReqs)) {
+                        needCopy = true;
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    //
+    // Insert output Copy if needed
+    //
+
+    if (needCopy) {
+        auto outputCopy = model->duplicateData(
+            output,
+            "@copy");
+        outputCopy->resetRequiredStrides();
+
+        model->replaceStageOutput(stage->outputEdge(0), outputCopy);
+
+        _stageBuilder->addCopyStage(
+            model,
+            formatString("%s@copy-output-for-shrink", stage->name()),
+            stage->origLayer(),
+            outputCopy,
+            output);
+
+        output = outputCopy;
+    }
+
+    //
+    // Add Data<->Data edge
+    //
+
+    model->connectDatas()
+            .parent(input)
+            .child(output)
+            .mode(SharedDataMode::ROI)
+            .order(SharedDataOrder::ParentWritesToChild)
+            .offset(offset)
+            .done();
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::processSpecialStages() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/propagate_data_scale.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/propagate_data_scale.cpp
new file mode 100644 (file)
index 0000000..96ffcd4
--- /dev/null
@@ -0,0 +1,352 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <cmath>
+
+#include <sstream>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include <tuple>
+#include <string>
+#include <algorithm>
+#include <limits>
+#include <memory>
+#include <list>
+#include <set>
+
+#include <vpu/compile_env.hpp>
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+Stage StageBuilder::addScalingStage(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& origLayer,
+        float scale,
+        const Data& input,
+        const Data& output) {
+    if (input->desc().type() != DataType::FP16) {
+        VPU_THROW_EXCEPTION << "Can't adjust non-FP16 data " << input->name();
+    }
+
+    if (output->desc().type() != DataType::FP16) {
+        VPU_THROW_EXCEPTION << "Can't adjust non-FP16 data " << output->name();
+    }
+
+    if (input->desc().dimsOrder() != output->desc().dimsOrder()) {
+        VPU_THROW_EXCEPTION << input->name() << " and " << output->name() << " have different layout";
+    }
+
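+    // The scaling itself is performed by a Power stage computing output = scale * input.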
+    return addPowerStage(model, input->name() + "@SCALE=" + std::to_string(scale), origLayer, scale, 1.0f, 0.0f, input, output);
+}
+
+namespace {
+
+DataMap<float> getInputScales(const Stage& stage) {
+    DataMap<float> out;
+    for (const auto& input : stage->inputs()) {
+        auto scaleFactor = input->attrs().getOrDefault<float>("scaleFactor", 1.0f);
+        out[input] = scaleFactor;
+    }
+    return out;
+}
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(propagateDataScale);
+
+    const auto& env = CompileEnv::get();
+
+    //
+    // Get required SCALE factors per data
+    //
+
+    if (env.netConfig.hasManualDataScale()) {
+        bool hasScaleReqs = false;
+
+        for (const auto& info : env.netConfig.dataScale()) {
+            auto name = info.first;
+            auto scale = info.second;
+
+            Data curData;
+            for (const auto& data : model->datas()) {
+                if (data->name() == name) {
+                    curData = data;
+                    break;
+                }
+            }
+            if (curData == nullptr) {
+                VPU_THROW_EXCEPTION << "There is no such data: " << name << " in network " << model->name();
+            }
+
+            if (curData->usage() != DataUsage::Input &&
+                curData->usage() != DataUsage::Intermediate) {
+                VPU_THROW_EXCEPTION
+                        << "Scale can be used only for input and intermediate data, got "
+                        << curData->name() << " as " << curData->usage();
+            }
+
+            if (curData->attrs().has("requestedScale")) {
+                VPU_THROW_EXCEPTION << "Data " << name << " is mentioned twice";
+            }
+            if (!isFloatEqual(scale, 1.0f)) {
+                hasScaleReqs = true;
+                curData->attrs().set<float>("requestedScale", scale);
+            }
+        }
+
+        if (!hasScaleReqs) {
+            return;
+        }
+    } else {
+        auto inputShift = model->attrs().getOrDefault<int>("inputShift", 1);
+        if (inputShift == 1) {
+            return;
+        }
+
+        float inputScale = static_cast<float>(1 << inputShift);
+        for (const auto& data : model->datas()) {
+            if (data->usage() != DataUsage::Input)
+                continue;
+
+            data->attrs().set<float>("requestedScale", inputScale);
+        }
+    }
+
+    //
+    // Traverse stages
+    //
+    // - add SCALE for input if required
+    // - propagate SCALE to next stages if possible
+    // - undo SCALE if the stage doesn't support it
+    //
+
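+    // Each Data object tracks its current scale in the "scaleFactor" attribute (1.0 by
+    // default); "requestedScale" marks data that still has to be brought to that scale.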
+    for (const auto& stage : model->getStages()) {
+        //
+        // Check if we need to add SCALE to input
+        //
+
+        bool scalesWereInitialized = false;
+
+        for (const auto& inEdge : stage->inputEdges()) {
+            auto input = inEdge->input();
+
+            if (!input->attrs().has("requestedScale")) {
+                // No SCALE requested.
+                continue;
+            }
+
+            auto requestedScale = input->attrs().get<float>("requestedScale");
+            auto curScaleFactor = input->attrs().getOrDefault<float>("scaleFactor", 1.0f);
+
+            if (isFloatEqual(curScaleFactor, requestedScale)) {
+                // We already added SCALE to this data.
+                continue;
+            }
+
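+            // Multiplier that brings the data from its current scale to the requested one.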
+            auto scaleMultiplier = requestedScale / curScaleFactor;
+
+            //
+            // Some stages can SCALE input internally, check them first
+            //
+
+            if (input->numConsumers() == 1) {
+                auto inputScales = getInputScales(stage);
+                inputScales[input] = scaleMultiplier;
+
+                auto checkScales = stage->propagateScaleFactors(inputScales, ScalePropagationStep::Check);
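+                // If the input is absent from the "Check" result, the stage can absorb the
+                // scale internally; re-run the propagation with ScaleInput to obtain the
+                // final scales for all of its inputs and outputs.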
+                if (checkScales.count(input) == 0) {
+                    auto finalScales = stage->propagateScaleFactors(inputScales, ScalePropagationStep::ScaleInput);
+
+                    for (const auto& constInEdge : stage->inputEdges()) {
+                        auto constInput = constInEdge->input();
+
+                        auto it = finalScales.find(constInput);
+                        if (it == finalScales.end())
+                            continue;
+
+                        auto curScaleFactor = constInput->attrs().getOrDefault<float>("scaleFactor", 1.0f);
+                        auto newScaleFactor = it->second;
+
+                        if (isFloatEqual(curScaleFactor, newScaleFactor))
+                            continue;
+
+                        IE_ASSERT(constInput->usage() == DataUsage::Const);
+
+                        auto scaleCoeff = newScaleFactor / curScaleFactor;
+
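+                        // Const inputs cache their scaled copies in "scaledChildren", so a
+                        // blob scaled to the same factor is reused rather than duplicated.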
+                        auto& scaledChildren = constInput->attrs().getOrSet<DataVector>("scaledChildren", DataVector());
+
+                        Data scaledConstInput;
+                        for (const auto& scaledChild : scaledChildren) {
+                            auto childScaleFactor = scaledChild->attrs().getOrDefault<float>("scaleFactor", 1.0f);
+                            if (isFloatEqual(childScaleFactor, newScaleFactor)) {
+                                scaledConstInput = scaledChild;
+                                break;
+                            }
+                        }
+                        if (scaledConstInput == nullptr) {
+                            scaledConstInput = model->duplicateData(
+                                constInput,
+                                formatString("@SCALE=%f", scaleCoeff),
+                                constInput->desc(),
+                                scaleContent(constInput->content(), scaleCoeff));
+
+                            scaledChildren.emplace_back(scaledConstInput);
+                        }
+
+                        model->replaceStageInput(constInEdge, scaledConstInput);
+
+                        scaledConstInput->attrs().set<float>("scaleFactor", newScaleFactor);
+                    }
+                    for (const auto& output : stage->outputs()) {
+                        output->attrs().set<float>("scaleFactor", finalScales.at(output));
+                    }
+
+                    scalesWereInitialized = true;
+                    break;
+                }
+            }
+
+            //
+            // Add explicit scaling stage
+            //
+
+            auto newInput = model->duplicateData(
+                input,
+                formatString("@SCALE=%f", requestedScale));
+
+            newInput->attrs().set<float>("scaleFactor", requestedScale);
+
+            for (const auto& consumerEdge : input->consumerEdges()) {
+                model->replaceStageInput(consumerEdge, newInput);
+            }
+
+            _stageBuilder->addScalingStage(model, stage->origLayer(), scaleMultiplier, input, newInput);
+        }
+
+        if (scalesWereInitialized)
+            continue;
+
+        //
+        // Propagate SCALE from inputs to outputs
+        //
+
+        auto finalScales = stage->propagateScaleFactors(getInputScales(stage), ScalePropagationStep::Propagate);
+
+        for (const auto& inputEdge : stage->inputEdges()) {
+            auto input = inputEdge->input();
+
+            auto it = finalScales.find(input);
+            if (it == finalScales.end())
+                continue;
+
+            auto curScaleFactor = input->attrs().getOrDefault<float>("scaleFactor", 1.0f);
+            auto newScaleFactor = it->second;
+
+            if (isFloatEqual(curScaleFactor, newScaleFactor))
+                continue;
+
+            auto scaleCoeff = newScaleFactor / curScaleFactor;
+
+            Data scaledInput;
+            if (input->usage() == DataUsage::Const) {
+                auto& scaledChildren = input->attrs().getOrSet<DataVector>("scaledChildren", DataVector());
+
+                for (const auto& scaledChild : scaledChildren) {
+                    auto childScaleFactor = scaledChild->attrs().getOrDefault<float>("scaleFactor", 1.0f);
+                    if (isFloatEqual(childScaleFactor, newScaleFactor)) {
+                        scaledInput = scaledChild;
+                        break;
+                    }
+                }
+
+                if (scaledInput == nullptr) {
+                    scaledInput = model->duplicateData(
+                        input,
+                        formatString("@SCALE=%f", scaleCoeff),
+                        input->desc(),
+                        scaleContent(input->content(), scaleCoeff));
+
+                    scaledChildren.emplace_back(scaledInput);
+                }
+            } else {
+                scaledInput = model->duplicateData(
+                    input,
+                    formatString("@SCALE=%f", scaleCoeff));
+
+                _stageBuilder->addScalingStage(model, stage->origLayer(), scaleCoeff, input, scaledInput);
+            }
+            IE_ASSERT(scaledInput != nullptr);
+
+            model->replaceStageInput(inputEdge, scaledInput);
+
+            scaledInput->attrs().set<float>("scaleFactor", newScaleFactor);
+        }
+        for (const auto& output : stage->outputs()) {
+            output->attrs().set<float>("scaleFactor", finalScales.at(output));
+        }
+    }
+
+    //
+    // Remove SCALE from network outputs
+    //
+
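+    // Network outputs must be produced at scale 1.0: move the scale onto a new
+    // intermediate data object and append an inverse (1 / scale) scaling stage.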
+    for (auto output : model->datas()) {
+        if (output->usage() != DataUsage::Output) {
+            continue;
+        }
+
+        auto outputScale = output->attrs().getOrDefault<float>("scaleFactor", 1.0f);
+        if (isFloatEqual(outputScale, 1.0f)) {
+            continue;
+        }
+
+        if (output->desc().type() != DataType::FP16) {
+            output = output->attrs().get<Data>("fp16_copy");
+            IE_ASSERT(output != nullptr);
+            IE_ASSERT(output->desc().type() == DataType::FP16);
+        }
+
+        auto newData = model->duplicateData(
+            output,
+            formatString("@SCALE=%f", outputScale));
+
+        newData->attrs().set<float>("scaleFactor", outputScale);
+        output->attrs().set<float>("scaleFactor", 1.0f);
+
+        auto producerEdge = output->producerEdge();
+        IE_ASSERT(producerEdge != nullptr);
+        model->replaceStageOutput(producerEdge, newData);
+
+        IE_ASSERT(output->numConsumers() == 0);
+
+        _stageBuilder->addScalingStage(
+            model,
+            nullptr,
+            1.0f / outputScale,
+            newData,
+            output);
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::propagateDataScale() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/replace_deconv_by_conv.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/replace_deconv_by_conv.cpp
new file mode 100644 (file)
index 0000000..abbd76f
--- /dev/null
@@ -0,0 +1,285 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <tuple>
+#include <vector>
+#include <algorithm>
+#include <limits>
+#include <string>
+#include <utility>
+#include <cmath>
+#include <list>
+#include <set>
+#include <unordered_map>
+#include <memory>
+
+#include <vpu/stub_stage.hpp>
+#include <vpu/sw/utility.hpp>
+#include <vpu/compile_env.hpp>
+
+namespace vpu {
+
+namespace {
+
+using ReplicatedDataMap = std::unordered_map<int, Data>;
+
+class UpsamplingStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<UpsamplingStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[input] = DimsOrder::NCHW;
+        out[output] = DimsOrder::NCHW;
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto output = _outputEdges[0]->output();
+
+        DataMap<StridesRequirement> out;
+
+        out[output] = StridesRequirement().add(1, DimStride::Aligned);
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override {
+        return StageSHAVEsRequirements::TwoOrOne;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto scaleX = attrs().get<int>("upsampling_factorx_x");
+        auto scaleY = attrs().get<int>("upsampling_factorx_y");
+        auto scaleZ = attrs().get<int>("upsampling_factorx_z");
+        auto pad_l_x = attrs().get<int>("pad_l_x");
+        auto pad_r_x = attrs().get<int>("pad_r_x");
+        auto pad_l_y = attrs().get<int>("pad_l_y");
+        auto pad_r_y = attrs().get<int>("pad_r_y");
+        auto pad_l_z = attrs().get<int>("pad_l_z");
+        auto pad_r_z = attrs().get<int>("pad_r_z");
+
+        serializer.append(static_cast<int32_t>(scaleX));
+        serializer.append(static_cast<int32_t>(scaleY));
+        serializer.append(static_cast<int32_t>(scaleZ));
+        serializer.append(static_cast<int32_t>(pad_l_x));
+        serializer.append(static_cast<int32_t>(pad_r_x));
+        serializer.append(static_cast<int32_t>(pad_l_y));
+        serializer.append(static_cast<int32_t>(pad_r_y));
+        serializer.append(static_cast<int32_t>(pad_l_z));
+        serializer.append(static_cast<int32_t>(pad_r_z));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeNewBuffer(serializer);
+        output->serializeNewBuffer(serializer);
+    }
+};
+
+
+class DeconvolutionToConvolutionContent final : public CalculatedDataContent {
+public:
+    DeconvolutionToConvolutionContent(
+            const DataContent::Ptr& origContent,
+            int kernelSizeX, int kernelSizeY) :
+            CalculatedDataContent({origContent}),
+            _kernelSizeX(kernelSizeX), _kernelSizeY(kernelSizeY) {
+    }
+
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
+        VPU_PROFILE(DeconvolutionToConvolutionContent);
+
+        IE_ASSERT(baseContents.size() == 1);
+        IE_ASSERT(_desc.type() == DataType::FP16);
+
+        deconv_to_conv(baseContents[0]->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), _desc);
+    }
+
+private:
+    int _kernelSizeX;
+    int _kernelSizeY;
+};
+
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(replaceDeconvByConv);
+
+    auto stages = model->getStages();
+    for (const auto& stage : stages) {
+        if (stage->type() != StageType::StubDeconv) {
+            continue;
+        }
+
+        auto kernelSizeX = stage->attrs().get<int>("kernelSizeX");
+        auto kernelSizeY = stage->attrs().get<int>("kernelSizeY");
+        auto kernelStrideX = stage->attrs().get<int>("kernelStrideX");
+        auto kernelStrideY = stage->attrs().get<int>("kernelStrideY");
+        auto groupSize = stage->attrs().get<int>("groupSize");
+
+        auto padLeft  = stage->attrs().get<int>("padLeft");
+        auto padRight = stage->attrs().get<int>("padRight");
+        auto padTop = stage->attrs().get<int>("padTop");
+        auto padBottom = stage->attrs().get<int>("padBottom");
+        auto deconvScale = stage->attrs().getOrDefault<float>("scaleFactor", 1.0f);
+
+        // The Upsampling layer does not support negative paddings.
+        if ((kernelSizeX - 1 - padLeft < 0) || (kernelSizeX - 1 - padRight < 0) ||
+            (kernelSizeY - 1 - padTop < 0) || (kernelSizeY - 1 - padBottom < 0)) {
+            continue;
+        }
+
+        if (groupSize != 1) {
+            continue;
+        }
+
+        if ((padTop != padBottom) || (padLeft != padRight)) {
+            continue;
+        }
+
+        if (kernelSizeX > 15 || kernelSizeY > 15) {
+            continue;
+        }
+
+        auto input = stage->input(0);
+        auto weights = stage->input(1);
+        auto biases  = stage->input(2);
+        auto output = stage->output(0);
+        const auto& env = CompileEnv::get();
+
+        if (env.netConfig.hwDisabled(stage->origLayer()->name)) {
+            continue;
+        }
+
+        if (output->desc().numDims() < 4) {
+            continue;
+        }
+
+        // Only rewrite when the recomputed deconvolution output size matches the actual
+        // output dims (workaround for Deconv/CommonSingleLayerTest).
+        auto origOutputX = kernelStrideX * (input->desc().dim(Dim::W) - 1) + kernelSizeX - padLeft - padRight;
+        auto origOutputY = kernelStrideY * (input->desc().dim(Dim::H) - 1) + kernelSizeY - padTop - padBottom;
+
+        if ((origOutputX != output->desc().dim(Dim::W)) || (origOutputY != output->desc().dim(Dim::H))) {
+            continue;
+        }
+
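+        // Rewrite the deconvolution as zero-stuffing Upsampling (factor = original stride,
+        // border padding = kernelSize - 1 - pad) followed by a stride-1 convolution over
+        // relayouted weights.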
+        model->disconnectStageDatas(stage);
+
+        DataDesc newDesc({1, 1, output->desc().dim(Dim::C), output->desc().dim(Dim::N)});
+        newDesc.setDim(Dim::N, 1);
+        newDesc.setDim(Dim::C, input->desc().dim(Dim::C));
+        newDesc.setDim(Dim::H, (input->desc().dim(Dim::H) - 1) * kernelStrideY + 1 + (kernelSizeY - 1) * 2 - padTop - padBottom);
+        newDesc.setDim(Dim::W, (input->desc().dim(Dim::W) - 1) * kernelStrideX + 1 + (kernelSizeX - 1) * 2 - padLeft - padRight);
+
+        auto newOutput = model->duplicateData(output, "@upsampleData", newDesc);
+        auto newWeights = model->duplicateData(weights, "@upsampleData", weights->desc(),
+                     std::make_shared<DeconvolutionToConvolutionContent>(weights->content(), kernelSizeX, kernelSizeY));
+
+        auto upsampleStage = model->addNewStage<UpsamplingStage>(
+                stage->origLayerName() + "@Upsample",
+                StageType::Upsampling,
+                stage->origLayer(),
+                {input},
+                {newOutput});
+
+        upsampleStage->attrs().set<int>("upsampling_factorx_x", kernelStrideX);
+        upsampleStage->attrs().set<int>("upsampling_factorx_y", kernelStrideY);
+        upsampleStage->attrs().set<int>("upsampling_factorx_z", 1);
+        upsampleStage->attrs().set<int>("pad_l_x", (kernelSizeX - 1) - padLeft);
+        upsampleStage->attrs().set<int>("pad_r_x", (kernelSizeX - 1) - padRight);
+        upsampleStage->attrs().set<int>("pad_l_y", (kernelSizeY - 1) - padTop);
+        upsampleStage->attrs().set<int>("pad_r_y", (kernelSizeY - 1) - padBottom);
+        upsampleStage->attrs().set<int>("pad_l_z", 0);
+        upsampleStage->attrs().set<int>("pad_r_z", 0);
+
+        auto newStage = model->addNewStage<StubStage>(
+                stage->origLayerName() + "@UpsampleConv",
+                StageType::StubConv,
+                stage->origLayer(),
+                {newOutput, newWeights, biases},
+                {output});
+
+        newStage->attrs().set<int>("kernelSizeX", kernelSizeX);
+        newStage->attrs().set<int>("kernelSizeY", kernelSizeY);
+        newStage->attrs().set<int>("kernelStrideX", 1);
+        newStage->attrs().set<int>("kernelStrideY", 1);
+        newStage->attrs().set<int>("padLeft", 0);
+        newStage->attrs().set<int>("padRight", 0);
+        newStage->attrs().set<int>("padTop", 0);
+        newStage->attrs().set<int>("padBottom", 0);
+        newStage->attrs().set<int>("dilationX", 1);
+        newStage->attrs().set<int>("dilationY", 1);
+        newStage->attrs().set<int>("groupSize", 1);
+        newStage->attrs().set<bool>("tryHW", true);
+        newStage->attrs().set<float>("scaleFactor", deconvScale);
+
+        model->removeStage(stage);
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::replaceDeconvByConv() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/replace_fc_by_conv.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/replace_fc_by_conv.cpp
new file mode 100644 (file)
index 0000000..b2e7431
--- /dev/null
@@ -0,0 +1,192 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <tuple>
+#include <vector>
+#include <algorithm>
+#include <limits>
+#include <string>
+#include <utility>
+#include <cmath>
+#include <list>
+#include <set>
+#include <unordered_map>
+#include <memory>
+
+#include <vpu/stub_stage.hpp>
+
+namespace vpu {
+
+namespace {
+
+using ReplicatedDataMap = std::unordered_map<int, Data>;
+
+void setConvParameters(const vpu::Stage& stage, int kX, int kY) {
+    stage->attrs().set<int>("kernelSizeX", kX);
+    stage->attrs().set<int>("kernelSizeY", kY);
+    stage->attrs().set<int>("kernelStrideX", 1);
+    stage->attrs().set<int>("kernelStrideY", 1);
+    stage->attrs().set<int>("padLeft", 0);
+    stage->attrs().set<int>("padRight", 0);
+    stage->attrs().set<int>("padTop", 0);
+    stage->attrs().set<int>("padBottom", 0);
+    stage->attrs().set<int>("dilationX", 1);
+    stage->attrs().set<int>("dilationY", 1);
+    stage->attrs().set<int>("groupSize", 1);
+    stage->attrs().set<bool>("tryHW", true);
+}
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(replaceFCbyConv);
+
+    for (const auto& stage : model->getStages()) {
+        if (stage->type() != StageType::StubFullyConnected) {
+            continue;
+        }
+
+        auto tryHW = stage->attrs().getOrDefault<bool>("tryHW", false);
+        if (!tryHW) {
+            continue;
+        }
+
+        auto input = stage->input(0);
+        auto weights = stage->input(1);
+        auto biases  = stage->input(2);
+        auto output = stage->output(0);
+
+        auto dims = input->desc().dims();
+
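+        // Case 1: 4D input with a small spatial size - the whole HxW plane is used as the
+        // convolution kernel, so the FC becomes a single-position convolution.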
+        if (input->desc().numDims() == 4) {
+            bool required = dims.has(Dim::N);
+            required &= dims.has(Dim::C);
+            required &= dims.has(Dim::H);
+            required &= dims.has(Dim::W);
+
+            if (required &&
+                input->desc().dim(Dim::H, 1) < 16 &&
+                input->desc().dim(Dim::W, 1) < 16) {
+                // The FC layer can be converted to a convolution over the full HxW plane.
+                model->disconnectStageDatas(stage);
+
+                auto kernelSizeX = input->desc().dim(Dim::W, 1);
+                auto kernelSizeY = input->desc().dim(Dim::H, 1);
+                IE_ASSERT(weights->desc().totalDimSize() >=
+                        kernelSizeX * kernelSizeY * (input->desc().dim(Dim::C)) * output->desc().dim(Dim::C));
+
+                auto newWeights = model->duplicateData(
+                    weights,
+                    "",
+                    DataDesc({
+                        kernelSizeX,
+                        kernelSizeY,
+                        input->desc().dim(Dim::C),
+                        output->desc().dim(Dim::C)}));
+
+                auto newBiases = model->addFakeData();
+                if (biases->usage() != DataUsage::Fake) {
+                    IE_ASSERT(biases->desc().totalDimSize() >= output->desc().dim(Dim::C));
+                    newBiases = model->duplicateData(biases,
+                        biases->name(),
+                        DataDesc({output->desc().dim(Dim::C)}));
+                }
+
+                DataDesc newDesc({1, 1, output->desc().dim(Dim::C), output->desc().dim(Dim::N)});
+                auto newOutput = model->duplicateData(output, "@reshapeData", newDesc);
+
+                auto newStage = model->addNewStage<StubStage>(
+                    stage->origLayerName(),
+                    StageType::StubConv,
+                    stage->origLayer(),
+                    {input, newWeights, newBiases},
+                    {newOutput});
+                newStage->attrs().copyFrom(stage->attrs());
+                setConvParameters(newStage, kernelSizeX, kernelSizeY);
+
+                _stageBuilder->addReshapeStage(
+                    model,
+                    stage->name() + "@reshapeOut",
+                    stage->origLayer(),
+                    newOutput,
+                    output);
+
+                model->removeStage(stage);
+            }
+        } else if (dims.has(Dim::N) &&
+                   dims.has(Dim::C) &&
+                   (!dims.has(Dim::H)) &&
+                   (!dims.has(Dim::W))) {
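+            // Case 2: 2D (N, C) input - implement the FC as a 1x1 convolution over a
+            // 1x1 spatial grid, with Reshape stages on both sides.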
+            IE_ASSERT(weights->desc().totalDimSize() >=
+                    (input->desc().dim(Dim::C)) * output->desc().dim(Dim::C));
+
+            model->disconnectStageDatas(stage);
+
+            auto newWeights = model->duplicateData(weights,
+                weights->name(),
+                DataDesc({
+                    1,
+                    1,
+                    input->desc().dim(Dim::C),
+                    output->desc().dim(Dim::C)}));
+
+            auto newBiases = model->addFakeData();
+            if (biases->usage() != DataUsage::Fake) {
+                IE_ASSERT(biases->desc().totalDimSize() >= output->desc().dim(Dim::C));
+                newBiases = model->duplicateData(biases,
+                                                  biases->name(),
+                                                  DataDesc({output->desc().dim(Dim::C)}));
+            }
+
+            DataDesc newDescIn({1, 1, input->desc().dim(Dim::C), input->desc().dim(Dim::N)});
+            auto newInput = model->duplicateData(input, "@reshapeDataIn", newDescIn);
+
+            DataDesc newDescOut({1, 1, output->desc().dim(Dim::C), output->desc().dim(Dim::N)});
+            auto newOutput = model->duplicateData(output, "@reshapeDataOut", newDescOut);
+
+            _stageBuilder->addReshapeStage(
+                model,
+                stage->name() + "@reshapeIn",
+                stage->origLayer(),
+                input,
+                newInput);
+
+            auto newStage = model->addNewStage<StubStage>(
+                stage->origLayerName(),
+                StageType::StubConv,
+                stage->origLayer(),
+                {newInput, newWeights, newBiases},
+                {newOutput});
+            newStage->attrs().copyFrom(stage->attrs());
+            setConvParameters(newStage, 1, 1);
+
+            _stageBuilder->addReshapeStage(
+                model,
+                stage->name() + "@reshapeOut",
+                stage->origLayer(),
+                newOutput,
+                output);
+
+            model->removeStage(stage);
+        }
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::replaceFCbyConv() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/split_grouped_conv.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/split_grouped_conv.cpp
new file mode 100644 (file)
index 0000000..ac5039f
--- /dev/null
@@ -0,0 +1,223 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <vector>
+#include <set>
+#include <memory>
+
+#include <precision_utils.h>
+
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+namespace {
+
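+// Repacks the weights of group g out of the full deconvolution weights blob into a dense
+// [IC/GR][OC/GR][KY][KX] layout expected by the per-group convolution (see the index
+// computations below).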
+void deconvolutionRelayout(
+        const fp16_t* src, int src_size,
+        fp16_t* dst, int dst_size,
+        int KX, int KY,
+        int IC, int OC,
+        int g, int GR) {
+    for (int goc = 0; goc < OC / GR; ++goc) {
+        for (int gic = 0; gic < IC / GR; ++gic) {
+            for (int ky = 0; ky < KY; ++ky) {
+                for (int kx = 0; kx < KX; ++kx) {
+                    int iidx =
+                        gic * OC * KY * KX +
+                        (g * OC / GR + goc) * KY * KX +
+                        ky * KX +
+                        kx;
+                    IE_ASSERT(iidx < src_size);
+
+                    int oidx =
+                        gic * (OC / GR) * KY * KX +
+                        goc * KY * KX +
+                        ky * KX +
+                        kx;
+                    IE_ASSERT(oidx < dst_size);
+
+                    dst[oidx] = src[iidx];
+                }
+            }
+        }
+    }
+}
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(splitGroupedConv);
+
+    for (const auto& stage : model->getStages()) {
+        if (stage->type() != StageType::StubConv &&
+            stage->type() != StageType::StubDeconv) {
+            continue;
+        }
+
+        IE_ASSERT(stage->numInputs() == 3);
+        IE_ASSERT(stage->numOutputs() == 1);
+
+        auto input = stage->input(0);
+        auto weights = stage->input(1);
+        auto biases = stage->input(2);
+        auto output = stage->output(0);
+
+        auto kernelSizeX = stage->attrs().get<int>("kernelSizeX");
+        auto kernelSizeY = stage->attrs().get<int>("kernelSizeY");
+        auto groupSize = stage->attrs().get<int>("groupSize");
+
+        if (groupSize == 1) {
+            continue;
+        }
+
+        if (groupSize == input->desc().dim(Dim::C) &&
+            groupSize == output->desc().dim(Dim::C)) {
+            // This is a depthwise [de]convolution; it is handled separately for SW and HW.
+            continue;
+        }
+
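+        // Split the grouped convolution into groupSize independent convolutions, connected
+        // to the original input and output through Split and Concat stages over Dim::C.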
+        model->disconnectStageDatas(stage);
+
+        auto inGroupDimC = input->desc().dim(Dim::C) / groupSize;
+        auto outGroupDimC = output->desc().dim(Dim::C) / groupSize;
+
+        DataVector subInputs(groupSize);
+        DataVector subOutputs(groupSize);
+
+        for (int groupInd = 0; groupInd < groupSize; ++groupInd) {
+            auto postfix = formatString("@group=%d/%d", groupInd + 1, groupSize);
+
+            // subInput
+
+            auto subInputDesc = input->desc();
+            subInputDesc.setDim(Dim::C, inGroupDimC);
+
+            subInputs[groupInd] = model->duplicateData(
+                input,
+                postfix,
+                subInputDesc);
+
+            // subWeights
+
+            Data subWeights;
+            {
+                auto content = weights->content();
+                IE_ASSERT(content != nullptr);
+
+                auto origWeights = content->get<fp16_t>();
+                IE_ASSERT(origWeights != nullptr);
+
+                auto kernWxH = kernelSizeX * kernelSizeY;
+                size_t newWeightsSize = kernWxH * inGroupDimC * outGroupDimC;
+
+                auto newWeightsBlob = ie::make_shared_blob<fp16_t>(ie::Precision::FP16, ie::Layout::C, {newWeightsSize});
+                newWeightsBlob->allocate();
+
+                auto newWeightsPtr = newWeightsBlob->buffer().as<fp16_t*>();
+
+                std::fill_n(newWeightsPtr, newWeightsSize, ie::PrecisionUtils::f32tof16(0.0f));
+
+                if (stage->type() == StageType::StubDeconv) {
+                    deconvolutionRelayout(
+                        origWeights, weights->desc().totalDimSize(),
+                        newWeightsPtr, newWeightsSize,
+                        kernelSizeX, kernelSizeY,
+                        input->desc().dim(Dim::C),
+                        output->desc().dim(Dim::C),
+                        groupInd, groupSize);
+                } else {
+                    std::copy_n(origWeights + newWeightsSize * groupInd, newWeightsSize, newWeightsPtr);
+                }
+
+                subWeights = model->duplicateData(
+                    weights,
+                    postfix,
+                    DataDesc({kernelSizeX, kernelSizeY, inGroupDimC, outGroupDimC}),
+                    ieBlobContent(newWeightsBlob));
+            }
+
+            // subBiases
+
+            auto subBiases = biases;
+            if (biases->usage() != DataUsage::Fake) {
+                auto content = biases->content();
+                IE_ASSERT(content != nullptr);
+
+                auto origBiases = content->get<fp16_t>();
+                IE_ASSERT(origBiases != nullptr);
+
+                auto newBiasesBlob = ie::make_shared_blob<fp16_t>(ie::Precision::FP16, ie::Layout::C, {static_cast<size_t>(outGroupDimC)});
+                newBiasesBlob->allocate();
+
+                auto newBiasesPtr = newBiasesBlob->buffer().as<fp16_t*>();
+
+                std::copy_n(origBiases + groupInd * outGroupDimC, outGroupDimC, newBiasesPtr);
+
+                subBiases = model->duplicateData(
+                    biases,
+                    postfix,
+                    DataDesc({outGroupDimC}),
+                    ieBlobContent(newBiasesBlob));
+            }
+
+            // subOutput
+
+            auto subOutputDesc = output->desc();
+            subOutputDesc.setDim(Dim::C, outGroupDimC);
+
+            subOutputs[groupInd] = model->duplicateData(
+                output,
+                postfix,
+                subOutputDesc);
+
+            // subConvStage
+
+            auto subConvStage = model->duplicateStage(
+                stage->name() + postfix,
+                stage,
+                {subInputs[groupInd], subWeights, subBiases},
+                {subOutputs[groupInd]});
+
+            subConvStage->attrs().set<int>("groupSize", 1);
+            subConvStage->attrs().set<int>("groupInd", groupInd);
+        }
+
+        _stageBuilder->addSplitStage(
+            model,
+            stage->name() + "@split",
+            stage->origLayer(),
+            Dim::C,
+            input,
+            subInputs);
+
+        _stageBuilder->addConcatStage(
+            model,
+            stage->name() + "@concat",
+            stage->origLayer(),
+            Dim::C,
+            subOutputs,
+            output);
+
+        model->removeStage(stage);
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::splitGroupedConv() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/split_hw_conv_and_pool.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/split_hw_conv_and_pool.cpp
new file mode 100644 (file)
index 0000000..1d06d39
--- /dev/null
@@ -0,0 +1,219 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <vector>
+#include <set>
+#include <memory>
+#include <array>
+
+#include <vpu/compile_env.hpp>
+#include <vpu/hw/utility.hpp>
+
+namespace vpu {
+
+namespace {
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(splitHwConvAndPool);
+
+    const auto& env = CompileEnv::get();
+
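+    // When a HW convolution output does not fit into CMX, split the conv + pool pair along
+    // the output channel dimension into tiles whose buffers do fit, then concatenate the
+    // per-tile pooling outputs.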
+    for (const auto& convStage : model->getStages()) {
+        if (convStage == nullptr) {
+            continue;
+        }
+
+        if (convStage->type() != StageType::StubConv) {
+            continue;
+        }
+
+        auto convHW = convStage->attrs().getOrDefault<bool>("tryHW", false);
+        if (!convHW) {
+            continue;
+        }
+
+        auto convInput = convStage->input(0);
+        auto convWeights = convStage->input(1);
+        auto convBiases = convStage->input(2);
+        auto convOutput = convStage->output(0);
+
+        if (convOutput->usage() != DataUsage::Intermediate) {
+            continue;
+        }
+
+        // TODO: better estimation?
+        auto outBufSize = calculateHwBufferSize(convOutput->desc().dims());
+        if (outBufSize <= env.resources.cmxLimit) {
+            continue;
+        }
+
+        if (convOutput->numConsumers() != 1) {
+            continue;
+        }
+
+        auto poolStage = convOutput->singleConsumer();
+        if (poolStage->type() != StageType::StubAvgPool &&
+            poolStage->type() != StageType::StubMaxPool) {
+            continue;
+        }
+
+        auto poolHW = poolStage->attrs().getOrDefault<bool>("tryHW", false);
+        if (!poolHW) {
+            continue;
+        }
+
+        auto convKernelSizeX = convStage->attrs().get<int>("kernelSizeX");
+        auto convKernelSizeY = convStage->attrs().get<int>("kernelSizeY");
+
+        auto poolOutput = poolStage->output(0);
+
+        // TODO: better estimation?
+        int tileSize = 0;
+        std::array<int, 3> TILE_SIZE_CANDIDATES{{128, 64, 32}};
+        for (auto curTileSize : TILE_SIZE_CANDIDATES) {
+            if (convOutput->desc().dim(Dim::C) >= curTileSize &&
+                convOutput->desc().dim(Dim::C) % curTileSize == 0) {
+                DimValues curOutDims;
+                curOutDims.set(Dim::W, convOutput->desc().dim(Dim::W));
+                curOutDims.set(Dim::H, convOutput->desc().dim(Dim::H));
+                curOutDims.set(Dim::C, curTileSize);
+
+                auto curOutBufSize = calculateHwBufferSize(curOutDims);
+                if (curOutBufSize <= env.resources.cmxLimit) {
+                    tileSize = curTileSize;
+                    break;
+                }
+            }
+        }
+
+        if (tileSize == 0)
+            continue;
+
+        auto numTiles = (convOutput->desc().dim(Dim::C) + tileSize - 1) / tileSize;
+
+        model->disconnectStageDatas(convStage);
+        model->disconnectStageDatas(poolStage);
+
+        DataVector subOutputs(numTiles);
+
+        int tileOffset = 0;
+        for (int tileInd = 0; tileInd < numTiles; ++tileInd) {
+            auto postfix = formatString("@tile=%d/%d", tileInd + 1, numTiles);
+
+            auto curTileSize = tileInd != numTiles - 1 ? tileSize : convOutput->desc().dim(Dim::C) - tileOffset;
+
+            auto convOutputTileDesc = convOutput->desc();
+            convOutputTileDesc.setDim(Dim::C, curTileSize);
+
+            auto convOutputTile = model->duplicateData(
+                convOutput,
+                postfix,
+                convOutputTileDesc);
+
+            auto poolOutputTileDesc = poolOutput->desc();
+            poolOutputTileDesc.setDim(Dim::C, curTileSize);
+
+            auto poolOutputTile = model->duplicateData(
+                poolOutput,
+                postfix,
+                poolOutputTileDesc);
+
+            Data tileWeights;
+            {
+                auto content = convWeights->content();
+                IE_ASSERT(content != nullptr);
+
+                auto origWeights = content->get<fp16_t>();
+                IE_ASSERT(origWeights != nullptr);
+
+                auto kernWxH = convKernelSizeX * convKernelSizeY;
+                size_t newWeightsSize = kernWxH * convInput->desc().dim(Dim::C) * tileSize;
+
+                auto newWeightsBlob = ie::make_shared_blob<fp16_t>(ie::Precision::FP16, ie::Layout::C, {newWeightsSize});
+                newWeightsBlob->allocate();
+
+                auto inPtr = origWeights + kernWxH * convInput->desc().dim(Dim::C) * tileInd * tileSize;
+                auto outPtr = newWeightsBlob->buffer().as<fp16_t*>();
+
+                std::copy_n(inPtr, newWeightsSize, outPtr);
+
+                tileWeights = model->duplicateData(
+                    convWeights,
+                    postfix,
+                    DataDesc({convKernelSizeX, convKernelSizeY, convInput->desc().dim(Dim::C), tileSize}),
+                    ieBlobContent(newWeightsBlob));
+            }
+
+            auto tileBiases = convBiases;
+            if (convBiases->usage() != DataUsage::Fake) {
+                auto content = convBiases->content();
+                IE_ASSERT(content != nullptr);
+
+                auto origBiases = content->get<fp16_t>();
+                IE_ASSERT(origBiases != nullptr);
+
+                auto newBiasesBlob = ie::make_shared_blob<fp16_t>(ie::Precision::FP16, ie::Layout::C, {static_cast<size_t>(tileSize)});
+                newBiasesBlob->allocate();
+
+                auto inPtr = origBiases + tileInd * tileSize;
+                auto outPtr = newBiasesBlob->buffer().as<fp16_t*>();
+
+                std::copy_n(inPtr, tileSize, outPtr);
+
+                tileBiases = model->duplicateData(
+                    convBiases,
+                    postfix,
+                    DataDesc({tileSize}),
+                    ieBlobContent(newBiasesBlob));
+            }
+
+            model->duplicateStage(
+                convStage->name() + postfix,
+                convStage,
+                {convInput, tileWeights, tileBiases},
+                {convOutputTile});
+
+            model->duplicateStage(
+                poolStage->name() + postfix,
+                poolStage,
+                {convOutputTile},
+                {poolOutputTile});
+
+            subOutputs[tileInd] = poolOutputTile;
+
+            tileOffset += curTileSize;
+        }
+
+        _stageBuilder->addConcatStage(
+            model,
+            poolStage->name() + "@concat",
+            poolStage->origLayer(),
+            Dim::C,
+            subOutputs,
+            poolOutput);
+
+        model->removeStage(convStage);
+        model->removeStage(poolStage);
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::splitHwConvAndPool() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/split_hw_depth_convolution.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/split_hw_depth_convolution.cpp
new file mode 100644 (file)
index 0000000..e9e1055
--- /dev/null
@@ -0,0 +1,270 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <memory>
+#include <array>
+#include <string>
+#include <list>
+#include <unordered_set>
+#include <vector>
+#include <set>
+#include <tuple>
+#include <limits>
+
+#include <precision_utils.h>
+
+#include <vpu/hw/tiling.hpp>
+#include <vpu/hw/utility.hpp>
+
+namespace vpu {
+
+namespace {
+
+std::tuple<Data, Data> createWeightsAndBiasesForDepthConv(
+        const Model::Ptr& model,
+        const Data& origWeights,
+        const Data& origBiases,
+        const Stage& origStage,
+        int tileSize, int tileOffset,
+        const std::string& postfix) {
+    auto kernelSizeX = origStage->attrs().get<int>("kernelSizeX");
+    auto kernelSizeY = origStage->attrs().get<int>("kernelSizeY");
+
+    Data newWeights;
+    {
+        auto content = origWeights->content();
+        IE_ASSERT(content != nullptr);
+
+        auto origWeightsVals = content->get<fp16_t>();
+        IE_ASSERT(origWeightsVals != nullptr);
+
+        size_t newWeightsSize = kernelSizeX * kernelSizeY * tileSize * tileSize;
+
+        auto newWeightsBlob = ie::make_shared_blob<fp16_t>(ie::Precision::FP16, ie::Layout::C, {newWeightsSize});
+        newWeightsBlob->allocate();
+
+        auto inPtr = origWeightsVals + kernelSizeX * kernelSizeY * tileOffset;
+        auto outPtr = newWeightsBlob->buffer().as<fp16_t*>();
+
+        std::fill_n(outPtr, newWeightsSize, ie::PrecisionUtils::f32tof16(0.0f));
+
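+        // Copy each channel's KxK kernel onto the diagonal of a dense tileSize x tileSize
+        // channel matrix (plane stride tileSize + 1), so the depthwise convolution can run
+        // as a regular convolution with groupSize = 1.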
+        for (int idx = 0; idx < tileSize; ++idx) {
+            auto srcSlicePtr = inPtr + idx * kernelSizeX * kernelSizeY;
+            auto dstSlicePtr = outPtr + idx * (kernelSizeX * kernelSizeY) * (tileSize + 1);
+            std::copy_n(srcSlicePtr, kernelSizeX * kernelSizeY, dstSlicePtr);
+        }
+
+        newWeights = model->duplicateData(
+            origWeights,
+            postfix,
+            DataDesc({kernelSizeX, kernelSizeY, tileSize, tileSize}),
+            ieBlobContent(newWeightsBlob));
+    }
+
+    auto newBiases = origBiases;
+    if (origBiases->usage() != DataUsage::Fake) {
+        auto content = origBiases->content();
+        IE_ASSERT(content != nullptr);
+
+        auto origBiasesVals = content->get<fp16_t>();
+        IE_ASSERT(origBiasesVals != nullptr);
+
+        auto newBiasesBlob = ie::make_shared_blob<fp16_t>(ie::Precision::FP16, ie::Layout::C, {static_cast<size_t>(tileSize)});
+        newBiasesBlob->allocate();
+
+        auto inPtr = origBiasesVals + tileOffset;
+        auto outPtr = newBiasesBlob->buffer().as<fp16_t*>();
+
+        std::copy_n(inPtr, tileSize, outPtr);
+
+        newBiases = model->duplicateData(
+            origBiases,
+            postfix,
+            DataDesc({tileSize}),
+            ieBlobContent(newBiasesBlob));
+    }
+
+    return std::make_tuple(newWeights, newBiases);
+}
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(splitHwDepthConv);
+
+    for (const auto& stage : model->getStages()) {
+        if (stage->type() != StageType::StubConv) {
+            continue;
+        }
+
+        auto tryHW = stage->attrs().getOrDefault<bool>("tryHW", false);
+        if (!tryHW) {
+            continue;
+        }
+
+        auto input = stage->input(0);
+        auto weights = stage->input(1);
+        auto biases = stage->input(2);
+        auto output = stage->output(0);
+
+        if (input->desc().dim(Dim::C) != output->desc().dim(Dim::C)) {
+            continue;
+        }
+
+        auto groupSize = stage->attrs().get<int>("groupSize");
+        if (groupSize != input->desc().dim(Dim::C)) {
+            continue;
+        }
+
+        //
+        // Collect cost per tile
+        //
+
+        using OptTile = std::tuple<int, double>;
+        std::vector<OptTile> optTiles;
+        optTiles.reserve(output->desc().dim(Dim::C));
+
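+        // Estimate the HW cost for every candidate tile size; the total cost of a candidate
+        // is its per-tile cost multiplied by the resulting number of tiles, divUp(C, tileSize).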
+        for (int curTileSize = 1; curTileSize < output->desc().dim(Dim::C); curTileSize++) {
+            auto tileInfo = splitHwConvIntoOutChannelsTiles(
+                    input->desc().dim(Dim::W), input->desc().dim(Dim::H), curTileSize,
+                    curTileSize,
+                    stage->attrs().get<int>("kernelSizeX"), stage->attrs().get<int>("kernelSizeY"),
+                    stage->attrs().get<int>("kernelStrideX"));
+
+            if (tileInfo.numDescr > 0) {
+                auto curNumTiles = divUp(output->desc().dim(Dim::C), curTileSize);
+                optTiles.emplace_back(std::make_tuple(curTileSize, tileInfo.cost * curNumTiles));
+            }
+        }
+
+        //
+        // Choose tile with minimal cost
+        //
+
+        auto tileSize = output->desc().dim(Dim::C);
+        auto numTiles = 1;
+
+        // TODO: switch to SW?
+        if (!optTiles.empty()) {
+            // Sort by cost.
+            std::stable_sort(optTiles.begin(), optTiles.end(),
+                [](const OptTile& s1, const OptTile& s2) {
+                    return std::get<1>(s1) < std::get<1>(s2);
+                });
+
+            double finalCost = 0.0;
+            std::tie(tileSize, finalCost) = optTiles[0];
+
+            numTiles = (output->desc().dim(Dim::C) + tileSize - 1) / tileSize;
+        }
+
+        //
+        // Single tile processing
+        //
+
+        if (numTiles == 1) {
+            auto constDatas = createWeightsAndBiasesForDepthConv(
+                model,
+                weights, biases,
+                stage,
+                tileSize, 0,
+                "");
+
+            model->replaceStageInput(stage->inputEdge(1), std::get<0>(constDatas));
+            model->replaceStageInput(stage->inputEdge(2), std::get<1>(constDatas));
+
+            stage->attrs().set<int>("groupSize", 1);
+
+            continue;
+        }
+
+        //
+        // Multiple tiles processing
+        //
+
+        model->disconnectStageDatas(stage);
+
+        DataVector subInputs(numTiles);
+        DataVector subOutputs(numTiles);
+
+        int tileOffset = 0;
+        for (int tileInd = 0; tileInd < numTiles; ++tileInd) {
+            auto postfix = formatString("@tile=%d/%d", tileInd + 1, numTiles);
+
+            auto curTileSize = tileInd != numTiles - 1 ? tileSize : input->desc().dim(Dim::C) - tileOffset;
+
+            auto inputTileDesc = input->desc();
+            inputTileDesc.setDim(Dim::C, curTileSize);
+
+            subInputs[tileInd] = model->duplicateData(
+                input,
+                postfix,
+                inputTileDesc);
+
+            auto outputTileDesc = output->desc();
+            outputTileDesc.setDim(Dim::C, curTileSize);
+
+            subOutputs[tileInd] = model->duplicateData(
+                output,
+                postfix,
+                outputTileDesc);
+
+            auto constDatas = createWeightsAndBiasesForDepthConv(
+                model,
+                weights, biases,
+                stage,
+                curTileSize, tileOffset,
+                postfix);
+
+            auto tileWeights = std::get<0>(constDatas);
+            auto tileBiases = std::get<1>(constDatas);
+
+            auto tileStage = model->duplicateStage(
+                stage->name() + postfix,
+                stage,
+                {subInputs[tileInd], tileWeights, tileBiases},
+                {subOutputs[tileInd]});
+
+            tileStage->attrs().set<int>("groupSize", 1);
+
+            tileOffset += curTileSize;
+        }
+
+        _stageBuilder->addSplitStage(
+            model,
+            stage->name() + "@split",
+            stage->origLayer(),
+            Dim::C,
+            input,
+            subInputs);
+
+        _stageBuilder->addConcatStage(
+            model,
+            stage->name() + "@concat",
+            stage->origLayer(),
+            Dim::C,
+            subOutputs,
+            output);
+
+        model->removeStage(stage);
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::splitHwDepthConv() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/sw_conv_adaptation.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/sw_conv_adaptation.cpp
new file mode 100644 (file)
index 0000000..18d37a7
--- /dev/null
@@ -0,0 +1,504 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+#include <limits>
+
+#include <vector>
+#include <string>
+#include <memory>
+#include <unordered_set>
+#include <set>
+
+#include <vpu/sw/utility.hpp>
+
+#define REFERENCE_CONVOLUTION 0
+
+namespace vpu {
+
+namespace {
+
+class ConvIm2ColWeightsContent final : public CalculatedDataContent {
+public:
+    explicit ConvIm2ColWeightsContent(const DataContent::Ptr& origContent) :
+            CalculatedDataContent({origContent}) {
+    }
+
+protected:
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
+        VPU_PROFILE(ConvIm2ColWeightsContent);
+        kchw_to_khwc(baseContents[0]->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), _desc);
+    }
+};
+
+class Conv3x3WeightsContent final : public CalculatedDataContent {
+public:
+    explicit Conv3x3WeightsContent(const DataContent::Ptr& origContent) :
+            CalculatedDataContent({origContent}) {
+    }
+
+protected:
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
+        VPU_PROFILE(Conv3x3WeightsContent);
+        kchw_to_hwkc(baseContents[0]->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), _desc);
+    }
+};
+
+class ConvCHWWeightsContent final : public CalculatedDataContent {
+public:
+    explicit ConvCHWWeightsContent(const DataContent::Ptr& origContent) :
+            CalculatedDataContent({origContent}) {
+    }
+
+protected:
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
+        VPU_PROFILE(ConvCHWWeightsContent);
+        kchw_to_hwkc(baseContents[0]->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), _desc);
+    }
+};
+
+class ConvStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ConvStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto weights = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto finalOrder = input->desc().dimsOrder();
+        if (finalOrder.dimInd(Dim::C) == 1) {
+            // HCW -> CHW
+            finalOrder.moveDim(Dim::C, 2);
+        }
+
+        DataMap<DimsOrder> out;
+
+        if (_type == StageType::Conv ||
+            _type == StageType::Im2ColConvolution ||
+            _type == StageType::DepthConv) {
+            if (finalOrder != input->desc().dimsOrder()) {
+                out[input] = finalOrder;
+            }
+            out[output] = finalOrder;
+        } else {
+            out[input] = finalOrder.createMovedDim(Dim::C, 0);
+            out[output] = finalOrder.createMovedDim(Dim::C, 0);
+        }
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<StridesRequirement> out;
+
+        if (_type != StageType::DepthConv) {
+            out[input] = StridesRequirement::compact();
+            out[output] = StridesRequirement::compact();
+        }
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto weights = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto kernelSizeX = attrs().get<int>("kernelSizeX");
+        auto kernelSizeY = attrs().get<int>("kernelSizeY");
+
+        Data swWeights;
+
+        if (_type == StageType::DepthConv) {
+            swWeights = weights->attrs().getOrDefault<Data>("swWeights", nullptr);
+            if (swWeights == nullptr) {
+                DataDesc newWeightsDesc({
+                    kernelSizeX * kernelSizeY,
+                    1,
+                    output->desc().dim(Dim::C)});
+
+                swWeights = _model->duplicateData(
+                    weights,
+                    "@SW",
+                    newWeightsDesc,
+                    std::make_shared<DefaultSwWeightsContent>(weights->content()));
+
+                weights->attrs().set<Data>("swWeights", swWeights);
+            }
+        } else if (input->desc().dimsOrder().dimInd(Dim::C) == 0) {
+            //
+            // HWC case
+            //
+
+            auto isSpatialConv = attrs().get<bool>("isSpatialConv");
+            auto isConv1x1 = attrs().get<bool>("isConv1x1");
+            auto isConv3x3 = attrs().get<bool>("isConv3x3");
+
+            swWeights = weights->attrs().getOrDefault<Data>("swWeights", nullptr);
+            if (swWeights == nullptr) {
+                DataDesc newWeightsDesc({
+                    kernelSizeX * kernelSizeY,
+                    input->desc().dim(Dim::C),
+                    output->desc().dim(Dim::C)});
+
+                if (isSpatialConv) {
+                    swWeights = _model->duplicateData(
+                        weights,
+                        "@SW",
+                        newWeightsDesc,
+                        std::make_shared<DefaultSwWeightsContent>(weights->content()));
+                } else if (isConv1x1) {
+                    swWeights = _model->duplicateData(
+                        weights,
+                        "@SW",
+                        newWeightsDesc,
+                        weights->content());
+                } else if (isConv3x3) {
+                    swWeights = _model->duplicateData(
+                        weights,
+                        "@SW",
+                        newWeightsDesc,
+                        std::make_shared<Conv3x3WeightsContent>(weights->content()));
+                } else {
+                    swWeights = _model->duplicateData(
+                        weights,
+                        "@SW",
+                        newWeightsDesc,
+                        std::make_shared<ConvIm2ColWeightsContent>(weights->content()));
+                }
+
+                weights->attrs().set<Data>("swWeights", swWeights);
+            }
+        } else if (input->desc().dimsOrder().dimInd(Dim::C) == 2) {
+            //
+            // CHW case
+            //
+
+            auto isConv1x1 = attrs().get<bool>("isConv1x1");
+
+            if (_type == StageType::Im2ColConvolution) {
+                // Transform CHW "Im2ColConvolution" into CHW "Conv"
+                _type = StageType::Conv;
+            }
+
+            swWeights = weights->attrs().getOrDefault<Data>("swWeights", nullptr);
+            if (swWeights == nullptr) {
+                DataDesc newWeightsDesc({
+                    kernelSizeX * kernelSizeY,
+                    input->desc().dim(Dim::C),
+                    output->desc().dim(Dim::C)});
+
+                if (isConv1x1) {
+                    swWeights = _model->duplicateData(
+                        weights,
+                        "@SW",
+                        newWeightsDesc,
+                        weights->content());
+                } else {
+                    swWeights = _model->duplicateData(
+                        weights,
+                        "@SW",
+                        newWeightsDesc,
+                        std::make_shared<ConvCHWWeightsContent>(weights->content()));
+                }
+
+                weights->attrs().set<Data>("swWeights", swWeights);
+            }
+        }
+
+        IE_ASSERT(swWeights != nullptr);
+
+        _model->replaceStageInput(_inputEdges[1], swWeights);
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto weights = _inputEdges[1]->input();
+        auto biases = _inputEdges[2]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        IE_ASSERT(weights->usage() == DataUsage::Const);
+        IE_ASSERT(biases->usage() == DataUsage::Const || biases->usage() == DataUsage::Fake);
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto kernelSizeX = attrs().get<int>("kernelSizeX");
+        auto kernelSizeY = attrs().get<int>("kernelSizeY");
+        auto kernelStrideX = attrs().get<int>("kernelStrideX");
+        auto kernelStrideY = attrs().get<int>("kernelStrideY");
+        auto padLeft = attrs().get<int>("padLeft");
+        auto padTop = attrs().get<int>("padTop");
+        auto dilationX = attrs().get<int>("dilationX");
+        auto dilationY = attrs().get<int>("dilationY");
+
+        serializer.append(static_cast<uint32_t>(kernelSizeX));
+        serializer.append(static_cast<uint32_t>(kernelSizeY));
+        serializer.append(static_cast<uint32_t>(kernelStrideX));
+        serializer.append(static_cast<uint32_t>(kernelStrideY));
+        serializer.append(static_cast<uint32_t>(padLeft));
+        serializer.append(static_cast<uint32_t>(padTop));
+        serializer.append(static_cast<uint32_t>(dilationX));
+        serializer.append(static_cast<uint32_t>(dilationY));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto weights = _inputEdges[1]->input();
+        auto biases = _inputEdges[2]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeOldBuffer(handle_from_this(), serializer);
+        output->serializeOldBuffer(handle_from_this(), serializer);
+        weights->serializeOldBuffer(handle_from_this(), serializer);
+
+        if (!_tempBufferEdges.empty()) {
+            _tempBufferEdges[0]->tempBuffer()->serializeOldBuffer(handle_from_this(), serializer);
+        }
+
+        // TODO: remove this
+        biases->serializeOldBuffer(handle_from_this(), serializer);
+    }
+};
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(swConvAdaptation);
+
+    for (const auto& stage : model->getStages()) {
+        if (stage->type() != StageType::StubConv)
+            continue;
+
+        auto origStageName = stage->name();
+        auto origLayer = stage->origLayer();
+
+        auto input = stage->input(0);
+        auto weights = stage->input(1);
+        auto biases = stage->input(2);
+        auto output = stage->output(0);
+
+        auto kernelSizeX = stage->attrs().get<int>("kernelSizeX");
+        auto kernelSizeY = stage->attrs().get<int>("kernelSizeY");
+        auto kernelStrideX = stage->attrs().get<int>("kernelStrideX");
+        auto kernelStrideY = stage->attrs().get<int>("kernelStrideY");
+        auto padLeft = stage->attrs().get<int>("padLeft");
+        auto padRight = stage->attrs().get<int>("padRight");
+        auto padTop = stage->attrs().get<int>("padTop");
+        auto padBottom = stage->attrs().get<int>("padBottom");
+        auto dilationX = stage->attrs().get<int>("dilationX");
+        auto dilationY = stage->attrs().get<int>("dilationY");
+        auto groupSize = stage->attrs().get<int>("groupSize");
+
+        model->removeStage(stage);
+
+        bool isFC = (
+            kernelSizeX == 1 && kernelSizeY == 1 &&
+            kernelStrideX == 1 && kernelStrideY == 1 &&
+            padLeft == 0 && padRight == 0 && padTop == 0 && padBottom == 0 &&
+            dilationX == 1 && dilationY == 1 &&
+            input->desc().dim(Dim::W) == 1 && input->desc().dim(Dim::H) == 1 &&
+            output->desc().dim(Dim::W) == 1 && output->desc().dim(Dim::H) == 1);
+
+        bool isConv1x1 = (
+            kernelSizeX == 1 && kernelSizeY == 1 &&
+            dilationX == 1 && dilationY == 1 &&
+            !isFC);
+
+        bool isConv3x3 = (
+            kernelSizeX == 3 && kernelSizeY == 3 &&
+            (input->desc().dim(Dim::C) / groupSize) > 3 &&
+            ((input->desc().dim(Dim::C) / groupSize) * (output->desc().dim(Dim::C) / groupSize)) > 256);
+
+        bool isKernelSizeMatchSpatial = (
+            kernelSizeX > 1 && kernelSizeX < 12 && kernelSizeX % 2 == 1);
+
+        bool isSpatialConv = (
+            isKernelSizeMatchSpatial &&
+            kernelSizeY != 1 &&  // kernelSizeX != 1 is already guaranteed by isKernelSizeMatchSpatial
+            ((input->desc().dim(Dim::C) / groupSize) * (output->desc().dim(Dim::C) / groupSize)) <= 256 &&
+            groupSize == 1);
+
+#if REFERENCE_CONVOLUTION
+        isSpatialConv = false;
+        isConv3x3 = false;
+        isConv1x1 = false;
+#endif
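+
+        // Illustrative walk-through: a 3x3 convolution over a spatial
+        // (non-1x1) input with group 1 and 64 input/output channels gives
+        // isFC = false, isConv1x1 = false, isConv3x3 = true (64 > 3 and
+        // 64 * 64 > 256), and isSpatialConv = false (the channel product
+        // exceeds 256), so the stage takes the direct Conv path below.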
+
+        if (groupSize == 1) {
+            if (isFC) {
+                _stageBuilder->addSwFullyConnectedStage(
+                    model,
+                    origStageName,
+                    origLayer,
+                    input,
+                    weights,
+                    biases,
+                    output);
+            } else {
+                if (biases->usage() != DataUsage::Fake) {
+                    auto tempOutput = model->duplicateData(
+                        output,
+                        "@temp");
+
+                    _stageBuilder->addBiasStage(
+                        model,
+                        origStageName + "@biases",
+                        origLayer,
+                        tempOutput, biases,
+                        output);
+
+                    output = tempOutput;
+                }
+
+                Stage swStage;
+                if (isConv1x1 || isSpatialConv || isConv3x3) {
+                    swStage = model->addNewStage<ConvStage>(
+                        origStageName,
+                        StageType::Conv,
+                        origLayer,
+                        {input, weights, biases},
+                        {output});
+                } else {
+                    swStage = model->addNewStage<ConvStage>(
+                        origStageName,
+#if REFERENCE_CONVOLUTION
+                        StageType::RefConvolution,
+#else
+                        StageType::Im2ColConvolution,
+#endif
+                        origLayer,
+                        {input, weights, biases},
+                        {output});
+
+                    double im2ColBufSizeF = static_cast<double>(kernelSizeX) * kernelSizeY *
+                        output->desc().dim(Dim::W) * output->desc().dim(Dim::H) * input->desc().dim(Dim::C)
+                        + 32;
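+
+                    // Illustrative sizing: a 3x3 kernel over a 56x56 output
+                    // with 64 input channels needs 3 * 3 * 56 * 56 * 64 + 32
+                    // = 1806368 fp16 elements of im2col scratch space.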
+
+                    if (im2ColBufSizeF >= std::numeric_limits<int>::max()) {
+                        VPU_THROW_EXCEPTION << "stage: " << origStageName << ", im2col buffer size does not fit into a 32-bit integer: "
+                            << std::setprecision(0) << std::fixed << im2ColBufSizeF
+                            << " (" << kernelSizeX << "x" << kernelSizeY << "x"
+                            << output->desc().dim(Dim::W) << "x" << output->desc().dim(Dim::H) << "x" << input->desc().dim(Dim::C) << ")";
+                    }
+
+                    model->addTempBuffer(swStage, DataDesc({static_cast<int>(im2ColBufSizeF)}));
+                }
+
+                swStage->attrs().set<int>("kernelSizeX", kernelSizeX);
+                swStage->attrs().set<int>("kernelSizeY", kernelSizeY);
+
+                swStage->attrs().set<int>("kernelStrideX", kernelStrideX);
+                swStage->attrs().set<int>("kernelStrideY", kernelStrideY);
+
+                swStage->attrs().set<int>("padLeft", padLeft);
+                swStage->attrs().set<int>("padRight", padRight);
+                swStage->attrs().set<int>("padTop", padTop);
+                swStage->attrs().set<int>("padBottom", padBottom);
+
+                swStage->attrs().set<int>("dilationX", dilationX);
+                swStage->attrs().set<int>("dilationY", dilationY);
+
+                swStage->attrs().set<bool>("isSpatialConv", isSpatialConv);
+                swStage->attrs().set<bool>("isConv1x1", isConv1x1);
+                swStage->attrs().set<bool>("isConv3x3", isConv3x3);
+            }
+        } else if (groupSize == input->desc().dim(Dim::C) &&
+                   groupSize == output->desc().dim(Dim::C)) {
+            if (biases->usage() != DataUsage::Fake) {
+                auto tempOutput = model->duplicateData(
+                    output,
+                    "@temp");
+
+                _stageBuilder->addBiasStage(
+                    model,
+                    origStageName + "@biases",
+                    origLayer,
+                    tempOutput, biases,
+                    output);
+
+                output = tempOutput;
+            }
+
+            auto swStage = model->addNewStage<ConvStage>(
+                origStageName,
+                StageType::DepthConv,
+                origLayer,
+                {input, weights, biases},
+                {output});
+
+            swStage->attrs().set<int>("kernelSizeX", kernelSizeX);
+            swStage->attrs().set<int>("kernelSizeY", kernelSizeY);
+
+            swStage->attrs().set<int>("kernelStrideX", kernelStrideX);
+            swStage->attrs().set<int>("kernelStrideY", kernelStrideY);
+
+            swStage->attrs().set<int>("padLeft", padLeft);
+            swStage->attrs().set<int>("padRight", padRight);
+            swStage->attrs().set<int>("padTop", padTop);
+            swStage->attrs().set<int>("padBottom", padBottom);
+
+            swStage->attrs().set<int>("dilationX", dilationX);
+            swStage->attrs().set<int>("dilationY", dilationY);
+        } else {
+            VPU_THROW_EXCEPTION << "Internal error : grouped convolution was not processed";
+        }
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::swConvAdaptation() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/sw_deconv_adaptation.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/sw_deconv_adaptation.cpp
new file mode 100644 (file)
index 0000000..dc4fb9e
--- /dev/null
@@ -0,0 +1,524 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <vector>
+#include <string>
+#include <memory>
+#include <unordered_set>
+#include <set>
+
+#include <ie_parallel.hpp>
+
+#include <vpu/sw/utility.hpp>
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+namespace {
+
+void depthDeconvolutionRelayoutCHW(
+        const fp16_t* src, int src_size,
+        fp16_t* dst, int dst_size,
+        int KX, int KY,
+        int channels) {
+    ie::parallel_for3d(channels, KY, KX, [=](int c, int ky, int kx) {
+        int iidx = c * KX * KY + ky * KX + kx;
+        IE_ASSERT(iidx >= 0 && iidx < src_size);
+
+        int inv_kx = KX - kx - 1;
+        int inv_ky = KY - ky - 1;
+        int oidx = c * KX * KY + inv_ky * KX + inv_kx;
+        IE_ASSERT(oidx >= 0 && oidx < dst_size);
+
+        dst[oidx] = src[iidx];
+    });
+}
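+
+// Illustratively, the relayout above rotates each KXxKY kernel by 180
+// degrees within its channel: for KX = KY = 3 the tap at (kx, ky) = (0, 0)
+// lands at (2, 2), which matches the usual kernel flip used to express a
+// transposed convolution as a direct one.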
+
+class DepthDeconvolutionCHWWeightsContent final : public CalculatedDataContent {
+public:
+    DepthDeconvolutionCHWWeightsContent(
+            const DataContent::Ptr& origContent,
+            int KX, int KY, int channels) :
+            CalculatedDataContent({origContent}),
+            _KX(KX), _KY(KY), _channels(channels) {
+    }
+
+protected:
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
+        VPU_PROFILE(DepthDeconvolutionCHWWeightsContent);
+        depthDeconvolutionRelayoutCHW(
+            baseContents[0]->get<fp16_t>(), _desc.totalDimSize(),
+            static_cast<fp16_t*>(tempBuf), _desc.totalDimSize(),
+            _KX, _KY, _channels);
+    }
+
+private:
+    int _KX;
+    int _KY;
+    int _channels;
+};
+
+void depthDeconvolutionRelayoutHWC(
+        const fp16_t* src, int src_size,
+        fp16_t* dst, int dst_size,
+        int KX, int KY,
+        int channels) {
+    ie::parallel_for3d(channels, KY, KX, [=](int c, int ky, int kx) {
+        int iidx = c * KX * KY + ky * KX + kx;
+        IE_ASSERT(iidx < src_size);
+
+        int inv_kx = KX - kx - 1;
+        int inv_ky = KY - ky - 1;
+        int oidx = inv_ky * KX * channels + inv_kx * channels + c;
+        IE_ASSERT(oidx < dst_size);
+
+        dst[oidx] = src[iidx];
+    });
+}
+
+class DepthDeconvolutionHWCWeightsContent final : public CalculatedDataContent {
+public:
+    DepthDeconvolutionHWCWeightsContent(
+            const DataContent::Ptr& origContent,
+            int KX, int KY, int channels) :
+            CalculatedDataContent({origContent}),
+            _KX(KX), _KY(KY), _channels(channels) {
+    }
+
+protected:
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
+        VPU_PROFILE(DepthDeconvolutionHWCWeightsContent);
+        depthDeconvolutionRelayoutHWC(
+            baseContents[0]->get<fp16_t>(), _desc.totalDimSize(),
+            static_cast<fp16_t*>(tempBuf), _desc.totalDimSize(),
+            _KX, _KY, _channels);
+    }
+
+private:
+    int _KX;
+    int _KY;
+    int _channels;
+};
+
+void deconvolutionRelayout(
+    const fp16_t* src, int src_size,
+    fp16_t* dst, int dst_size,
+    int KX, int KY,
+    int IC, int OC) {
+    ie::parallel_for4d(OC, IC, KY, KX, [=](int oc, int ic, int ky, int kx) {
+        int iidx = ic * OC * KY * KX
+                 + oc * KY * KX
+                 + ky * KX
+                 + kx;
+        IE_ASSERT(iidx >= 0 && iidx < src_size);
+
+        int inv_kx = KX - kx - 1;
+        int inv_ky = KY - ky - 1;
+        int oidx = oc * IC * KY * KX
+                 + ic * KY * KX
+                 + inv_ky * KX
+                 + inv_kx;
+        IE_ASSERT(oidx >= 0 && oidx < dst_size);
+
+        dst[oidx] = src[iidx];
+    });
+}
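+
+// Illustrative index walk with KX = KY = 2, IC = 2, OC = 4: the source
+// element (ic = 1, oc = 0, ky = 0, kx = 1) sits at iidx = 1*16 + 0 + 0 + 1
+// = 17 and moves to oidx = 0 + 1*4 + 1*2 + 0 = 6, i.e. the IC/OC axes are
+// swapped and the kernel is flipped along both spatial directions.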
+
+class DeconvolutionWeightsContent final : public CalculatedDataContent {
+public:
+    DeconvolutionWeightsContent(
+            const DataContent::Ptr& origContent,
+            int KX, int KY,
+            int IC, int OC) :
+            CalculatedDataContent({origContent}),
+            _KX(KX), _KY(KY),
+            _IC(IC), _OC(OC) {
+    }
+
+protected:
+    size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>&) const override {
+        return 2 * _desc.totalDimSize() * sizeof(fp16_t);
+    }
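+    // Twice the blob size on purpose: fillTempBuf() relayouts into the
+    // second half first and then writes the final HWKC order into the
+    // first half.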
+
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
+        VPU_PROFILE(DeconvolutionWeightsContent);
+
+        auto dstPtr = static_cast<fp16_t*>(tempBuf);
+        auto dstPtr2 = dstPtr + _desc.totalDimSize();
+
+        deconvolutionRelayout(
+            baseContents[0]->get<fp16_t>(), _desc.totalDimSize(),
+            dstPtr2, _desc.totalDimSize(),
+            _KX, _KY,
+            _IC, _OC);
+
+        kchw_to_hwkc(dstPtr2, dstPtr, _desc);
+    }
+
+private:
+    int _KX;
+    int _KY;
+    int _IC;
+    int _OC;
+};
+
+class DeconvStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<DeconvStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto weights = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto finalOrder = input->desc().dimsOrder();
+        if (finalOrder.dimInd(Dim::C) == 1) {
+            // HCW -> CHW
+            finalOrder.moveDim(Dim::C, 2);
+        }
+
+        DataMap<DimsOrder> out;
+
+        if (_type == StageType::DepthDeconv) {
+            if (finalOrder != input->desc().dimsOrder()) {
+                out[input] = finalOrder;
+            }
+            out[output] = finalOrder;
+        } else {
+            out[input] = finalOrder.createMovedDim(Dim::C, 0);
+            out[output] = finalOrder.createMovedDim(Dim::C, 0);
+        }
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto weights = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto finalOrder = input->desc().dimsOrder();
+        if (finalOrder.dimInd(Dim::C) == 1) {
+            // HCW -> CHW
+            finalOrder.moveDim(Dim::C, 2);
+        }
+
+        DataMap<StridesRequirement> out;
+
+        if (_type == StageType::DepthDeconv) {
+            if (finalOrder.dimInd(Dim::C) == 0) {
+                // HWC
+                out[input] = StridesRequirement::compact();
+                out[output] = StridesRequirement::compact();
+            }
+        } else {
+            out[input] = StridesRequirement::compact();
+            out[output] = StridesRequirement::compact();
+        }
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto weights = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto kernelSizeX = attrs().get<int>("kernelSizeX");
+        auto kernelSizeY = attrs().get<int>("kernelSizeY");
+
+        Data swWeights;
+
+        if (_type == StageType::DepthDeconv) {
+            if (input->desc().dimsOrder().dimInd(Dim::C) == 0) {
+                //
+                // HWC case
+                //
+
+                swWeights = weights->attrs().getOrDefault<Data>("swWeights", nullptr);
+                if (swWeights == nullptr) {
+                    DataDesc newWeightsDesc({
+                        kernelSizeX * kernelSizeY,
+                        1,
+                        output->desc().dim(Dim::C)});
+
+                    swWeights = _model->duplicateData(
+                        weights,
+                        "@SW",
+                        newWeightsDesc,
+                        std::make_shared<DepthDeconvolutionHWCWeightsContent>(
+                            weights->content(),
+                            kernelSizeX, kernelSizeY,
+                            output->desc().dim(Dim::C)));
+
+                    weights->attrs().set<Data>("swWeights", swWeights);
+                }
+            } else if (input->desc().dimsOrder().dimInd(Dim::C) == 2) {
+                //
+                // CHW case
+                //
+
+                swWeights = weights->attrs().getOrDefault<Data>("swWeights", nullptr);
+                if (swWeights == nullptr) {
+                    DataDesc newWeightsDesc({
+                        kernelSizeX * kernelSizeY,
+                        1,
+                        output->desc().dim(Dim::C)});
+
+                    swWeights = _model->duplicateData(
+                        weights,
+                        "@SW",
+                        newWeightsDesc,
+                        std::make_shared<DepthDeconvolutionCHWWeightsContent>(
+                            weights->content(),
+                            kernelSizeX, kernelSizeY,
+                            output->desc().dim(Dim::C)));
+
+                    weights->attrs().set<Data>("swWeights", swWeights);
+                }
+            }
+        } else {
+            swWeights = weights->attrs().getOrDefault<Data>("swWeights", nullptr);
+            if (swWeights == nullptr) {
+                DataDesc newWeightsDesc({
+                    kernelSizeX * kernelSizeY,
+                    input->desc().dim(Dim::C),
+                    output->desc().dim(Dim::C)});
+
+                swWeights = _model->duplicateData(
+                    weights,
+                    "@SW",
+                    newWeightsDesc,
+                    std::make_shared<DeconvolutionWeightsContent>(
+                        weights->content(),
+                        kernelSizeX, kernelSizeY,
+                        input->desc().dim(Dim::C),
+                        output->desc().dim(Dim::C)));
+
+                weights->attrs().set<Data>("swWeights", swWeights);
+            }
+        }
+
+        IE_ASSERT(swWeights != nullptr);
+
+        _model->replaceStageInput(_inputEdges[1], swWeights);
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto weights = _inputEdges[1]->input();
+        auto biases = _inputEdges[2]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        IE_ASSERT(weights->usage() == DataUsage::Const);
+        IE_ASSERT(biases->usage() == DataUsage::Const || biases->usage() == DataUsage::Fake);
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto kernelSizeX = attrs().get<int>("kernelSizeX");
+        auto kernelSizeY = attrs().get<int>("kernelSizeY");
+        auto kernelStrideX = attrs().get<int>("kernelStrideX");
+        auto kernelStrideY = attrs().get<int>("kernelStrideY");
+        auto padLeft = attrs().get<int>("padLeft");
+        auto padTop = attrs().get<int>("padTop");
+        auto dilationX = attrs().get<int>("dilationX");
+        auto dilationY = attrs().get<int>("dilationY");
+
+        serializer.append(static_cast<uint32_t>(kernelSizeX));
+        serializer.append(static_cast<uint32_t>(kernelSizeY));
+        serializer.append(static_cast<uint32_t>(kernelStrideX));
+        serializer.append(static_cast<uint32_t>(kernelStrideY));
+        serializer.append(static_cast<uint32_t>(padLeft));
+        serializer.append(static_cast<uint32_t>(padTop));
+        serializer.append(static_cast<uint32_t>(dilationX));
+        serializer.append(static_cast<uint32_t>(dilationY));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto weights = _inputEdges[1]->input();
+        auto biases = _inputEdges[2]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeOldBuffer(handle_from_this(), serializer);
+        output->serializeOldBuffer(handle_from_this(), serializer);
+        weights->serializeOldBuffer(handle_from_this(), serializer);
+
+        if (!_tempBufferEdges.empty()) {
+            _tempBufferEdges[0]->tempBuffer()->serializeOldBuffer(handle_from_this(), serializer);
+        }
+
+        // TODO: remove this
+        biases->serializeOldBuffer(handle_from_this(), serializer);
+    }
+};
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(swDeconvAdaptation);
+
+    for (const auto& stage : model->getStages()) {
+        if (stage->type() != StageType::StubDeconv)
+            continue;
+
+        auto input = stage->input(0);
+        auto weights = stage->input(1);
+        auto biases = stage->input(2);
+        auto output = stage->output(0);
+
+        auto kernelSizeX = stage->attrs().get<int>("kernelSizeX");
+        auto kernelSizeY = stage->attrs().get<int>("kernelSizeY");
+        auto kernelStrideX = stage->attrs().get<int>("kernelStrideX");
+        auto kernelStrideY = stage->attrs().get<int>("kernelStrideY");
+        auto padLeft = stage->attrs().get<int>("padLeft");
+        auto padRight = stage->attrs().get<int>("padRight");
+        auto padTop = stage->attrs().get<int>("padTop");
+        auto padBottom = stage->attrs().get<int>("padBottom");
+        auto dilationX = stage->attrs().get<int>("dilationX");
+        auto dilationY = stage->attrs().get<int>("dilationY");
+        auto groupSize = stage->attrs().get<int>("groupSize");
+
+        model->disconnectStageDatas(stage);
+
+        if (groupSize == 0 ||
+            (groupSize > input->desc().dim(Dim::C)) ||
+            (input->desc().dim(Dim::C) % groupSize != 0) ||
+            (groupSize > output->desc().dim(Dim::C)) ||
+            (output->desc().dim(Dim::C) % groupSize != 0)) {
+            VPU_THROW_EXCEPTION << "DeconvolutionLayer has invalid group value";
+        }
+
+        if (groupSize == 1) {
+            if (biases->usage() != DataUsage::Fake) {
+                auto tempOutput = model->duplicateData(
+                    output,
+                    "@temp");
+
+                _stageBuilder->addBiasStage(
+                    model,
+                    stage->name() + "@biases",
+                    stage->origLayer(),
+                    tempOutput, biases,
+                    output);
+
+                output = tempOutput;
+            }
+
+            auto swStage = model->addNewStage<DeconvStage>(
+                stage->name(),
+                StageType::Deconvolution,
+                stage->origLayer(),
+                {input, weights, biases},
+                {output});
+
+            swStage->attrs().set<int>("kernelSizeX", kernelSizeX);
+            swStage->attrs().set<int>("kernelSizeY", kernelSizeY);
+
+            swStage->attrs().set<int>("kernelStrideX", kernelStrideX);
+            swStage->attrs().set<int>("kernelStrideY", kernelStrideY);
+
+            swStage->attrs().set<int>("padLeft", padLeft);
+            swStage->attrs().set<int>("padRight", padRight);
+            swStage->attrs().set<int>("padTop", padTop);
+            swStage->attrs().set<int>("padBottom", padBottom);
+
+            swStage->attrs().set<int>("dilationX", dilationX);
+            swStage->attrs().set<int>("dilationY", dilationY);
+        } else if (groupSize == input->desc().dim(Dim::C) &&
+                   groupSize == output->desc().dim(Dim::C)) {
+            if (biases->usage() != DataUsage::Fake) {
+                auto tempOutput = model->duplicateData(
+                    output,
+                    "@temp");
+
+                _stageBuilder->addBiasStage(
+                    model,
+                    stage->name() + "@biases",
+                    stage->origLayer(),
+                    tempOutput, biases,
+                    output);
+
+                output = tempOutput;
+            }
+
+            auto swStage = model->addNewStage<DeconvStage>(
+                stage->name(),
+                StageType::DepthDeconv,
+                stage->origLayer(),
+                {input, weights, biases},
+                {output});
+
+            swStage->attrs().set<int>("kernelSizeX", kernelSizeX);
+            swStage->attrs().set<int>("kernelSizeY", kernelSizeY);
+
+            swStage->attrs().set<int>("kernelStrideX", kernelStrideX);
+            swStage->attrs().set<int>("kernelStrideY", kernelStrideY);
+
+            swStage->attrs().set<int>("padLeft", padLeft);
+            swStage->attrs().set<int>("padRight", padRight);
+            swStage->attrs().set<int>("padTop", padTop);
+            swStage->attrs().set<int>("padBottom", padBottom);
+
+            swStage->attrs().set<int>("dilationX", dilationX);
+            swStage->attrs().set<int>("dilationY", dilationY);
+        } else {
+            VPU_THROW_EXCEPTION << "Internal error : grouped deconvolution was not processed";
+        }
+
+        model->removeStage(stage);
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::swDeconvAdaptation() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/sw_fc_adaptation.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/sw_fc_adaptation.cpp
new file mode 100644 (file)
index 0000000..28c5ebe
--- /dev/null
@@ -0,0 +1,238 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <vector>
+#include <memory>
+#include <string>
+#include <set>
+
+#include <vpu/sw/utility.hpp>
+
+namespace vpu {
+
+namespace {
+
+class FullyConnectedStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<FullyConnectedStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[input] = input->desc().dimsOrder().createMovedDim(Dim::C, 0);
+        out[output] = output->desc().dimsOrder().createMovedDim(Dim::C, 0);
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<StridesRequirement> out;
+
+        out[input] = StridesRequirement::compact();
+        out[output] = StridesRequirement::compact();
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto weights = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto swWeights = weights->attrs().getOrDefault<Data>("swWeights", nullptr);
+        if (swWeights == nullptr) {
+            swWeights = _model->duplicateData(
+                weights,
+                "@SW",
+                weights->desc(),
+                std::make_shared<DefaultSwWeightsContent>(weights->content()));
+
+            weights->attrs().set<Data>("swWeights", swWeights);
+        }
+
+        _model->replaceStageInput(_inputEdges[1], swWeights);
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto weights = _inputEdges[1]->input();
+        auto biases = _inputEdges[2]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        IE_ASSERT(weights->usage() == DataUsage::Const);
+        IE_ASSERT(biases->usage() == DataUsage::Const || biases->usage() == DataUsage::Fake);
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer&) const override {
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto weights = _inputEdges[1]->input();
+        auto biases = _inputEdges[2]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeOldBuffer(handle_from_this(), serializer);
+
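+        // A 2-D NC output (batch 1) is re-described below as a 3-D HWC
+        // buffer with the unit batch dim mapped onto W, so the serializer
+        // can treat the fully-connected output like any other 3-D blob.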
+        if (output->desc().dimsOrder() == DimsOrder::NC) {
+            IE_ASSERT(output->desc().dim(Dim::N) == 1);
+
+            output->serializeOldBuffer(
+                handle_from_this(),
+                serializer,
+                DimsOrder::HWC,
+                {
+                    {Dim::W, {Dim::N}},
+                    {Dim::C, {Dim::C}}
+                });
+        } else {
+            output->serializeOldBuffer(handle_from_this(), serializer);
+        }
+
+        weights->serializeOldBuffer(handle_from_this(), serializer);
+
+        // TODO: remove this
+        biases->serializeOldBuffer(handle_from_this(), serializer);
+    }
+};
+
+class PassImpl final : public Pass {
+public:
+    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
+
+    void run(const Model::Ptr& model) override;
+
+private:
+    StageBuilder::Ptr _stageBuilder;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(swFullyConnectedAdaptation);
+
+    for (const auto& stage : model->getStages()) {
+        if (stage->type() != StageType::StubFullyConnected)
+            continue;
+
+        auto input = stage->input(0);
+        auto weights = stage->input(1);
+        auto biases = stage->input(2);
+        auto output = stage->output(0);
+
+        model->disconnectStageDatas(stage);
+
+        if (biases->usage() != DataUsage::Fake) {
+            auto tempOutput = model->duplicateData(
+                output,
+                "@temp");
+
+            _stageBuilder->addBiasStage(
+                model,
+                stage->name() + "@biases",
+                stage->origLayer(),
+                tempOutput, biases,
+                output);
+
+            output = tempOutput;
+        }
+
+        model->addNewStage<FullyConnectedStage>(
+            stage->name(),
+            StageType::FC,
+            stage->origLayer(),
+            {input, weights, biases},
+            {output});
+
+        model->removeStage(stage);
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::swFullyConnectedAdaptation() {
+    return std::make_shared<PassImpl>(_stageBuilder);
+}
+
+Stage StageBuilder::addSwFullyConnectedStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        const Data& input,
+        const Data& weights,
+        const Data& biases,
+        Data output) {
+    auto fcWeights = model->duplicateData(
+        weights,
+        "@fc",
+        DataDesc({
+            input->desc().dim(Dim::W, 1) * input->desc().dim(Dim::H, 1),
+            input->desc().dim(Dim::C),
+            output->desc().dim(Dim::C)}));
+
+    if (biases->usage() != DataUsage::Fake) {
+        auto tempOutput = model->duplicateData(
+            output,
+            "@temp");
+
+        addBiasStage(
+            model,
+            name + "@biases",
+            layer,
+            tempOutput, biases,
+            output);
+
+        output = tempOutput;
+    }
+
+    auto fcStage = model->addNewStage<FullyConnectedStage>(
+        name,
+        StageType::FC,
+        layer,
+        {input, fcWeights, biases},
+        {output});
+
+    return fcStage;
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/sw_pooling_adaptation.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/sw_pooling_adaptation.cpp
new file mode 100644 (file)
index 0000000..134d4d8
--- /dev/null
@@ -0,0 +1,232 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <vector>
+#include <unordered_set>
+#include <memory>
+#include <set>
+
+namespace vpu {
+
+namespace {
+
+class PoolStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<PoolStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        auto finalOrder = input->desc().dimsOrder();
+        if (input->desc().dim(Dim::N, 1) > 1) {
+            // To merge batch into channels
+            finalOrder = finalOrder.createMovedDim(Dim::C, 2);
+        }
+
+        out[input] = finalOrder;
+        out[output] = finalOrder;
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto dimsOrder = input->desc().dimsOrder();
+
+        StridesRequirement reqs;
+
+        if (input->desc().dim(Dim::N, 1) > 1) {
+            // To merge batch into previous dimension.
+            reqs.add(dimsOrder.dimInd(Dim::N), DimStride::Compact);
+        }
+
+        DataMap<StridesRequirement> out;
+
+        out[input] = reqs;
+        out[output] = reqs;
+
+        //
+        // * AvgPool/MaxPool support both YXZ and ZYX orders:
+        //   * ZYX versions support both input and output strides.
+        //   * YXZ versions support only output strides.
+        // * GlobalPooling supports both 3D/4D layouts.
+        //
+
+        if (_type == StageType::MaxPool || _type == StageType::AvgPool) {
+            if (dimsOrder.dimInd(Dim::C) == 0) {
+                out[input] = StridesRequirement::compact();
+            }
+        }
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        // Pooling supports batch by merging it into the previous dimension.
+        return DataMap<BatchSupport>();
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto kernelSizeX = attrs().get<int>("kernelSizeX");
+        auto kernelSizeY = attrs().get<int>("kernelSizeY");
+        auto kernelStrideX = attrs().get<int>("kernelStrideX");
+        auto kernelStrideY = attrs().get<int>("kernelStrideY");
+        auto padLeft = attrs().get<int>("padLeft");
+        auto padTop = attrs().get<int>("padTop");
+        auto excludePad = attrs().get<bool>("excludePad");
+
+        serializer.append(static_cast<uint32_t>(kernelSizeX));
+        serializer.append(static_cast<uint32_t>(kernelSizeY));
+        serializer.append(static_cast<uint32_t>(kernelStrideX));
+        serializer.append(static_cast<uint32_t>(kernelStrideY));
+        serializer.append(static_cast<uint32_t>(padLeft));
+        serializer.append(static_cast<uint32_t>(padTop));
+        serializer.append(static_cast<uint32_t>(excludePad));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        if (_type == StageType::GlobalMaxPool ||
+            _type == StageType::GlobalAvgPool) {
+            input->serializeNewBuffer(serializer);
+            output->serializeNewBuffer(serializer);
+        } else {
+            auto perm = input->desc().dimsOrder().toPermutation();
+            IE_ASSERT(perm.size() == 4);
+            IE_ASSERT(perm.back() == Dim::N);
+
+            perm.pop_back();
+
+            input->serializeOldBuffer(
+                handle_from_this(),
+                serializer,
+                DimsOrder::fromPermutation(perm),
+                {
+                    {perm[2], {perm[2], Dim::N}},
+                    {perm[1], {perm[1]}},
+                    {perm[0], {perm[0]}}
+                });
+
+            output->serializeOldBuffer(
+                handle_from_this(),
+                serializer,
+                DimsOrder::fromPermutation(perm),
+                {
+                    {perm[2], {perm[2], Dim::N}},
+                    {perm[1], {perm[1]}},
+                    {perm[0], {perm[0]}}
+                });
+        }
+    }
+};
+
+class PassImpl final : public Pass {
+public:
+    void run(const Model::Ptr& model) override;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(swPoolAdaptation);
+
+    for (const auto& stage : model->getStages()) {
+        if (stage->type() != StageType::StubMaxPool &&
+            stage->type() != StageType::StubAvgPool) {
+            continue;
+        }
+
+        auto input = stage->input(0);
+        auto output = stage->output(0);
+
+        auto kernelSizeX = stage->attrs().get<int>("kernelSizeX");
+        auto kernelSizeY = stage->attrs().get<int>("kernelSizeY");
+        auto kernelStrideX = stage->attrs().get<int>("kernelStrideX");
+        auto kernelStrideY = stage->attrs().get<int>("kernelStrideY");
+        auto padLeft = stage->attrs().get<int>("padLeft");
+        auto padRight = stage->attrs().get<int>("padRight");
+        auto padTop = stage->attrs().get<int>("padTop");
+        auto padBottom = stage->attrs().get<int>("padBottom");
+        auto excludePad = stage->attrs().get<bool>("excludePad");
+
+        model->disconnectStageDatas(stage);
+
+        auto stageType = StageType::None;
+        if (stage->type() == StageType::StubMaxPool) {
+            if (padLeft == 0 && padRight == 0 && padTop == 0 && padBottom == 0 &&
+                output->desc().dim(Dim::W) == 1 && output->desc().dim(Dim::H) == 1) {
+                stageType = StageType::GlobalMaxPool;
+            } else {
+                stageType = StageType::MaxPool;
+            }
+        } else {
+            if (padLeft == 0 && padRight == 0 && padTop == 0 && padBottom == 0 &&
+                output->desc().dim(Dim::W) == 1 && output->desc().dim(Dim::H) == 1) {
+                stageType = StageType::GlobalAvgPool;
+            } else {
+                stageType = StageType::AvgPool;
+            }
+        }
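+        // E.g. a stride-1 7x7 AvgPool over a 7x7 input with no padding
+        // yields a 1x1 output and is dispatched as GlobalAvgPool; padded or
+        // spatially larger outputs fall back to the generic kernels.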
+
+        auto swStage = model->addNewStage<PoolStage>(
+            stage->name(),
+            stageType,
+            stage->origLayer(),
+            {input},
+            {output});
+
+        swStage->attrs().set<int>("kernelSizeX", kernelSizeX);
+        swStage->attrs().set<int>("kernelSizeY", kernelSizeY);
+
+        swStage->attrs().set<int>("kernelStrideX", kernelStrideX);
+        swStage->attrs().set<int>("kernelStrideY", kernelStrideY);
+
+        swStage->attrs().set<int>("padLeft", padLeft);
+        swStage->attrs().set<int>("padRight", padRight);
+        swStage->attrs().set<int>("padTop", padTop);
+        swStage->attrs().set<int>("padBottom", padBottom);
+
+        swStage->attrs().set<bool>("excludePad", excludePad);
+
+        model->removeStage(stage);
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::swPoolAdaptation() {
+    return std::make_shared<PassImpl>();
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/swap_concat_and_hw_ops.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/swap_concat_and_hw_ops.cpp
new file mode 100644 (file)
index 0000000..9c931b0
--- /dev/null
@@ -0,0 +1,156 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <cmath>
+
+#include <vector>
+#include <unordered_set>
+#include <memory>
+#include <utility>
+
+namespace vpu {
+
+namespace {
+
+class PassImpl final : public Pass {
+public:
+    void run(const Model::Ptr& model) override;
+};
+
+void PassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(swapConcatAndHwOps);
+
+    for (const auto& concatStage : model->getStages()) {
+        if (concatStage == nullptr)
+            continue;
+
+        if (concatStage->type() != StageType::Concat)
+            continue;
+
+        IE_ASSERT(concatStage->numInputs() > 0);
+        IE_ASSERT(concatStage->numOutputs() == 1);
+
+        auto concatOutput = concatStage->output(0);
+
+        //
+        // Check concat axis
+        //
+
+        // TODO: other cases?
+        auto concatAxis = concatStage->attrs().getOrDefault<Dim>("axis", Dim::Invalid);
+        if (concatAxis != Dim::C) {
+            continue;
+        }
+
+        //
+        // All concat inputs must be used by concat only
+        //
+
+        bool concatIsTheOnlyConsumer = true;
+        for (const auto& concatInput : concatStage->inputs()) {
+            if (concatInput->numConsumers() != 1) {
+                concatIsTheOnlyConsumer = false;
+                break;
+            }
+        }
+        if (!concatIsTheOnlyConsumer) {
+            continue;
+        }
+
+        //
+        // Collect next stages (HW Pool and ReLU)
+        //
+
+        StageVector nextStages;
+        nextStages.reserve(2);
+
+        for (auto curOutput = concatOutput;;) {
+            if (curOutput->usage() != DataUsage::Intermediate) {
+                break;
+            }
+
+            if (curOutput->numConsumers() != 1) {
+                break;
+            }
+
+            auto curConsumer = curOutput->singleConsumer();
+            auto curConsumerHW = curConsumer->attrs().getOrDefault<bool>("tryHW", false);
+
+            if (curConsumer->type() == StageType::StubMaxPool && curConsumerHW) {
+                // OK
+            } else if (curConsumer->type() == StageType::Relu ||
+                       curConsumer->type() == StageType::LeakyRelu) {
+                // OK
+            } else {
+                break;
+            }
+
+            nextStages.emplace_back(curConsumer);
+
+            curOutput = curConsumer->output(0);
+        }
+
+        if (nextStages.empty())
+            continue;
+
+        //
+        // Swap next stages and concat
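+        //
+        // E.g. Concat(C) -> ReLU -> MaxPool(HW) is rewritten into
+        // per-input ReLU -> MaxPool chains feeding a single trailing
+        // Concat, so the HW-friendly ops run on the narrower per-branch
+        // tensors.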
+        //
+
+        auto lastInputs = toVector(concatStage->inputs(), concatStage->numInputs());
+        auto lastOutput = concatOutput;
+
+        for (const auto& nextStage : nextStages) {
+            auto nextOutput = nextStage->output(0);
+
+            model->disconnectStageDatas(nextStage);
+
+            DataVector newOutputs;
+            newOutputs.reserve(lastInputs.size());
+
+            int subInd = 0;
+            for (const auto& curInput : lastInputs) {
+                auto postfix = formatString("@sub=%d/%d", subInd + 1, lastInputs.size());
+
+                auto newDesc = nextOutput->desc();
+                newDesc.setDim(Dim::C, curInput->desc().dim(Dim::C));
+
+                auto newOutput = model->duplicateData(
+                    nextOutput,
+                    postfix,
+                    newDesc);
+
+                model->duplicateStage(
+                    nextStage->name() + postfix,
+                    nextStage,
+                    {curInput},
+                    {newOutput});
+
+                newOutputs.emplace_back(std::move(newOutput));
+
+                ++subInd;
+            }
+
+            model->removeStage(nextStage);
+
+            lastInputs.swap(newOutputs);
+            lastOutput = nextOutput;
+        }
+
+        for (const auto& inEdge : concatStage->inputEdges()) {
+            model->replaceStageInput(inEdge, lastInputs.at(inEdge->portInd()));
+        }
+        model->replaceStageOutput(concatStage->outputEdge(0), lastOutput);
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::swapConcatAndHwOps() {
+    return std::make_shared<PassImpl>();
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/passes/weights_analysis.cpp b/inference-engine/src/vpu/graph_transformer/src/passes/weights_analysis.cpp
new file mode 100644 (file)
index 0000000..4b37be4
--- /dev/null
@@ -0,0 +1,327 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/pass_manager.hpp>
+
+#include <cmath>
+
+#include <sstream>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include <tuple>
+#include <string>
+#include <algorithm>
+#include <limits>
+#include <memory>
+#include <list>
+#include <set>
+
+#include <precision_utils.h>
+
+#include <vpu/utils/numeric.hpp>
+
+#include <details/caseless.hpp>
+
+namespace vpu {
+
+namespace {
+
+const short largestExp   = 15;
+const short smallestExp  = -15;
+const short exponentBias = 15;
+
+short calculateExponent(short src) {
+    const short numberBitsFP16Mantissa = 10;
+    const short numberBitsFP16Exponent = 5;
+    src = src >> numberBitsFP16Mantissa;
+
+    short exponent = (src & ((1 << numberBitsFP16Exponent) - 1));
+    exponent -= exponentBias;
+    return exponent;
+}
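+
+// Worked example: fp16 1.0 is 0x3C00; shifting out the 10 mantissa bits
+// leaves 0x0F, the 5-bit mask keeps 15, and subtracting the bias of 15
+// gives an unbiased exponent of 0. Zeros and denormals map to -15.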
+
+std::vector<short> calculateExponents(const fp16_t* srcPtr, int count) {
+    std::vector<short> exponents(count);
+
+    for (int i = 0; i < count; ++i) {
+        exponents[i] = calculateExponent(srcPtr[i]);
+    }
+
+    return exponents;
+}
+
+short getModeValue(const std::vector<short>& exponents) {
+    const int countExps = 32;
+    std::vector<int> count(countExps, 0);
+
+    for (int i = 0; i < exponents.size(); i++) {
+        count[exponents[i] + exponentBias]++;
+    }
+
+    // Pick the most frequent (modal) exponent.
+    int modeIndex = 0;
+    int modeValue = 0;
+
+    for (int i = 0; i < countExps; i++) {
+        if (count[i] > modeValue) {
+            modeValue = count[i];
+            modeIndex = i;
+        }
+    }
+    return modeIndex - exponentBias;
+}
+
+int getMeanValue(const std::vector<short>& exponents) {
+    double sum = 0;
+    int realSize = 0;
+    for (int i = 0; i < exponents.size(); i++) {
+        if (exponents[i] != smallestExp) {
+            sum += exponents[i];
+            realSize++;
+        }
+    }
+
+    if (realSize == 0) {
+        return smallestExp;
+    } else {
+        return static_cast<int>(sum / realSize);
+    }
+}
+
+bool isScalable(const Stage& stage) {
+    if (stage->type() != StageType::StubConv &&
+        stage->type() != StageType::StubFullyConnected &&
+        stage->type() != StageType::StubDeconv) {
+        return false;
+    }
+
+    return stage->attrs().getOrDefault<bool>("tryHW", false);
+}
+
+class SingleScalePassImpl final : public Pass {
+public:
+    void run(const Model::Ptr& model) override;
+};
+
+void SingleScalePassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(estimateSingleNetworkScale);
+
+    if (model->attrs().get<int>("numInputs") > 1) {
+        return;
+    }
+
+    std::vector<short> modeExponents;
+    std::vector<int> meanValueExponents;
+
+    modeExponents.reserve(model->numStages());
+    meanValueExponents.reserve(model->numStages());
+
+    long long int bigAcc = 0;
+    int realSize = 0;
+    const int thresholdExp = -5;
+
+    for (const auto& stage : model->getStages()) {
+        if (!isScalable(stage)) {
+            continue;
+        }
+
+        auto weights = stage->input(1);
+
+        auto content = weights->content();
+        IE_ASSERT(content != nullptr);
+
+        auto weightsVals = content->get<fp16_t>();
+        IE_ASSERT(weightsVals != nullptr);
+
+        auto exponents = calculateExponents(weightsVals, weights->desc().totalDimSize());
+
+        modeExponents.emplace_back(getModeValue(exponents));
+        meanValueExponents.emplace_back(getMeanValue(exponents));
+
+        for (int i = 0; i < exponents.size(); i++) {
+            if (exponents[i] != smallestExp) {
+                bigAcc += exponents[i];
+                realSize++;
+            }
+        }
+    }
+
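+    // If the first scalable layer's mean weight exponent is below the
+    // threshold, store the negated network-wide average exponent as
+    // "inputShift" (e.g. an average exponent of -8 gives inputShift = 8,
+    // suggesting a 2^8 input scale downstream).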
+    if (!meanValueExponents.empty() && meanValueExponents[0] < thresholdExp && realSize != 0) {
+        model->attrs().set<int>("inputShift", (-1) * bigAcc / realSize);
+    }
+}
+
+bool checkGrowingOutput(const Model::Ptr& model) {
+    bool removeScale = true;
+
+    for (const auto& stage : model->getStages()) {
+        auto inputScale = stage->name().find("@SCALE=");
+        auto fusedScaleShift = stage->name().find("FusedScaleShift_");
+        auto fusedPowerShift = stage->name().find("FusedPower_");
+        auto addScale = stage->name().find("Add_");
+
+        if (inputScale != std::string::npos ||
+            fusedPowerShift != std::string::npos) {
+            if (stage->type() == StageType::Power) {
+                auto powerScale = stage->attrs().get<float>("scale");
+                if (powerScale < 0.125f) {
+                    removeScale = false;
+                    break;
+                }
+            }
+        }
+
+        if (fusedScaleShift != std::string::npos ||
+            addScale != std::string::npos) {
+            if (stage->type() == StageType::ScaleShift) {
+                auto scales = stage->input(1);
+
+                auto content = scales->content();
+                IE_ASSERT(content != nullptr);
+
+                auto scalesVals = content->get<fp16_t>();
+                IE_ASSERT(scalesVals != nullptr);
+
+                for (int i = 0; i < scales->desc().totalDimSize(); ++i) {
+                    if (ie::PrecisionUtils::f16tof32(scalesVals[i]) < 0.125f) {
+                        removeScale = false;
+                        break;
+                    }
+                }
+
+                if (!removeScale) {
+                    break;
+                }
+            }
+        }
+    }
+
+    return removeScale;
+}
+
+class PerLayerScalePassImpl final : public Pass {
+public:
+    void run(const Model::Ptr& model) override;
+};
+
+
+int correctShift(int shift, bool firstStage, const std::string& type) {
+    auto caselessEq = InferenceEngine::details::CaselessEq<std::string>();
+
+    if (firstStage && shift > 10) {
+        shift -= 8;
+    }
+
+    if (caselessEq(type, "Convolution") || caselessEq(type, "Deconvolution")) {
+        shift = std::min(shift, 8);
+    } else if (caselessEq(type, "FullyConnected")) {
+        shift = std::min(shift, 9);
+    }
+
+    return shift;
+}
+
+int maxOutputExponent(const std::string& name, const InferenceEngine::NetworkStatsMap& stats) {
+    auto node_stats_it = stats.find(name);
+    IE_ASSERT(node_stats_it != stats.end());
+
+    auto& max = node_stats_it->second->_maxOutputs;
+    auto& min = node_stats_it->second->_minOutputs;
+
+    IE_ASSERT(max.size() > 0 && min.size() > 0);
+    auto max_value = *std::max_element(max.begin(), max.end());
+    auto min_value = *std::min_element(min.begin(), min.end());
+
+    max_value = std::max(fabsf(max_value), fabsf(min_value));
+    IE_ASSERT(max_value > 0);
+    int exp = 0;
+
+    // frexp splits a float into a fraction and an exponent:
+    //   value = fraction * 2^exp, with fraction in [0.5, 1),
+    // while IEEE floats store the significand in [1, 2):
+    //   value = significand * 2^f_exp,
+    // so the exp returned by frexp is f_exp + 1.
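+    // Example: frexp(6.0f, &exp) returns 0.75 with exp = 3, while
+    // 6.0 = 1.5 * 2^2, i.e. f_exp = 2 = exp - 1.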
+    frexp(max_value, &exp);
+    return exp - 1;
+}
+
+void PerLayerScalePassImpl::run(const Model::Ptr& model) {
+    VPU_PROFILE(analyzeWeightableLayers);
+
+    static const int scaleToExp     = 8;  // get from config?
+    static const int scaleThreshold = 1;
+
+    auto& stats  = model->nodesStats();
+
+    bool isGrowingOutput = checkGrowingOutput(model);
+
+    bool firstStage = true;
+    int  normalVal  = 0;
+
+    for (const auto& stage : model->getStages()) {
+        if (!isScalable(stage)) {
+            continue;
+        }
+
+        auto weights = stage->input(1);
+
+        auto content = weights->content();
+        IE_ASSERT(content != nullptr);
+
+        auto weightsVals = content->get<fp16_t>();
+        IE_ASSERT(weightsVals != nullptr);
+
+        auto exponents = calculateExponents(weightsVals, weights->desc().totalDimSize());
+
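+        // Pick a power-of-two scale: leave headroom up to the largest fp16
+        // exponent (15 - maxExp) without pushing the mean exponent above
+        // zero (-meanExp). E.g. maxExp = -3 and meanExp = -6 give
+        // min(18, 6) = 6, i.e. a candidate scale of 2^6 before the
+        // corrections below.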
+        int maxExp = *std::max_element(exponents.begin(), exponents.end());
+        int shift  = largestExp - maxExp;
+
+        auto meanExp = getMeanValue(exponents);
+        shift        = std::min(-meanExp, shift);
+
+        if (stats.empty()) {
+            if (firstStage && shift < 4 && isGrowingOutput) {
+                normalVal = 5;
+            }
+
+            shift  = correctShift(shift, firstStage, stage->origLayer()->type);
+            shift -= normalVal;
+        } else {
+            int outExp = maxOutputExponent(stage->origLayer()->name, stats);  // what if outExp == 15?
+            shift      = std::min(scaleToExp - outExp, shift);
+        }
+
+        firstStage = false;
+        float scale = 1;
+        if (shift > scaleThreshold) {
+            scale = 1 << shift;
+        }
+
+        stage->attrs().set<float>("scaleFactor", scale);
+    }
+}
+
+}  // namespace
+
+Pass::Ptr PassManager::estimateSingleNetworkScale() {
+    return std::make_shared<SingleScalePassImpl>();
+}
+
+Pass::Ptr PassManager::analyzeWeightableLayers() {
+    return std::make_shared<PerLayerScalePassImpl>();
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/argmax.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/argmax.cpp
new file mode 100644 (file)
index 0000000..18cb8d1
--- /dev/null
@@ -0,0 +1,124 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+#include <set>
+
+#include <vpu/sw/post_op_stage.hpp>
+
+namespace vpu {
+
+namespace {
+
+class ArgMaxStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ArgMaxStage>(*this);
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        auto has_axis = attrs().get<bool>("has_axis");
+        if (has_axis) {
+            out[output] = input->desc().dimsOrder();
+        } else {
+            // axis < 0 requires flattening, so only the NCHW layout is supported
+            out[input] = DimsOrder::fromNumDims(input->desc().numDims());
+            out[output] = DimsOrder::fromNumDims(output->desc().numDims());
+        }
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        return DataMap<StridesRequirement>();
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto out_max_val = attrs().get<int32_t>("out_max_val");
+        auto top_k = attrs().get<int32_t>("top_k");
+        auto has_axis = attrs().get<bool>("has_axis");
+        int axis_index = -1;
+        if (has_axis) {
+            auto axis = attrs().get<Dim>("axis");
+            axis_index = input->desc().dimsOrder().dimInd(axis);
+        }
+
+        // NOTE: when axis is passed into VPU, it's not an index, but a name
+        //       with meaning like 0:N, 1:C, 2:H, 3:W
+        serializer.append(static_cast<int32_t>(out_max_val));
+        serializer.append(static_cast<int32_t>(top_k));
+        serializer.append(static_cast<int32_t>(axis_index));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeNewBuffer(serializer);
+        output->serializeNewBuffer(serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseArgMax(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto stage = model->addNewStage<ArgMaxStage>(
+        layer->name,
+        StageType::ArgMax,
+        layer,
+        inputs,
+        outputs);
+
+    stage->attrs().set<int32_t>("out_max_val", layer->GetParamAsInt("out_max_val"));
+    stage->attrs().set<int32_t>("top_k", layer->GetParamAsInt("top_k"));
+
+    int axis = layer->GetParamAsInt("axis", -1);
+    if (axis >= 0) {
+        auto perm = DimsOrder::fromNumDims(inputs[0]->desc().numDims()).toPermutation();
+        auto axisDim = perm[outputs[0]->desc().numDims() - 1 - axis];
+        stage->attrs().set<bool>("has_axis", true);
+        stage->attrs().set<Dim>("axis", axisDim);
+    } else {
+        stage->attrs().set<bool>("has_axis", false);
+    }
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/batch_norm.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/batch_norm.cpp
new file mode 100644 (file)
index 0000000..29e0f27
--- /dev/null
@@ -0,0 +1,135 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <cmath>
+
+#include <vector>
+#include <memory>
+
+#include <precision_utils.h>
+#include <ie_parallel.hpp>
+
+#include <vpu/utils/ie_helpers.hpp>
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+namespace {
+
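+// Batch normalization is folded into a scale/bias pair: the layer's weights
+// hold the per-channel variance and its biases the mean, so
+//   scale = 1 / sqrt(var + eps),  shift = beta - mean * scale,
+// and scale * x + shift == (x - mean) / sqrt(var + eps) + beta
+// (beta is hard-coded to 0 for now, see the TODO below).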
+class BatchNormalizationWeightsContent final : public CalculatedDataContent {
+public:
+    BatchNormalizationWeightsContent(
+            const DataContent::Ptr& origContent,
+            float epsilon) :
+            CalculatedDataContent({origContent}), _epsilon(epsilon) {
+    }
+
+protected:
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
+        VPU_PROFILE(BatchNormalizationWeightsContent);
+
+        auto srcPtr = baseContents[0]->get<fp16_t>();
+        auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+        ie::parallel_for(_desc.totalDimSize(), [this, srcPtr, dstPtr](int i) {
+            float val = ie::PrecisionUtils::f16tof32(srcPtr[i]) + _epsilon;
+            val = 1.0f / std::sqrt(val);
+            dstPtr[i] = ie::PrecisionUtils::f32tof16(val);
+        });
+    }
+
+private:
+    float _epsilon;
+};
+
+class BatchNormalizationBiasesContent final : public CalculatedDataContent {
+public:
+    BatchNormalizationBiasesContent(
+            const DataContent::Ptr& origContent,
+            const DataContent::Ptr& weightsContent) :
+            CalculatedDataContent({origContent, weightsContent}) {
+    }
+
+protected:
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
+        VPU_PROFILE(BatchNormalizationBiasesContent);
+
+        auto origPtr = baseContents[0]->get<fp16_t>();
+        auto weightsPtr = baseContents[1]->get<fp16_t>();
+
+        auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+        ie::parallel_for(_desc.totalDimSize(), [origPtr, weightsPtr, dstPtr](int i) {
+            // TODO: beta needs to be extracted from the IE layer.
+            float beta = 0.0f;
+
+            auto wVal = ie::PrecisionUtils::f16tof32(weightsPtr[i]);
+            dstPtr[i] = ie::PrecisionUtils::f32tof16(beta - wVal * ie::PrecisionUtils::f16tof32(origPtr[i]));
+        });
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseBatchNorm(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto layer = std::dynamic_pointer_cast<ie::BatchNormalizationLayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+
+    auto input = inputs[0];
+    auto output = outputs[0];
+
+    Data origWeights, origBiases;
+    std::tie(origWeights, origBiases) = getWeightsAndBiases(model, layer);
+
+    IE_ASSERT(origWeights->desc().totalDimSize() >= input->desc().dim(Dim::C));
+    auto weights = model->duplicateData(
+        origWeights,
+        "@batch-norm",
+        DataDesc({input->desc().dim(Dim::C)}),
+        std::make_shared<BatchNormalizationWeightsContent>(
+            origWeights->content(),
+            layer->epsilon));
+
+    if (origBiases->usage() != DataUsage::Fake) {
+        IE_ASSERT(origBiases->desc().totalDimSize() >= input->desc().dim(Dim::C));
+        auto biases = model->duplicateData(
+            origBiases,
+            "@batch-norm",
+            DataDesc({input->desc().dim(Dim::C)}),
+            std::make_shared<BatchNormalizationBiasesContent>(
+                origBiases->content(),
+                weights->content()));
+
+        auto tempOutput = model->duplicateData(
+            output,
+            "@temp");
+
+        _stageBuilder->addBiasStage(
+            model,
+            layer->name,
+            layer,
+            tempOutput, biases,
+            output);
+
+        output = tempOutput;
+    }
+
+    _stageBuilder->addScaleStage(
+        model,
+        layer->name,
+        layer,
+        input, weights,
+        output);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/bias.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/bias.cpp
new file mode 100644 (file)
index 0000000..4023af3
--- /dev/null
@@ -0,0 +1,123 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+#include <string>
+
+#include <vpu/sw/post_op_stage.hpp>
+
+namespace vpu {
+
+void FrontEnd::parseBias(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 2);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto input = inputs[0];
+    auto biases = inputs[1];
+
+    auto biasesDims = biases->desc().dims();
+    if (biasesDims.size() < 4 && input->desc().numDims() == 4) {
+        biasesDims.set(Dim::N, 1);
+    }
+
+    if (input->desc().dims() != biasesDims) {
+        VPU_THROW_EXCEPTION
+            << "Current Bias layer implementation supports only equal inputs (axis 0, 1 for 4D tensor, axis 0 for other dimensions),"
+            << " layer name is " << layer->name;
+    }
+
+    if (biases->desc().numDims() < 4 && input->desc().numDims() == 4) {
+        DataDesc newBiasesDesc({
+            biases->desc().dim(Dim::W),
+            biases->desc().dim(Dim::H),
+            biases->desc().dim(Dim::C),
+            1});
+
+        auto newBiases = model->duplicateData(
+            biases,
+            "@reshaped",
+            newBiasesDesc);
+
+        _stageBuilder->addReshapeStage(
+            model,
+            newBiases->name(),
+            layer,
+            biases,
+            newBiases);
+
+        biases = newBiases;
+    }
+
+    _stageBuilder->addSumStage(
+        model,
+        layer->name,
+        layer,
+        input, biases,
+        outputs[0]);
+}
+
+namespace {
+
+class BiasStage final : public PostOpStage {
+protected:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<BiasStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto biases = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        if (step == ScalePropagationStep::Propagate) {
+            auto inputScale = inputScales.at(input);
+
+            out[biases] = inputScale;
+            out[output] = inputScale;
+        } else {
+            // Bias can only propagate scaling, not generate.
+            out[input] = 1.0f;
+            out[biases] = 1.0f;
+            out[output] = 1.0f;
+        }
+
+        return out;
+    }
+
+    void serializeParamsImpl(BlobSerializer&) const override {
+    }
+};
+
+}  // namespace
+
+Stage StageBuilder::addBiasStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        const Data& input,
+        const Data& biases,
+        const Data& output) {
+    return model->addNewStage<BiasStage>(
+        name,
+        StageType::Bias,
+        layer,
+        {input, biases},
+        {output});
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/clamp.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/clamp.cpp
new file mode 100644 (file)
index 0000000..9b64485
--- /dev/null
@@ -0,0 +1,98 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+#include <set>
+#include <string>
+
+#include <vpu/sw/post_op_stage.hpp>
+
+namespace vpu {
+
+namespace {
+
+class ClampStage final : public PostOpStage {
+protected:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ClampStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        if (step == ScalePropagationStep::Propagate) {
+            auto inputScale = inputScales.at(input);
+
+            out[output] = inputScale;
+
+            attrs().get<float>("min_value") *= inputScale;
+            attrs().get<float>("max_value") *= inputScale;
+        } else {
+            // Clamp can only propagate scaling, not generate.
+            out[input] = 1.0f;
+            out[output] = 1.0f;
+        }
+
+        return out;
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto min_value = attrs().get<float>("min_value");
+        auto max_value = attrs().get<float>("max_value");
+
+        serializer.append(static_cast<float>(min_value));
+        serializer.append(static_cast<float>(max_value));
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseClamp(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto layer = std::dynamic_pointer_cast<ie::ClampLayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+
+    _stageBuilder->addClampStage(model, layer->name, layer, layer->min_value, layer->max_value, inputs[0], outputs[0]);
+}
+
+Stage StageBuilder::addClampStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        float min,
+        float max,
+        const Data& input,
+        const Data& output) {
+    auto stage = model->addNewStage<ClampStage>(
+        name,
+        StageType::Clamp,
+        layer,
+        {input},
+        {output});
+
+    stage->attrs().set<float>("min_value", min);
+    stage->attrs().set<float>("max_value", max);
+
+    return stage;
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/concat.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/concat.cpp
new file mode 100644 (file)
index 0000000..3a45276
--- /dev/null
@@ -0,0 +1,290 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <limits>
+#include <string>
+#include <algorithm>
+#include <memory>
+#include <set>
+#include <unordered_set>
+
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+namespace {
+
+class ConcatStage final : public StageNode {
+protected:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ConcatStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override {
+        IE_ASSERT(!_inputEdges.empty());
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        if (step == ScalePropagationStep::Propagate) {
+            // Keep the largest input scale factor.
+            auto maxScale = std::numeric_limits<float>::lowest();
+            for (const auto& inEdge : _inputEdges) {
+                maxScale = std::max(maxScale, inputScales.at(inEdge->input()));
+            }
+
+            IE_ASSERT(maxScale > 0.0f);
+
+            for (const auto& inEdge : _inputEdges) {
+                auto curScale = inputScales.at(inEdge->input());
+
+                if (!isFloatEqual(curScale, maxScale)) {
+                    out[inEdge->input()] = maxScale / curScale;
+                }
+            }
+
+            out[output] = maxScale;
+        } else {
+            // Concat can only propagate scaling.
+            for (const auto& inEdge : _inputEdges) {
+                out[inEdge->input()] = 1.0f;
+            }
+
+            out[output] = 1.0f;
+        }
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(!_inputEdges.empty());
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto output = _outputEdges[0]->output();
+
+        DimsOrderMap<int> dimsOrderVotes;
+        for (const auto& inEdge : _inputEdges) {
+            dimsOrderVotes[inEdge->input()->desc().dimsOrder()]++;
+        }
+
+        // Select DimsOrder with most votes.
+        // For equal votes: HCW > CHW > HWC.
+
+        DimsOrder finalOrder;
+        int curVotes = -1;
+        for (const auto& p : dimsOrderVotes) {
+            if (p.second > curVotes) {
+                finalOrder = p.first;
+                curVotes = p.second;
+            } else if (p.second == curVotes) {
+                if (p.first.numDims() >= 3) {
+                    if (p.first.dimInd(Dim::C) == 2) {
+                        finalOrder = p.first;
+                    } else if (p.first.dimInd(Dim::C) == 3 &&
+                               finalOrder.dimInd(Dim::C) != 2) {
+                        finalOrder = p.first;
+                    }
+                }
+            }
+        }
+
+        IE_ASSERT(finalOrder.numDims() > 0);
+        IE_ASSERT(curVotes > 0);
+
+        DataMap<DimsOrder> out;
+
+        for (const auto& inEdge : _inputEdges) {
+            out[inEdge->input()] = finalOrder;
+        }
+
+        out[output] = finalOrder;
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(!_inputEdges.empty());
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto output = _outputEdges[0]->output();
+
+        auto dimsOrder = output->desc().dimsOrder();
+
+        //
+        // Get smallest Dim over which Concat is done.
+        //
+
+        auto minConcatDimInd = dimsOrder.numDims();
+
+        for (const auto& inEdge : _inputEdges) {
+            auto input = inEdge->input();
+
+            for (const auto& p : output->desc().dims()) {
+                if (input->desc().dim(p.first) != p.second) {
+                    minConcatDimInd = std::min(minConcatDimInd, dimsOrder.dimInd(p.first));
+                }
+            }
+        }
+
+        IE_ASSERT(minConcatDimInd < dimsOrder.numDims());
+
+        //
+        // Initial StridesRequirement for inputs and output.
+        //
+
+        auto outputReqs = output->requiredStrides();
+
+        auto inputReqs = outputReqs;
+        for (int i = minConcatDimInd + 1; i < dimsOrder.numDims(); ++i) {
+            inputReqs.remove(i);
+        }
+
+        //
+        // Merge input StridesRequirement.
+        //
+
+        for (const auto& inEdge : _inputEdges) {
+            auto curInput = inEdge->input();
+            auto curInputReqs = curInput->requiredStrides();
+
+            for (int i = 0; i < minConcatDimInd + 1; ++i) {
+                if (outputReqs.get(i) == DimStride::Any) {
+                    if (curInputReqs.get(i) != DimStride::Any) {
+                        inputReqs.add(i, curInputReqs.get(i));
+                        outputReqs.add(i, curInputReqs.get(i));
+                    }
+                }
+            }
+        }
+
+        //
+        // Merge output consumers StridesRequirement.
+        //
+
+        for (const auto& consumer : output->consumers()) {
+            auto consumerInfo = consumer->getDataStridesRequirements();
+
+            auto consumerStrideIt = consumerInfo.find(output);
+            if (consumerStrideIt != consumerInfo.end()) {
+                auto consumerReqs = consumerStrideIt->second;
+
+                for (int i = 0; i < minConcatDimInd + 1; ++i) {
+                    if (outputReqs.get(i) == DimStride::Any) {
+                        if (consumerReqs.get(i) != DimStride::Any) {
+                            inputReqs.add(i, consumerReqs.get(i));
+                            outputReqs.add(i, consumerReqs.get(i));
+                        }
+                    }
+                }
+            }
+        }
+
+        //
+        // Return merged StridesRequirement.
+        //
+
+        DataMap<StridesRequirement> out;
+
+        for (const auto& inEdge : _inputEdges) {
+            auto input = inEdge->input();
+            out[input] = inputReqs;
+        }
+        out[output] = outputReqs;
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer&) const override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+
+    void serializeDataImpl(BlobSerializer&) const override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseConcat(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(!inputs.empty());
+    IE_ASSERT(outputs.size() == 1);
+
+    auto output = outputs[0];
+
+    auto layer = std::dynamic_pointer_cast<ie::ConcatLayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+
+    IE_ASSERT(layer->_axis < output->desc().numDims());
+
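+    // Convert the IE axis (counted from the major dimension) to a vpu Dim;
+    // assuming toPermutation() lists dims minor-first, a 4D output with
+    // _axis == 1 (C in NCHW) selects perm[4 - 1 - 1] == Dim::C.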
+    auto perm = DimsOrder::fromNumDims(output->desc().numDims()).toPermutation();
+    auto axis = perm[output->desc().numDims() - 1 - layer->_axis];
+
+    _stageBuilder->addConcatStage(model, layer->name, layer, axis, inputs, output);
+}
+
+Stage StageBuilder::addConcatStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        Dim axis,
+        const DataVector& inputs,
+        const Data& output) {
+    std::vector<DimValues> offsets;
+
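+    // Accumulate per-input offsets along the concat axis: e.g. concatenating
+    // inputs with C = 3 and C = 5 over Dim::C yields offsets {C:0} and {C:3}.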
+    DimValues curOffset({{axis, 0}});
+    for (const auto& input : inputs) {
+        offsets.emplace_back(curOffset);
+        curOffset.set(axis, curOffset[axis] + input->desc().dim(axis));
+    }
+
+    auto stage = addConcatStage(model, name, layer, offsets, inputs, output);
+
+    stage->attrs().set("axis", axis);
+
+    return stage;
+}
+
+Stage StageBuilder::addConcatStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        const std::vector<DimValues>& offsets,
+        const DataVector& inputs,
+        const Data& output) {
+    IE_ASSERT(offsets.size() == inputs.size());
+
+    auto stage = model->addNewStage<ConcatStage>(
+        name,
+        StageType::Concat,
+        layer,
+        inputs,
+        {output});
+
+    stage->attrs().set<std::vector<DimValues>>("offsets", offsets);
+
+    return stage;
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/convolution.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/convolution.cpp
new file mode 100644 (file)
index 0000000..745a27c
--- /dev/null
@@ -0,0 +1,148 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <string>
+#include <memory>
+#include <unordered_set>
+#include <tuple>
+#include <set>
+
+#include <ie_layers_internal.hpp>
+
+#include <vpu/compile_env.hpp>
+#include <vpu/stub_stage.hpp>
+
+namespace vpu {
+
+void FrontEnd::parseConvolution(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    const auto& env = CompileEnv::get();
+
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto input = inputs[0];
+    auto output = outputs[0];
+
+    if (!(input->desc().numDims() == 3 || input->desc().numDims() == 4)) {
+        VPU_THROW_EXCEPTION << "Convolution supports only 3D or 4D input";
+    }
+    if (output->desc().numDims() != input->desc().numDims()) {
+        VPU_THROW_EXCEPTION << "Convolution supports only same num dims in input and output";
+    }
+
+    //
+    // Extract parameters
+    //
+
+    auto convLayer = std::dynamic_pointer_cast<ie::ConvolutionLayer>(layer);
+    IE_ASSERT(convLayer != nullptr);
+
+    int kernelSizeX = convLayer->_kernel_x;
+    int kernelSizeY = convLayer->_kernel_y;
+
+    int kernelStrideX = convLayer->_stride_x;
+    int kernelStrideY = convLayer->_stride_y;
+
+    auto paddings = getPaddings(*convLayer);
+    int padLeft = paddings.begin.exist(ie::X_AXIS) ? paddings.begin[ie::X_AXIS] : 0;
+    int padRight = paddings.end.exist(ie::X_AXIS) ? paddings.end[ie::X_AXIS] : padLeft;
+    int padTop = paddings.begin.exist(ie::Y_AXIS) ? paddings.begin[ie::Y_AXIS] : 0;
+    int padBottom = paddings.end.exist(ie::Y_AXIS) ? paddings.end[ie::Y_AXIS] : padTop;
+
+    int dilationX = convLayer->_dilation_x;
+    int dilationY = convLayer->_dilation_y;
+
+    int groupSize = convLayer->_group;
+
+    //
+    // Check if HW is applicable
+    //
+
+    auto tryHW = env.config.hwOptimization;
+
+    if (kernelStrideX != kernelStrideY) {
+        tryHW = false;
+    }
+
+    // TODO: support dilated convolution
+    if (dilationX != 1 || dilationY != 1) {
+        tryHW = false;
+    }
+
+    if (kernelSizeX > 15 || kernelSizeY > 15 || kernelStrideX > 8) {
+        tryHW = false;
+    }
+
+    if (env.netConfig.hwDisabled(layer->name)) {
+        tryHW = false;
+    }
+
+    if (output->desc().numDims() < 4) {
+        tryHW = false;
+    }
+
+    //
+    // Create const data
+    //
+
+    Data weights, biases;
+    std::tie(weights, biases) = getWeightsAndBiases(model, layer);
+
+    IE_ASSERT(weights->desc().totalDimSize() >=
+              kernelSizeX * kernelSizeY * (input->desc().dim(Dim::C) / groupSize) * output->desc().dim(Dim::C));
+    weights = model->duplicateData(
+        weights,
+        "@conv",
+        DataDesc({
+            kernelSizeX,
+            kernelSizeY,
+            input->desc().dim(Dim::C) / groupSize,
+            output->desc().dim(Dim::C)}));
+
+    if (biases->usage() != DataUsage::Fake) {
+        IE_ASSERT(biases->desc().totalDimSize() >= output->desc().dim(Dim::C));
+        biases = model->duplicateData(
+            biases,
+            "@conv",
+            DataDesc({output->desc().dim(Dim::C)}));
+    }
+
+    //
+    // Create stub stage
+    //
+
+    auto stage = model->addNewStage<StubStage>(
+        layer->name,
+        StageType::StubConv,
+        layer,
+        {input, weights, biases},
+        {output});
+
+    stage->attrs().set<int>("kernelSizeX", kernelSizeX);
+    stage->attrs().set<int>("kernelSizeY", kernelSizeY);
+
+    stage->attrs().set<int>("kernelStrideX", kernelStrideX);
+    stage->attrs().set<int>("kernelStrideY", kernelStrideY);
+
+    stage->attrs().set<int>("padLeft", padLeft);
+    stage->attrs().set<int>("padRight", padRight);
+    stage->attrs().set<int>("padTop", padTop);
+    stage->attrs().set<int>("padBottom", padBottom);
+
+    stage->attrs().set<int>("dilationX", dilationX);
+    stage->attrs().set<int>("dilationY", dilationY);
+
+    stage->attrs().set<int>("groupSize", groupSize);
+
+    stage->attrs().set<bool>("tryHW", tryHW);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/copy.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/copy.cpp
new file mode 100644 (file)
index 0000000..b18d88c
--- /dev/null
@@ -0,0 +1,155 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <string>
+#include <vector>
+#include <list>
+#include <unordered_set>
+#include <memory>
+
+namespace vpu {
+
+void FrontEnd::parseCopy(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    _stageBuilder->addCopyStage(model, layer->name, layer, inputs[0], outputs[0]);
+}
+
+namespace {
+
+class CopyStage final : public StageNode {
+protected:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<CopyStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        if (step == ScalePropagationStep::Propagate) {
+            out[output] = inputScales.at(input);
+        } else {
+            // Copy can only propagate scaling.
+            out[input] = 1.0f;
+            out[output] = 1.0f;
+        }
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[output] = input->desc().dimsOrder();
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<StridesRequirement> out;
+
+        out[input] = StridesRequirement().remove(0);
+        out[output] = StridesRequirement().remove(0);
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override {
+        return StageSHAVEsRequirements::NotNeeded;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer&) const override {
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        if (input->desc().dimsOrder() == DimsOrder::NC) {
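+            // For non-compact NC tensors, fall back to the old buffer
+            // serialization, viewing the 2D tensor as CHW with N mapped to C
+            // and C mapped to H.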
+            if (!input->checkStrides(StridesRequirement().add(0, DimStride::Compact)) ||
+                !output->checkStrides(StridesRequirement().add(0, DimStride::Compact))) {
+                input->serializeOldBuffer(
+                    handle_from_this(),
+                    serializer,
+                    DimsOrder::CHW,
+                    {
+                        {Dim::C, {Dim::N}},
+                        {Dim::H, {Dim::C}},
+                    });
+
+                output->serializeOldBuffer(
+                    handle_from_this(),
+                    serializer,
+                    DimsOrder::CHW,
+                    {
+                        {Dim::C, {Dim::N}},
+                        {Dim::H, {Dim::C}},
+                    });
+
+                return;
+            }
+        }
+
+        input->serializeNewBuffer(serializer);
+        output->serializeNewBuffer(serializer);
+    }
+};
+
+}  // namespace
+
+Stage StageBuilder::addCopyStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        const Data& input,
+        const Data& output) {
+    return model->addNewStage<CopyStage>(
+        name,
+        StageType::Copy,
+        layer,
+        {input},
+        {output});
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/crop.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/crop.cpp
new file mode 100644 (file)
index 0000000..86f49c2
--- /dev/null
@@ -0,0 +1,174 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+
+namespace vpu {
+
+namespace {
+
+class CropStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<CropStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override {
+        IE_ASSERT(_inputEdges.size() >= 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input0 = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        if (step == ScalePropagationStep::Propagate) {
+            auto inputScale = inputScales.at(input0);
+
+            out[output] = inputScale;
+        } else {
+            // Crop can only propagate scaling, not generate.
+
+            for (const auto& inEdge : _inputEdges) {
+                out[inEdge->input()] = 1.0f;
+            }
+            out[output] = 1.0f;
+        }
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() >= 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto inOrder = input->desc().dimsOrder();
+
+        DataMap<DimsOrder> out;
+
+        // HWC only
+        out[input] = inOrder.createMovedDim(Dim::C, 0);
+        out[output] = inOrder.createMovedDim(Dim::C, 0);
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() >= 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<StridesRequirement> out;
+
+        out[input] = StridesRequirement::compact();
+        out[output] = StridesRequirement::compact();
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() >= 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        for (const auto& inEdge : _inputEdges) {
+            out[inEdge->input()] = BatchSupport::Split;
+        }
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override {
+        return StageSHAVEsRequirements::NotNeeded;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        const auto& offset = attrs().get<DimValues>("offset");
+
+        serializer.append(static_cast<int32_t>(offset.get(Dim::W, 0)));
+        serializer.append(static_cast<int32_t>(offset.get(Dim::H, 0)));
+        serializer.append(static_cast<int32_t>(offset.get(Dim::C, 0)));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() >= 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeOldBuffer(handle_from_this(), serializer);
+        output->serializeOldBuffer(handle_from_this(), serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseCrop(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    // TODO: the Crop layer in IR may have 1 or 2 inputs
+    IE_ASSERT(inputs.size() >= 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto layer = std::dynamic_pointer_cast<ie::CropLayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+    IE_ASSERT(layer->axis.size() == layer->offset.size());
+
+    auto cropAxis = layer->axis[0];
+    if (cropAxis < 0) {
+        cropAxis += 4;
+    }
+
+    if (cropAxis < 0 || cropAxis > 3) {
+        VPU_THROW_EXCEPTION
+            << "Layer " << layer->name << " [" << layer->type
+            << "] has invalid axis value. Expected: 0 <= axis < 4, Actual: " << cropAxis;
+    }
+
+    if (cropAxis == 0) {
+        VPU_THROW_EXCEPTION
+            << "Layer " << layer->name << " [" << layer->type
+            << "] Can't crop batch channel";
+    }
+
+    auto stage = model->addNewStage<CropStage>(
+        layer->name,
+        StageType::Crop,
+        layer,
+        inputs,
+        outputs);
+
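+    // Map each IE axis (major-first: 0:N, 1:C, 2:H, 3:W) onto the vpu Dim
+    // enumeration (assuming W=0, H=1, C=2, N=3): e.g. cropAxis == 1 and
+    // i == 0 give static_cast<Dim>(3 - 1 - 0) == Dim::C.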
+    DimValues offset;
+    for (int i = 0; i < layer->offset.size(); i++) {
+        offset.set(static_cast<Dim>(3 - cropAxis - i), layer->offset[i]);
+    }
+
+    stage->attrs().set("offset", offset);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/ctc_decoder.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/ctc_decoder.cpp
new file mode 100644 (file)
index 0000000..715acfb
--- /dev/null
@@ -0,0 +1,139 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <unordered_set>
+#include <memory>
+
+namespace vpu {
+
+namespace {
+
+class CTCDecoderStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<CTCDecoderStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        out[input0] = 1.0f;
+        out[input1] = 1.0f;
+        out[output] = 1.0f;
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        auto cInd = input->desc().dimsOrder().dimInd(Dim::C);
+        out[output] = output->desc().dimsOrder().createMovedDim(Dim::C, cInd);
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<StridesRequirement> out;
+
+        out[input] = StridesRequirement::compact();
+        out[output] = StridesRequirement::compact();
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override {
+        return StageSHAVEsRequirements::OnlyOne;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer&) const override {
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        input0->serializeOldBuffer(handle_from_this(), serializer);
+        input1->serializeOldBuffer(handle_from_this(), serializer);
+        output->serializeOldBuffer(handle_from_this(), serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseCTCDecoder(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 2);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto ctc_merge_repeated = layer->GetParamAsInt("ctc_merge_repeated", 1);
+    if (ctc_merge_repeated != 1) {
+        VPU_THROW_EXCEPTION
+            << layer->name << " [" << layer->type
+            << "] has incorrect ctc_merge_repeated param value."
+            << " The kernel supports only the case ctc_merge_repeated == 1";
+    }
+
+    model->addNewStage<CTCDecoderStage>(
+        layer->name,
+        StageType::CTCDecoder,
+        layer,
+        inputs,
+        outputs);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/custom.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/custom.cpp
new file mode 100644 (file)
index 0000000..70a6f74
--- /dev/null
@@ -0,0 +1,417 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+#include <string>
+#include <map>
+#include <unordered_set>
+#include <utility>
+
+#include <vpu/custom_layer.hpp>
+#include <vpu/utils/simple_math.hpp>
+
+namespace vpu {
+
+namespace {
+
+class KernelBinaryContent final : public DataContent {
+public:
+    explicit KernelBinaryContent(const std::string& blob) : _blob(blob) {
+        IE_ASSERT(!_blob.empty());
+    }
+
+    const void* getRaw() const override {
+        IE_ASSERT(_desc.totalDimSize() * _desc.elemSize() == _blob.length());
+        return _blob.data();
+    }
+
+private:
+    std::string _blob;
+};
+
+void printTo(std::ostream& os, const CustomLayer::Ptr& obj) {
+    os << obj->kernelAddress();
+}
+
+void printTo(DotLabel& lbl, const CustomLayer::Ptr& obj) {
+    DotLabel subLbl(lbl);
+    subLbl.appendPair("kernelAddress", obj->kernelAddress());
+}
+
+class CustomStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<CustomStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        DataMap<float> out;
+
+        for (const auto& inEdge : _inputEdges) {
+            out[inEdge->input()] = 1.0f;
+        }
+        for (const auto& outEdge : _outputEdges) {
+            out[outEdge->output()] = 1.0f;
+        }
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        const auto& inputOrders = attrs().get<std::map<int, DimsOrder>>("inputOrders");
+        const auto& outputOrders = attrs().get<std::map<int, DimsOrder>>("outputOrders");
+
+        DataMap<DimsOrder> out;
+
+        // last input is always OpenCL binary, so use it as is.
+        for (int i = 0; i < _inputEdges.size() - 1; i++) {
+            auto input = _inputEdges[i]->input();
+            IE_ASSERT(input != nullptr);
+
+            auto it = inputOrders.find(i);
+            if (it != inputOrders.end()) {
+                auto requiredOrder = it->second;
+                out[input] = requiredOrder;
+            }
+        }
+
+        for (const auto& outEdge : _outputEdges) {
+            auto it = outputOrders.find(outEdge->portInd());
+            if (it != outputOrders.end()) {
+                auto requiredOrder = it->second;
+                out[outEdge->output()] = requiredOrder;
+            }
+        }
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        DataMap<StridesRequirement> out;
+
+        // last input is always OpenCL binary, so use it as is.
+        for (int i = 0; i < _inputEdges.size() - 1; i++) {
+            auto input = _inputEdges[i]->input();
+            IE_ASSERT(input != nullptr);
+
+            out[input] = StridesRequirement::compact();
+        }
+
+        for (const auto& outEdge : _outputEdges) {
+            out[outEdge->output()] = StridesRequirement::compact();
+        }
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        DataMap<BatchSupport> out;
+
+        // Last input is always OpenCL binary, so use it as is.
+        for (int i = 0; i < _inputEdges.size() - 1; i++) {
+            auto input = _inputEdges[i]->input();
+            IE_ASSERT(input != nullptr);
+
+            out[input] = BatchSupport::Split;
+        }
+        for (const auto& outEdge : _outputEdges) {
+            out[outEdge->output()] = BatchSupport::Split;
+        }
+
+        return out;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        const auto& customLayer = attrs().get<CustomLayer::Ptr>("customLayer");
+        const auto& gws = attrs().get<std::vector<int>>("gws");
+        const auto& lws = attrs().get<std::vector<int>>("lws");
+
+        //
+        // GWG, LWG, Offs
+        //
+
+        for (auto x : gws) {
+            serializer.append(static_cast<uint32_t>(x));
+        }
+
+        for (auto x : lws) {
+            serializer.append(static_cast<uint32_t>(x));
+        }
+
+        for (int i = 0; i < lws.size(); ++i) {
+            serializer.append(static_cast<uint32_t>(0));
+        }
+
+        //
+        // Entry point
+        //
+
+        serializer.append(static_cast<uint32_t>(customLayer->kernelAddress(lws[0])));
+
+        //
+        // Total number of blobs
+        //
+
+        serializer.append(static_cast<int32_t>(_inputEdges.size() + _outputEdges.size()));
+
+        //
+        // Number of kernel parameters
+        //
+
+        serializer.append(static_cast<uint32_t>(customLayer->parameters().size()));
+
+        //
+        // Parameters & relocation info
+        //
+
+        std::map<std::string, CustomLayer::KernelParam> b2b;
+        for (const auto& kp : customLayer->bindings()) {
+            b2b[kp.argName] = kp;
+        }
+
+        IE_ASSERT(_origLayer != nullptr);
+
+        for (const auto& kp : customLayer->parameters()) {
+            const auto& parameter = b2b[kp];
+
+            switch (parameter.type) {
+                case CustomParamType::Input:
+                {
+                    serializer.append(static_cast<uint32_t>(0));
+                    serializer.append(static_cast<uint32_t>(parameter.portIndex));
+                    break;
+                }
+                case CustomParamType::Output:
+                {
+                    serializer.append(static_cast<uint32_t>(0));
+                    serializer.append(static_cast<uint32_t>(_inputEdges.size() + parameter.portIndex));
+                    break;
+                }
+                case CustomParamType::Data:
+                {
+                    // TODO: handle data
+                    break;
+                }
+                case CustomParamType::Int:
+                case CustomParamType::Float:
+                {
+                    if (_origLayer->params.find(parameter.irSource) != _origLayer->params.end()) {
+                        if (parameter.type == CustomParamType::Int) {
+                            serializer.append(static_cast<int32_t>(std::stoi(_origLayer->params[parameter.irSource])));
+                            serializer.append(static_cast<int32_t>(-1));
+                        } else {
+                            serializer.append(static_cast<float>(std::stof(_origLayer->params[parameter.irSource])));
+                            serializer.append(static_cast<int32_t>(-2));
+                        }
+                        break;
+                    } else {
+                        auto pos = parameter.irSource.find_first_of('.');
+                        if (pos != std::string::npos) {
+                            auto blob = parameter.irSource.substr(0, pos);
+                            auto dim = parameter.irSource.substr(pos + 1, std::string::npos);
+
+                            ie::DataPtr origData;
+                            if (blob == "I") {
+                                origData = _origLayer->insData[0].lock();
+                            } else {
+                                origData = _origLayer->outData[0];
+                            }
+                            IE_ASSERT(origData != nullptr);
+
+                            auto dims = origData->dims;
+
+                            const std::map<char, int> vars = {
+                                { 'b', 3 }, { 'B', 3 },
+                                { 'f', 2 }, { 'F', 2 },
+                                { 'y', 1 }, { 'Y', 1 },
+                                { 'x', 0 }, { 'X', 0 },
+                            };
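+                            // Hypothetical example: irSource "I.Y" selects
+                            // dims[1] (height, minor-first order) of the first
+                            // input; any blob prefix other than "I" reads from
+                            // the first output.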
+
+                            if (vars.find(dim[0]) != vars.end()) {
+                                auto res = dims[vars.at(dim[0])];
+
+                                serializer.append(static_cast<uint32_t>(res));
+                                serializer.append(static_cast<int32_t>(-1));
+                            } else {
+                                VPU_THROW_EXCEPTION
+                                    << "Unable to deduce parameter " << parameter.argName << " for "
+                                    << _origLayer->type <<" layer. Name is: " << _origLayer->name;
+                            }
+
+                            break;
+                        }
+
+                        VPU_THROW_EXCEPTION
+                            << "Unable to deduce parameter " << parameter.argName << " for "
+                            << _origLayer->type << " layer. Name is: " << _origLayer->name;
+                    }
+                }
+                default:
+                    VPU_THROW_EXCEPTION
+                        << "Unsupported parameter type for " << parameter.argName << " in "
+                        << _origLayer->type << " layer. Name is: " << _origLayer->name;
+            }
+        }
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        for (const auto& inEdge : _inputEdges) {
+            inEdge->input()->serializeOldBuffer(handle_from_this(), serializer);
+        }
+
+        for (const auto& outEdge : _outputEdges) {
+            outEdge->output()->serializeOldBuffer(handle_from_this(), serializer);
+        }
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseCustom(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(layer != nullptr);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto customLayerIt = _customLayers.find(layer->type);
+    IE_ASSERT(customLayerIt != _customLayers.end());
+
+    auto customLayer = customLayerIt->second;
+
+    auto kernelBinaryDesc = DataDesc({customLayer->kernelBinary().length()});
+    kernelBinaryDesc.setType(DataType::U8);
+
+    auto kernelBinary = model->addConstData(
+        layer->name + "@kernelBinary",
+        kernelBinaryDesc,
+        std::make_shared<KernelBinaryContent>(customLayer->kernelBinary()));
+
+    auto allInputs = inputs;
+    allInputs.emplace_back(std::move(kernelBinary));
+
+    auto stage = model->addNewStage<CustomStage>(
+        layer->name,
+        StageType::Custom,
+        layer,
+        allInputs,
+        outputs);
+
+    stage->attrs().set("customLayer", customLayer);
+
+    auto dims = layer->outData[0]->getTensorDesc().getDims();
+    std::reverse(dims.begin(), dims.end());
+
+    // Assume the output tensor is the dimension source by default.
+    // dims was reversed above, so index 0 is X — matching the
+    // {x:0, y:1, f:2, b:3} convention used in serializeParamsImpl.
+    auto xDim = (dims.size() > 0) ? dims[0] : 1;
+    auto yDim = (dims.size() > 1) ? dims[1] : 1;
+    auto featureDim = (dims.size() > 2) ? dims[2] : 1;
+    auto batchDim = (dims.size() > 3) ? dims[3] : 1;
+
+    int iidx = customLayer->inputDimSourceIndex();
+    if (iidx >= 0) {
+        IE_ASSERT(iidx < layer->insData.size());
+
+        auto origData = layer->insData[iidx].lock();
+        IE_ASSERT(origData != nullptr);
+
+        auto inputDims = origData->dims;
+
+        batchDim = featureDim = yDim = 0;
+        xDim = inputDims[0];
+
+        if (inputDims.size() > 1)
+            yDim = inputDims[1];
+        if (inputDims.size() > 2)
+            featureDim = inputDims[2];
+        if (inputDims.size() > 3)
+            batchDim = inputDims[3];
+    }
+
+    // evaluate work sizes rules
+    std::vector<int> gws;
+    std::vector<int> lws;
+
+    const std::map<char, int> vars = {
+        { 'b', batchDim },   { 'B', batchDim },
+        { 'f', featureDim }, { 'F', featureDim },
+        { 'y', yDim },       { 'Y', yDim },
+        { 'x', xDim },       { 'X', xDim },
+    };
+
+    for (const auto& rule : customLayer->globalSizeRules()) {
+        SimpleMathExpression expr;
+        expr.setVariables(vars);
+        expr.parse(rule);
+        gws.emplace_back(expr.evaluate());
+    }
+    while (gws.size() < 3) {
+        gws.emplace_back(1);
+    }
+
+    for (const auto& rule : customLayer->localSizeRules()) {
+        SimpleMathExpression expr;
+        expr.setVariables(vars);
+        expr.parse(rule);
+        lws.emplace_back(expr.evaluate());
+    }
+    while (lws.size() < 3) {
+        lws.emplace_back(1);
+    }
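+    // Evaluation sketch (rule strings are hypothetical; real ones come from
+    // the custom-layer XML): with xDim == 224, global-size rules {"X", "Y"}
+    // give gws == {224, yDim, 1} after the padding loops above.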
+
+    stage->attrs().set("gws", gws);
+    stage->attrs().set("lws", lws);
+
+    std::map<int, DimsOrder> inputOrders;
+    std::map<int, DimsOrder> outputOrders;
+
+    std::map<std::string, CustomLayer::KernelParam> b2b;
+    for (const auto& kp : customLayer->bindings()) {
+        b2b[kp.argName] = kp;
+    }
+
+    const std::map<CustomDataFormat, DimsOrder> formats = {
+        { CustomDataFormat::BYXF, DimsOrder::NHWC },
+        { CustomDataFormat::BFYX, DimsOrder::NCHW }
+    };
+
+    for (const auto& kp : customLayer->parameters()) {
+        const auto& parameter = b2b[kp];
+
+        if (parameter.type == CustomParamType::Input) {
+            auto it = formats.find(parameter.format);
+            if (it != formats.end()) {
+                auto requiredOrder = it->second;
+                inputOrders[parameter.portIndex] = requiredOrder;
+            }
+        }
+
+        if (parameter.type == CustomParamType::Output) {
+            auto it = formats.find(parameter.format);
+            if (it != formats.end()) {
+                auto requiredOrder = it->second;
+                outputOrders[parameter.portIndex] = requiredOrder;
+            }
+        }
+    }
+
+    stage->attrs().set("inputOrders", inputOrders);
+    stage->attrs().set("outputOrders", outputOrders);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/deconvolution.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/deconvolution.cpp
new file mode 100644 (file)
index 0000000..3915dcd
--- /dev/null
@@ -0,0 +1,116 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+#include <tuple>
+#include <set>
+
+#include <ie_layers_internal.hpp>
+
+#include <vpu/stub_stage.hpp>
+
+namespace vpu {
+
+void FrontEnd::parseDeconvolution(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    //
+    // Extract parameters
+    //
+
+    auto deconvLayer = std::dynamic_pointer_cast<ie::DeconvolutionLayer>(layer);
+    IE_ASSERT(deconvLayer != nullptr);
+
+    int kernelSizeX = deconvLayer->_kernel_x;
+    int kernelSizeY = deconvLayer->_kernel_y;
+
+    int kernelStrideX = deconvLayer->_stride_x;
+    int kernelStrideY = deconvLayer->_stride_y;
+
+    auto paddings = getPaddings(*deconvLayer);
+    int padLeft = paddings.begin.exist(ie::X_AXIS) ? paddings.begin[ie::X_AXIS] : 0;
+    int padRight = paddings.end.exist(ie::X_AXIS) ? paddings.end[ie::X_AXIS] : padLeft;
+    int padTop = paddings.begin.exist(ie::Y_AXIS) ? paddings.begin[ie::Y_AXIS] : 0;
+    int padBottom = paddings.end.exist(ie::Y_AXIS) ? paddings.end[ie::Y_AXIS] : padTop;
+
+    int dilationX = deconvLayer->_dilation_x;
+    int dilationY = deconvLayer->_dilation_y;
+
+    int groupSize = deconvLayer->_group;
+
+    //
+    // Create const data
+    //
+
+    auto input = inputs[0];
+    auto output = outputs[0];
+
+    if ((groupSize == 0) ||
+        (groupSize > input->desc().dim(Dim::C)) ||
+        (input->desc().dim(Dim::C) % groupSize != 0) ||
+        (groupSize > output->desc().dim(Dim::C)) ||
+        (output->desc().dim(Dim::C) % groupSize != 0)) {
+        VPU_THROW_EXCEPTION << "DeconvolutionLayer has invalid group value";
+    }
+
+    Data weights, biases;
+    std::tie(weights, biases) = getWeightsAndBiases(model, layer);
+
+    IE_ASSERT(weights->desc().totalDimSize() >=
+              kernelSizeX * kernelSizeY * (input->desc().dim(Dim::C) / groupSize) * output->desc().dim(Dim::C));
+    weights = model->duplicateData(
+        weights,
+        "@deconv",
+        DataDesc({
+            kernelSizeX,
+            kernelSizeY,
+            input->desc().dim(Dim::C) / groupSize,
+            output->desc().dim(Dim::C)}));
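+    // The flat IR weights are re-described as
+    // [kernelX, kernelY, inputC / groups, outputC] — presumably the layout
+    // the deconvolution kernels expect (assumption).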
+
+    if (biases->usage() != DataUsage::Fake) {
+        IE_ASSERT(biases->desc().totalDimSize() >= output->desc().dim(Dim::C));
+        biases = model->duplicateData(
+            biases,
+            "@deconv",
+            DataDesc({output->desc().dim(Dim::C)}));
+    }
+
+    //
+    // Create stub stage
+    //
+
+    auto stage = model->addNewStage<StubStage>(
+        layer->name,
+        StageType::StubDeconv,
+        layer,
+        {input, weights, biases},
+        {output});
+
+    stage->attrs().set<int>("kernelSizeX", kernelSizeX);
+    stage->attrs().set<int>("kernelSizeY", kernelSizeY);
+
+    stage->attrs().set<int>("kernelStrideX", kernelStrideX);
+    stage->attrs().set<int>("kernelStrideY", kernelStrideY);
+
+    stage->attrs().set<int>("padLeft", padLeft);
+    stage->attrs().set<int>("padRight", padRight);
+    stage->attrs().set<int>("padTop", padTop);
+    stage->attrs().set<int>("padBottom", padBottom);
+
+    stage->attrs().set<int>("dilationX", dilationX);
+    stage->attrs().set<int>("dilationY", dilationY);
+
+    stage->attrs().set<int>("groupSize", groupSize);
+    stage->attrs().set<bool>("tryHW", true);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/detection_output.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/detection_output.cpp
new file mode 100644 (file)
index 0000000..ac454c1
--- /dev/null
@@ -0,0 +1,298 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <string>
+#include <memory>
+
+#include <vpu/compile_env.hpp>
+
+namespace vpu {
+
+namespace {
+
+enum PriorBox_CodeType {
+    CORNER = 1,
+    CENTER_SIZE,
+    CORNER_SIZE
+};
+
+VPU_PACKED(DetectionOutputParams {
+    int32_t num_classes;
+    int32_t share_location;
+    int32_t background_label_id;
+    float nms_threshold;
+    int32_t top_k;
+    int32_t code_type;
+    int32_t keep_top_k;
+    float confidence_threshold;
+    int32_t variance_encoded_in_target;
+    int32_t num_priors;
+    int32_t clip_before_nms;
+    int32_t clip_after_nms;
+    int32_t decrease_label_id;
+    int32_t image_width;
+    int32_t image_height;
+    int32_t normalized;
+    int32_t num;
+    float objectness_score;
+    int32_t has_arm_inputs;
+};)
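+// This packed struct is serialized byte-for-byte into the blob (see
+// serializeParamsImpl below), so field order and widths presumably have to
+// stay in sync with the MvTensor firmware definition.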
+
+void printTo(std::ostream& os, const DetectionOutputParams& params) {
+    os << "[" << std::endl;
+    os << "num_classes=" << params.num_classes << std::endl;
+    os << "share_location=" << params.share_location << std::endl;
+    os << "background_label_id=" << params.background_label_id << std::endl;
+    os << "nms_threshold=" << params.nms_threshold << std::endl;
+    os << "top_k=" << params.top_k << std::endl;
+    os << "code_type=" << params.code_type << std::endl;
+    os << "keep_top_k=" << params.keep_top_k << std::endl;
+    os << "confidence_threshold=" << params.confidence_threshold << std::endl;
+    os << "variance_encoded_in_target=" << params.variance_encoded_in_target << std::endl;
+    os << "num_priors=" << params.num_priors << std::endl;
+    os << "clip_before_nms=" << params.clip_before_nms << std::endl;
+    os << "clip_after_nms=" << params.clip_after_nms << std::endl;
+    os << "decrease_label_id=" << params.decrease_label_id << std::endl;
+    os << "image_width=" << params.image_width << std::endl;
+    os << "image_height=" << params.image_height << std::endl;
+    os << "normalized=" << params.normalized << std::endl;
+    os << "num=" << params.num << std::endl;
+    os << "objectness_score=" << params.objectness_score << std::endl;
+    os << "has_arm_inputs=" << params.has_arm_inputs << std::endl;
+    os << "]";
+}
+
+void printTo(DotLabel& lbl, const DetectionOutputParams& params) {
+    DotLabel subLbl(lbl);
+    subLbl.appendPair("num_classes", params.num_classes);
+    subLbl.appendPair("share_location", params.share_location);
+    subLbl.appendPair("background_label_id", params.background_label_id);
+    subLbl.appendPair("nms_threshold", params.nms_threshold);
+    subLbl.appendPair("top_k", params.top_k);
+    subLbl.appendPair("code_type", params.code_type);
+    subLbl.appendPair("keep_top_k", params.keep_top_k);
+    subLbl.appendPair("confidence_threshold", params.confidence_threshold);
+    subLbl.appendPair("variance_encoded_in_target", params.variance_encoded_in_target);
+    subLbl.appendPair("num_priors", params.num_priors);
+    subLbl.appendPair("clip_before_nms", params.clip_before_nms);
+    subLbl.appendPair("clip_after_nms", params.clip_after_nms);
+    subLbl.appendPair("decrease_label_id", params.decrease_label_id);
+    subLbl.appendPair("image_width", params.image_width);
+    subLbl.appendPair("image_height", params.image_height);
+    subLbl.appendPair("normalized", params.normalized);
+    subLbl.appendPair("num", params.num);
+    subLbl.appendPair("objectness_score", params.objectness_score);
+    subLbl.appendPair("has_arm_inputs", params.has_arm_inputs);
+}
+
+class DetectionOutputStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<DetectionOutputStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        IE_ASSERT(_inputEdges.size() == 3 || _inputEdges.size() == 5);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto loc = _inputEdges[0]->input();
+        auto conf = _inputEdges[1]->input();
+        auto priors = _inputEdges[2]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        out[loc] = 1.0f;
+        out[conf] = 1.0f;
+        out[priors] = 1.0f;
+        out[output] = 1.0f;
+
+        if (_inputEdges.size() == 5) {
+            out[_inputEdges[3]->input()] = 1.0f;
+            out[_inputEdges[4]->input()] = 1.0f;
+        }
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        return DataMap<DimsOrder>();
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 3 || _inputEdges.size() == 5);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto loc = _inputEdges[0]->input();
+        auto conf = _inputEdges[1]->input();
+        auto priors = _inputEdges[2]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<StridesRequirement> out;
+
+        out[loc] = StridesRequirement::compact();
+        out[conf] = StridesRequirement::compact();
+        out[priors] = StridesRequirement::compact();
+        out[output] = StridesRequirement::compact();
+
+        if (_inputEdges.size() == 5) {
+            out[_inputEdges[3]->input()] = StridesRequirement::compact();
+            out[_inputEdges[4]->input()] = StridesRequirement::compact();
+        }
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        const auto& params = attrs().get<DetectionOutputParams>("params");
+
+        serializer.append(params);
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 3 || _inputEdges.size() == 5);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.size() == 1);
+
+        auto loc = _inputEdges[0]->input();
+        auto conf = _inputEdges[1]->input();
+        auto priors = _inputEdges[2]->input();
+        auto output = _outputEdges[0]->output();
+
+        loc->serializeNewBuffer(serializer);
+        conf->serializeNewBuffer(serializer);
+        priors->serializeNewBuffer(serializer);
+        if (_inputEdges.size() == 5) {
+            _inputEdges[3]->input()->serializeNewBuffer(serializer);
+            _inputEdges[4]->input()->serializeNewBuffer(serializer);
+        }
+        output->serializeNewBuffer(serializer);
+
+        _tempBufferEdges[0]->tempBuffer()->serializeNewBuffer(serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseDetectionOutput(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    const auto& env = CompileEnv::get();
+
+    IE_ASSERT(inputs.size() == 3 || inputs.size() == 5);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto loc = inputs[0];
+    auto conf = inputs[1];
+    auto priors = inputs[2];
+
+    DetectionOutputParams detParams;
+    detParams.num_classes = layer->GetParamAsInt("num_classes", 0);
+    detParams.background_label_id = layer->GetParamAsInt("background_label_id", 0);
+    detParams.top_k = layer->GetParamAsInt("top_k", -1);
+    detParams.variance_encoded_in_target = layer->GetParamAsInt("variance_encoded_in_target", 0);
+    detParams.keep_top_k = layer->GetParamAsInt("keep_top_k", -1);
+    detParams.nms_threshold = layer->GetParamAsFloat("nms_threshold", 0);
+    detParams.confidence_threshold = layer->GetParamAsFloat("confidence_threshold", -1.0f);
+    detParams.share_location = layer->GetParamAsInt("share_location", 1);
+    detParams.clip_before_nms = layer->GetParamAsInt("clip_before_nms", 0) || layer->GetParamAsInt("clip", 0);
+    detParams.clip_after_nms = layer->GetParamAsInt("clip_after_nms", 0);
+    detParams.decrease_label_id = layer->GetParamAsInt("decrease_label_id", 0);
+    detParams.normalized = layer->GetParamAsInt("normalized", 1);
+    detParams.image_height = layer->GetParamAsInt("input_height", 1);
+    detParams.image_width = layer->GetParamAsInt("input_width", 1);
+    detParams.objectness_score = layer->GetParamAsFloat("objectness_score", -1.0f);
+    detParams.has_arm_inputs = inputs.size() == 5 ? 1 : 0;
+
+    int prior_size = detParams.normalized ? 4 : 5;
+    int num_loc_classes = detParams.share_location ? 1 : detParams.num_classes;
+
+    detParams.num_priors = static_cast<int>(priors->desc().dim(Dim::W) / prior_size);
+    detParams.num = static_cast<int>(conf->desc().dim(Dim::N));
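+    // Example with hypothetical shapes: with normalized priors
+    // (prior_size == 4) and priors W == 34928, num_priors == 8732 — the usual
+    // SSD300 prior count.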
+
+    auto code_type_str = layer->GetParamAsString("code_type", "caffe.PriorBoxParameter.CENTER_SIZE");
+    if (code_type_str.find("CORNER_SIZE") != std::string::npos) {
+        detParams.code_type = CORNER_SIZE;
+    } else if (code_type_str.find("CENTER_SIZE") != std::string::npos) {
+        detParams.code_type = CENTER_SIZE;
+    } else if (code_type_str.find("CORNER") != std::string::npos) {
+        detParams.code_type = CORNER;
+    } else {
+        VPU_THROW_EXCEPTION << "Unknown code_type " << code_type_str << " for DetectionOutput layer " << layer->name;
+    }
+
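+    // keep_top_k < 0 conventionally means "no limit", so cap it at the number
+    // of detection rows the output tensor can hold.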
+    if (detParams.keep_top_k < 0)
+        detParams.keep_top_k = outputs[0]->desc().dim(Dim::H);
+
+    if (detParams.num_priors * num_loc_classes * 4 != loc->desc().dim(Dim::C))
+        VPU_THROW_EXCEPTION << "Detection Output: Number of priors must match number of location predictions.";
+
+    if (detParams.num_priors * detParams.num_classes != conf->desc().dim(Dim::C))
+        VPU_THROW_EXCEPTION << "Detection Output: Number of priors must match number of confidence predictions.";
+
+    if (detParams.decrease_label_id && detParams.background_label_id != 0)
+        VPU_THROW_EXCEPTION << "Detection Output: Cannot use decrease_label_id and background_label_id parameter simultaneously.";
+
+    if (outputs[0]->desc().dim(Dim::H) < detParams.keep_top_k)
+        VPU_THROW_EXCEPTION << "Detection Output: Output size more than output tensor.";
+
+    if (outputs[0]->desc().dim(Dim::W) != 7)
+        VPU_THROW_EXCEPTION << "Detection Output: Support only 7 vals per detection.";
+
+    auto stage = model->addNewStage<DetectionOutputStage>(
+        layer->name,
+        StageType::DetectionOutput,
+        layer,
+        inputs,
+        outputs);
+
+    stage->attrs().set("params", detParams);
+
+    int _num = detParams.num;
+    int _num_classes = detParams.num_classes;
+    int _num_priors = detParams.num_priors;
+    int ALIGN_VALUE = 64;
+
+    int size_decoded_bboxes_buf    = sizeof(int16_t)*_num*_num_classes*_num_priors*4 + ALIGN_VALUE;
+    int size_buffer_buf            = sizeof(int32_t)*_num*_num_classes*_num_priors + ALIGN_VALUE;
+    int size_indices_buf           = sizeof(int32_t)*_num*_num_classes*_num_priors + ALIGN_VALUE;
+    int size_detections_count_buf  = sizeof(int32_t)*_num*_num_classes + ALIGN_VALUE;
+    int size_reordered_conf_buf    = sizeof(int16_t)     *_num_classes*_num_priors + ALIGN_VALUE;
+    int size_bbox_sizes_buf        = sizeof(int16_t)*_num*_num_classes*_num_priors + ALIGN_VALUE;
+    int size_num_priors_actual_buf = sizeof(int32_t)*_num + ALIGN_VALUE;
+    int size_temp_data_buf         = sizeof(int16_t)*env.resources.numSHAVEs*(_num_priors+8)*5 + ALIGN_VALUE;
+
+    int buffer_size =
+        size_decoded_bboxes_buf +
+        size_buffer_buf +
+        size_indices_buf +
+        size_detections_count_buf +
+        size_reordered_conf_buf +
+        size_bbox_sizes_buf +
+        size_num_priors_actual_buf +
+        size_temp_data_buf;
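+    // Each scratch area above is padded by ALIGN_VALUE, presumably so the
+    // runtime can align each section's start address inside the single temp
+    // buffer (assumption based on the + ALIGN_VALUE terms).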
+
+    model->addTempBuffer(
+        stage,
+        DataDesc({buffer_size}));
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/eltwise.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/eltwise.cpp
new file mode 100644 (file)
index 0000000..3d8221a
--- /dev/null
@@ -0,0 +1,249 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <string>
+#include <unordered_set>
+#include <memory>
+#include <set>
+#include <limits>
+#include <algorithm>
+
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+namespace {
+
+class EltwiseStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<EltwiseStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        if (_type != StageType::Prod &&
+            step == ScalePropagationStep::Propagate) {
+            // Keep the largest input scale factor.
+            auto maxScale = std::numeric_limits<float>::lowest();
+            for (const auto& inEdge : _inputEdges) {
+                maxScale = std::max(maxScale, inputScales.at(inEdge->input()));
+            }
+
+            for (const auto& inEdge : _inputEdges) {
+                auto curScale = inputScales.at(inEdge->input());
+
+                if (!isFloatEqual(curScale, maxScale)) {
+                    out[inEdge->input()] = maxScale / curScale;
+                }
+            }
+
+            out[output] = maxScale;
+        } else {
+            // Eltwise can only propagate scaling for Sum and Max cases.
+            for (const auto& inEdge : _inputEdges) {
+                out[inEdge->input()] = 1.0f;
+            }
+
+            out[output] = 1.0f;
+        }
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto in0Desc = input0->desc();
+        auto in1Desc = input1->desc();
+        auto outDesc = output->desc();
+
+        auto finalOrder  = in0Desc.numDims() >= in1Desc.numDims() ? in0Desc.dimsOrder() : in1Desc.dimsOrder();
+        auto secondOrder = in0Desc.numDims() >= in1Desc.numDims() ? in1Desc.dimsOrder() : in0Desc.dimsOrder();
+        if (secondOrder.numDims() >= 3) {
+            if (secondOrder.dimInd(Dim::C) == 1 /*HCW*/) {
+                finalOrder = secondOrder;
+            } else if (secondOrder.dimInd(Dim::C) == 2 /*CHW*/ && finalOrder.dimInd(Dim::C) != 1 /*HCW*/) {
+                finalOrder = secondOrder;
+            }
+        }
+        if (outDesc.numDims() > finalOrder.numDims())
+            finalOrder = outDesc.dimsOrder();
+
+        DataMap<DimsOrder> out;
+
+        out[input0] = finalOrder.numDims() == in0Desc.numDims() ? finalOrder : in0Desc.dimsOrder();
+        out[input1] = finalOrder.numDims() == in1Desc.numDims() ? finalOrder : in1Desc.dimsOrder();
+        out[output] = finalOrder;
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        return DataMap<StridesRequirement>();
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override {
+        return StageSHAVEsRequirements::CanBeLimited;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto coeff1 = attrs().getOrDefault<float>("coeff1", 1.0f);
+        auto coeff2 = attrs().getOrDefault<float>("coeff2", 1.0f);
+
+        serializer.append(static_cast<float>(coeff1));
+        serializer.append(static_cast<float>(coeff2));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        input0->serializeNewBuffer(serializer, output->desc().dimsOrder());
+        output->serializeNewBuffer(serializer);
+        input1->serializeNewBuffer(serializer, output->desc().dimsOrder());
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseEltwise(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() >= 2);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto layer = std::dynamic_pointer_cast<ie::EltwiseLayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+
+    auto stageType = StageType::None;
+    auto subCoefficient = 1.0f;
+    switch (layer->_operation) {
+    case ie::EltwiseLayer::eOperation::Sum:
+        stageType = StageType::Sum;
+        break;
+    case ie::EltwiseLayer::eOperation::Prod:
+        stageType = StageType::Prod;
+        break;
+    case ie::EltwiseLayer::eOperation::Max:
+        stageType = StageType::Max;
+        break;
+    case ie::EltwiseLayer::eOperation::Sub:
+        if (inputs.size() > 2) {
+            VPU_THROW_EXCEPTION << "Eltwise operation: " << layer->_operation << " with multiple inputs is not supported";
+        }
+        stageType = StageType::Sum;
+        subCoefficient = -1.f;
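+        // i.e. a - b is computed as a + (-1.0f) * b: coeff2 absorbs the sign.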
+        break;
+    default:
+        VPU_THROW_EXCEPTION << "Eltwise operation" << layer->_operation << " is not supported";
+    }
+
+    if (stageType != StageType::Sum && !layer->coeff.empty()) {
+        VPU_THROW_EXCEPTION << layer->name << ": coefficients are only supported for Sum/Sub operations.";
+    }
+
+    auto output = outputs[0];
+
+    auto tempOutput = output;
+    if (inputs.size() > 2) {
+        tempOutput = model->duplicateData(
+            output,
+            formatString("@temp@1/%d", inputs.size() - 2));
+    }
+
+    DataVector tempInputs(2);
+    tempInputs[0] = inputs[0];
+    tempInputs[1] = inputs[1];
+
+    auto stage = model->addNewStage<EltwiseStage>(
+        layer->name,
+        stageType,
+        layer,
+        tempInputs,
+        {tempOutput});
+
+    if (layer->coeff.size() > 0) {
+        stage->attrs().set<float>("coeff1", layer->coeff[0]);
+    }
+    if (layer->coeff.size() > 1 || subCoefficient != 1.0f) {
+        stage->attrs().set<float>("coeff2", subCoefficient * (layer->coeff.size() > 1 ? layer->coeff[1] : 1.0f));
+    }
+
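+    // Inputs beyond the first two are folded in by chaining binary stages:
+    // t1 = op(in0, in1); t2 = op(t1, in2); ...; the final stage writes the
+    // real output.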
+    tempInputs[0] = tempOutput;
+    for (int ind = 2; ind < inputs.size(); ++ind) {
+        tempInputs[1] = inputs[ind];
+
+        if (ind + 1 == inputs.size()) {
+            tempOutput = output;
+        } else {
+            tempOutput = model->duplicateData(
+                output,
+                formatString("@temp@%d/%d", ind, inputs.size() - 2));
+        }
+
+        stage = model->addNewStage<EltwiseStage>(
+            layer->name + "@" + std::to_string(ind - 1),
+            stageType,
+            layer,
+            tempInputs,
+            {tempOutput});
+
+        if (layer->coeff.size() > ind) {
+            stage->attrs().set<float>("coeff2", layer->coeff[ind]);
+        }
+
+        tempInputs[0] = tempOutput;
+    }
+}
+
+Stage StageBuilder::addSumStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        const Data& input0,
+        const Data& input1,
+        const Data& output) {
+    return model->addNewStage<EltwiseStage>(
+        name,
+        StageType::Sum,
+        layer,
+        {input0, input1},
+        {output});
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/elu.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/elu.cpp
new file mode 100644 (file)
index 0000000..cef1287
--- /dev/null
@@ -0,0 +1,52 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+#include <set>
+
+#include <vpu/sw/post_op_stage.hpp>
+
+namespace vpu {
+
+namespace {
+
+class EluStage final : public PostOpStage {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<EluStage>(*this);
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto alpha = attrs().get<float>("alpha");
+
+        serializer.append(static_cast<float>(alpha));
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseELU(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto alpha = layer->GetParamAsFloat("alpha", 1.0f);
+
+    auto stage = model->addNewStage<EluStage>(
+        layer->name,
+        StageType::Elu,
+        layer,
+        inputs,
+        outputs);
+
+    stage->attrs().set<float>("alpha", alpha);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/expand.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/expand.cpp
new file mode 100644 (file)
index 0000000..11f5b00
--- /dev/null
@@ -0,0 +1,154 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <memory>
+#include <string>
+#include <vector>
+#include <set>
+#include <unordered_set>
+#include <algorithm>
+
+#include <vpu/utils/extra.hpp>
+
+namespace vpu {
+
+namespace {
+
+class ExpandStage final : public StageNode {
+protected:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ExpandStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[output] = input->desc().dimsOrder();
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto dimsOrder = output->desc().dimsOrder();
+
+        //
+        // Get smallest Dim over which Expand is done.
+        //
+
+        auto minExpandDimInd = dimsOrder.numDims();
+
+        for (const auto& p : output->desc().dims()) {
+            if (input->desc().dim(p.first) != p.second) {
+                minExpandDimInd = std::min(minExpandDimInd, dimsOrder.dimInd(p.first));
+            }
+        }
+
+        IE_ASSERT(minExpandDimInd < dimsOrder.numDims());
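+        // Expand presumably places the input as a sub-view of the larger
+        // output buffer, so the input's stride requirements for dims outer
+        // than the innermost expanded dim are dropped below — those strides
+        // are dictated by the output allocation instead (assumption).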
+
+        //
+        // Initial StridesRequirement for input and output.
+        //
+
+        auto outputReqs = output->requiredStrides();
+
+        auto inputReqs = outputReqs;
+        for (int i = minExpandDimInd + 1; i < dimsOrder.numDims(); ++i) {
+            inputReqs.remove(i);
+        }
+
+        //
+        // Merge output consumers StridesRequirement.
+        //
+
+        for (const auto& consumer : output->consumers()) {
+            auto consumerInfo = consumer->getDataStridesRequirements();
+
+            auto consumerStrideIt = consumerInfo.find(output);
+            if (consumerStrideIt != consumerInfo.end()) {
+                auto consumerReqs = consumerStrideIt->second;
+
+                for (int i = 0; i < minExpandDimInd + 1; ++i) {
+                    if (outputReqs.get(i) == DimStride::Any) {
+                        if (consumerReqs.get(i) != DimStride::Any) {
+                            inputReqs.add(i, consumerReqs.get(i));
+                            outputReqs.add(i, consumerReqs.get(i));
+                        }
+                    }
+                }
+            }
+        }
+
+        //
+        // Return merged StridesRequirements.
+        //
+
+        DataMap<StridesRequirement> out;
+
+        out[input] = inputReqs;
+        out[output] = outputReqs;
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer&) const override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+
+    void serializeDataImpl(BlobSerializer&) const override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+};
+
+}  // namespace
+
+Stage StageBuilder::addExpandStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        const Data& input,
+        const Data& output,
+        const DimValues& offset) {
+    auto stage = model->addNewStage<ExpandStage>(
+        name,
+        StageType::Expand,
+        layer,
+        {input},
+        {output});
+
+    stage->attrs().set<DimValues>("offset", offset);
+
+    return stage;
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/fc.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/fc.cpp
new file mode 100644 (file)
index 0000000..a297450
--- /dev/null
@@ -0,0 +1,103 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+#include <set>
+
+#include <vpu/compile_env.hpp>
+#include <vpu/stub_stage.hpp>
+
+namespace vpu {
+
+void FrontEnd::parseFullyConnected(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    const auto& env = CompileEnv::get();
+
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto layer = std::dynamic_pointer_cast<ie::FullyConnectedLayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+
+    auto input = inputs[0];
+    auto output = outputs[0];
+
+    auto total_out_num = layer->_out_num * output->desc().dim(Dim::N);
+    if (total_out_num != output->desc().totalDimSize()) {
+        VPU_THROW_EXCEPTION
+                << "Layer Name: " << layer->name << " Layer type: " << layer->type
+                << " has incorrect _out_num param. Expected: " << output->desc().totalDimSize()
+                << " Actual: " << layer->_out_num;
+    }
+
+    //
+    // Check if HW is applicable
+    //
+
+    auto tryHW = env.config.hwOptimization;
+
+    if (output->desc().dim(Dim::W, 1) != 1 || output->desc().dim(Dim::H, 1) != 1) {
+        tryHW = false;
+    }
+
+    if (env.netConfig.hwDisabled(layer->name)) {
+        tryHW = false;
+    }
+
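+    // This exact shape (C=71, H=1, W=88) is presumably a known-bad case for
+    // the HW path (assumption — the source gives no rationale).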
+    if (input->desc().dim(Dim::C) == 71 &&
+        input->desc().dim(Dim::H, 1) == 1 &&
+        input->desc().dim(Dim::W, 1) == 88) {
+        tryHW = false;
+    }
+
+    if (output->desc().totalDimSize() == 1) {
+        tryHW = false;
+    }
+
+    //
+    // Create const data
+    //
+
+    Data weights, biases;
+    std::tie(weights, biases) = getWeightsAndBiases(model, layer);
+
+    IE_ASSERT(weights->desc().totalDimSize() >=
+              input->desc().totalDimSize() / input->desc().dim(Dim::N, 1) * layer->_out_num);
+    weights = model->duplicateData(
+        weights,
+        "@fc",
+        DataDesc({
+            input->desc().dim(Dim::W, 1) * input->desc().dim(Dim::H, 1),
+            input->desc().dim(Dim::C),
+            static_cast<int>(layer->_out_num)}));
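+    // The flat IR weights are re-described as [W*H, C, out_num] so they can
+    // be indexed per output neuron (assumption about the kernels' layout).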
+
+    if (biases->usage() != DataUsage::Fake) {
+        IE_ASSERT(biases->desc().totalDimSize() >= output->desc().dim(Dim::C));
+        biases = model->duplicateData(
+            biases,
+            "@fc",
+            DataDesc({output->desc().dim(Dim::C)}));
+    }
+
+    //
+    // Create stub stage
+    //
+
+    auto stage = model->addNewStage<StubStage>(
+        layer->name,
+        StageType::StubFullyConnected,
+        layer,
+        {input, weights, biases},
+        {output});
+
+    stage->attrs().set<bool>("tryHW", tryHW);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/grn.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/grn.cpp
new file mode 100644 (file)
index 0000000..370ae2c
--- /dev/null
@@ -0,0 +1,120 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <unordered_set>
+#include <memory>
+#include <set>
+
+namespace vpu {
+
+namespace {
+
+class GRNStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<GRNStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        out[input] = 1.0f;
+        out[output] = 1.0f;
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[output] = input->desc().dimsOrder();
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        return DataMap<StridesRequirement>();
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto bias = attrs().get<float>("bias");
+
+        serializer.append(static_cast<float>(bias));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeOldBuffer(handle_from_this(), serializer);
+        output->serializeOldBuffer(handle_from_this(), serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseGRN(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto layer = std::dynamic_pointer_cast<ie::GRNLayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+
+    auto stage = model->addNewStage<GRNStage>(
+        layer->name,
+        StageType::GRN,
+        layer,
+        inputs,
+        outputs);
+
+    stage->attrs().set<float>("bias", layer->bias);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/interp.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/interp.cpp
new file mode 100644 (file)
index 0000000..43adcab
--- /dev/null
@@ -0,0 +1,117 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <unordered_set>
+#include <memory>
+#include <set>
+
+namespace vpu {
+
+namespace {
+
+class InterpStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<InterpStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        out[input] = 1.0f;
+        out[output] = 1.0f;
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[output] = input->desc().dimsOrder();
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        return DataMap<StridesRequirement>();
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto align_corners = attrs().get<bool>("align_corners");
+
+        serializer.append(static_cast<int32_t>(align_corners));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeOldBuffer(handle_from_this(), serializer);
+        output->serializeOldBuffer(handle_from_this(), serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseInterp(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto stage = model->addNewStage<InterpStage>(
+        layer->name,
+        StageType::Interp,
+        layer,
+        inputs,
+        outputs);
+
+    stage->attrs().set<bool>("align_corners", layer->GetParamAsInt("align_corners", 0));
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/mtcnn.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/mtcnn.cpp
new file mode 100644 (file)
index 0000000..0b2693d
--- /dev/null
@@ -0,0 +1,335 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <fstream>
+#include <string>
+#include <utility>
+#include <memory>
+#include <set>
+
+#include <cpp/ie_cnn_net_reader.h>
+
+#include <vpu/graph_transformer.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/utils/file_system.hpp>
+
+namespace vpu {
+
+// Must be synchronized with MvTensor
+VPU_DECLARE_ENUM(MTCNN_Mode,
+    AVA_FaceDetector = 0,
+    Public = 1)
+
+namespace {
+
+class MTCNNStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<MTCNNStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto innerGraphs = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        out[input] = 1.0f;
+        out[innerGraphs] = 1.0f;
+        out[output] = 1.0f;
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[input] = input->desc().dimsOrder().createMovedDim(Dim::C, 2);
+        out[output] = output->desc().dimsOrder().createMovedDim(Dim::C, 0);
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<StridesRequirement> out;
+
+        out[input] = StridesRequirement::compact();
+        out[output] = StridesRequirement::compact();
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto debug_pnet_post_nms = attrs().get<int>("debug_pnet_post_nms");
+        auto debug_rnet_post_nms = attrs().get<int>("debug_rnet_post_nms");
+        auto mode = attrs().get<MTCNN_Mode>("mode");
+        const auto& pyramid = attrs().get<std::vector<std::pair<int, int>>>("pyramid");
+        auto stage2_zdir_batch_size = attrs().get<int>("stage2_zdir_batch_size");
+
+        serializer.append(static_cast<int32_t>(pyramid.size()));
+        for (const auto& elem : pyramid) {
+            serializer.append(static_cast<int32_t>(elem.first));
+            serializer.append(static_cast<int32_t>(elem.second));
+        }
+
+        serializer.append(static_cast<int32_t>(debug_pnet_post_nms));
+        serializer.append(static_cast<int32_t>(debug_rnet_post_nms));
+        serializer.append(static_cast<int32_t>(mode));
+        serializer.append(static_cast<int32_t>(stage2_zdir_batch_size));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        input0->serializeOldBuffer(handle_from_this(), serializer);
+        output->serializeOldBuffer(handle_from_this(), serializer);
+
+        input1->serializeOldBuffer(
+            handle_from_this(),
+            serializer,
+            DimsOrder::HWC,
+            {{Dim::W, {Dim::C}}});
+    }
+};
+
+class MTCNNBlobContent final : public DataContent {
+public:
+    explicit MTCNNBlobContent(std::vector<char>&& blob) : _blob(std::move(blob)) {
+        IE_ASSERT(!_blob.empty());
+    }
+
+    const void* getRaw() const override {
+        IE_ASSERT(_desc.totalDimSize() * _desc.elemSize() == _blob.size());
+        return _blob.data();
+    }
+
+private:
+    std::vector<char> _blob;
+};
+
+std::pair<int, int> getResolution(const std::string& str) {
+    std::istringstream stream(str);
+    std::string output;
+    std::getline(stream, output, 'x');
+    auto width = std::stoi(output);
+    std::getline(stream, output, 'x');
+    auto height = std::stoi(output);
+    return std::make_pair(width, height);
+}
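+// For example, getResolution("384x216") returns {384, 216} (width, height).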
+
+ie::CNNNetwork loadSubNetwork(
+        const std::string& fileName,
+        const std::pair<int, int>& imgSize, int* zdir_batchsize = nullptr) {
+    //
+    // Load network
+    //
+
+    auto binFileName = fileNameNoExt(fileName) + ".bin";
+
+    ie::CNNNetReader networkReader;
+    networkReader.ReadNetwork(fileName);
+    networkReader.ReadWeights(binFileName);
+
+    auto network = networkReader.getNetwork();
+
+    //
+    // Set precision of input/output
+    //
+
+    auto networkInputs = network.getInputsInfo();
+    IE_ASSERT(networkInputs.size() == 1);
+
+    auto networkOutputs = network.getOutputsInfo();
+    IE_ASSERT(networkOutputs.size() == 1);
+
+    networkInputs.begin()->second->setPrecision(ie::Precision::FP16);
+    networkInputs.begin()->second->setLayout(ie::Layout::NCHW);
+
+    networkOutputs.begin()->second->setPrecision(ie::Precision::FP16);
+    networkOutputs.begin()->second->setLayout(ie::Layout::NCHW);
+
+    //
+    // Change input shape
+    //
+
+    auto inputShapes = network.getInputShapes();
+    IE_ASSERT(inputShapes.size() == 1);
+
+    std::string inputName;
+    ie::SizeVector inputShape;
+    std::tie(inputName, inputShape) = *inputShapes.begin();
+    if (zdir_batchsize != nullptr)
+        *zdir_batchsize = inputShape[1] / 3;
+    inputShape[0] = 1;                // force batch size 1
+    inputShape[2] = imgSize.second;   // set input height to the image height
+    inputShape[3] = imgSize.first;    // set input width to the image width
+    inputShapes[inputName] = inputShape;
+
+    network.reshape(inputShapes);
+
+    return network;
+}
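+
+// Usage sketch (hypothetical file name): loadSubNetwork("pnet.xml", {384, 216})
+// loads pnet.xml/pnet.bin, forces FP16/NCHW on the single input and output,
+// and reshapes the network input to batch 1 at 384x216 (width x height).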
+
+}  // namespace
+
+void FrontEnd::parseMTCNN(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    const auto& env = CompileEnv::get();
+
+    ie::details::CaselessEq<std::string> cmp;
+
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    if (!env.config.hwOptimization) {
+        VPU_THROW_EXCEPTION << "MTCNN layer supports Myriad X with NCE only";
+    }
+
+    auto input = inputs[0];
+    auto output = outputs[0];
+
+    auto modeStr = layer->GetParamAsString("mode", "PUBLIC_MTCNN");
+
+    auto pnet_ir_name = layer->GetParamAsString("pnet_ir");
+    auto rnet_ir_name = layer->GetParamAsString("rnet_ir");
+    auto onet_ir_name = layer->GetParamAsString("onet_ir");
+    auto pnet_resolutions_str = layer->GetParamAsString("pnet_resolutions");
+
+    std::pair<int, int> r_net_input = {24, 24};
+    std::pair<int, int> o_net_input = {48, 48};
+
+    std::vector<std::pair<int, int>> pyramid;
+
+    std::istringstream stream(pnet_resolutions_str);
+    std::string str;
+    while (getline(stream, str, ',')) {
+        pyramid.emplace_back(getResolution(str));
+    }
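+    // E.g. a hypothetical pnet_resolutions value "384x216,192x108" yields
+    // pyramid == {{384, 216}, {192, 108}}.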
+
+    // Assert that the first stage in the pyramid is the largest one
+    for (const auto& p_net_input : pyramid) {
+        if (p_net_input.first > pyramid[0].first || p_net_input.second > pyramid[0].second) {
+            VPU_THROW_EXCEPTION << "MTCNN layer: first stage in pyramid should be the largest one";
+        }
+    }
+
+    std::vector<CompiledGraph::Ptr> compiledSubNetworks;
+    compiledSubNetworks.reserve(pyramid.size() + 2);
+
+    //
+    // Compile sub-networks with std::async to avoid modifying the current CompileEnv.
+    //
+    size_t mergedBlobSize = 0;
+
+    // Convert p-nets
+    for (const auto& p_net_input : pyramid) {
+        auto pNet = loadSubNetwork(pnet_ir_name, p_net_input);
+        auto res = compileSubNetwork(pNet, env.config);
+        mergedBlobSize += res->blob.size();
+        compiledSubNetworks.emplace_back(std::move(res));
+    }
+
+    int stage2_zdir_batchsize = 1;
+    // Convert r-net
+    {
+        auto rNet = loadSubNetwork(rnet_ir_name, r_net_input, &stage2_zdir_batchsize);
+        auto res = compileSubNetwork(rNet, env.config);
+        mergedBlobSize += res->blob.size();
+        compiledSubNetworks.emplace_back(std::move(res));
+    }
+
+    // Convert o-net
+    {
+        auto oNet = loadSubNetwork(onet_ir_name, o_net_input);
+        auto res = compileSubNetwork(oNet, env.config);
+        mergedBlobSize += res->blob.size();
+        compiledSubNetworks.emplace_back(std::move(res));
+    }
+
+    //
+    // Merge sub-network blobs
+    //
+
+    std::vector<char> mergedBlob(mergedBlobSize);
+
+    size_t curOffset = 0;
+    for (const auto& subRes : compiledSubNetworks) {
+        std::copy_n(subRes->blob.data(), subRes->blob.size(), mergedBlob.data() + curOffset);
+        curOffset += subRes->blob.size();
+    }
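+    // Blob order: one P-Net per pyramid level, then R-Net, then O-Net; the
+    // device side presumably unpacks the merged blob in this same order.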
+
+    auto innerGraphsDesc = DataDesc({mergedBlob.size()});
+    innerGraphsDesc.setType(DataType::U8);
+
+    auto innerGraphs = model->addConstData(
+        layer->name + "@innerGraphs",
+        innerGraphsDesc,
+        std::make_shared<MTCNNBlobContent>(std::move(mergedBlob)));
+
+    auto stage = model->addNewStage<MTCNNStage>(
+        layer->name,
+        StageType::MTCNN,
+        layer,
+        {input, innerGraphs},
+        {output});
+
+    stage->attrs().set("pyramid", pyramid);
+    stage->attrs().set<int>("debug_pnet_post_nms", layer->GetParamAsInt("debug_pnet_post_nms", 0));
+    stage->attrs().set<int>("debug_rnet_post_nms", layer->GetParamAsInt("debug_rnet_post_nms", 0));
+    stage->attrs().set<MTCNN_Mode>("mode", cmp(modeStr, "AVA_FaceDetector") ? MTCNN_Mode::AVA_FaceDetector : MTCNN_Mode::Public);
+    stage->attrs().set<int>("stage2_zdir_batch_size", stage2_zdir_batchsize);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/mvn.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/mvn.cpp
new file mode 100644 (file)
index 0000000..cae5410
--- /dev/null
@@ -0,0 +1,133 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <string>
+#include <unordered_set>
+#include <memory>
+#include <set>
+
+namespace vpu {
+
+namespace {
+
+class MVNStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<MVNStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        out[input] = 1.0f;
+        out[output] = 1.0f;
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[output] = input->desc().dimsOrder();
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        return DataMap<StridesRequirement>();
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto normalize = attrs().get<int>("normalize");
+        auto across_channels = attrs().get<int>("across_channels");
+
+        serializer.append(static_cast<int32_t>(normalize));
+        serializer.append(static_cast<int32_t>(across_channels));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeOldBuffer(handle_from_this(), serializer);
+        output->serializeOldBuffer(handle_from_this(), serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseMVN(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto layer = std::dynamic_pointer_cast<ie::MVNLayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+
+    float def_eps = 1e-9f;
+    float eps = layer->GetParamAsFloat("eps", def_eps);
+
+    if (eps > 1e-7f) {
+        VPU_THROW_EXCEPTION
+            << "Layer " << layer->name << " [" << layer->type
+            << "] has unsupported eps value: the kernel uses the constant 1e-9f, actual = " << eps;
+    }
+
+    auto stage = model->addNewStage<MVNStage>(
+        layer->name,
+        StageType::MVN,
+        layer,
+        inputs,
+        outputs);
+
+    stage->attrs().set<int>("normalize", layer->normalize);
+    stage->attrs().set<int>("across_channels", layer->across_channels);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/none.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/none.cpp
new file mode 100644 (file)
index 0000000..0755a5d
--- /dev/null
@@ -0,0 +1,79 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <string>
+#include <vector>
+#include <list>
+#include <memory>
+
+namespace vpu {
+
+namespace {
+
+class NoneStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<NoneStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        DataMap<float> out;
+
+        for (const auto& outEdge : _outputEdges) {
+            out[outEdge->output()] = 1.0f;
+        }
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        return DataMap<DimsOrder>();
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        return DataMap<StridesRequirement>();
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override {
+        return StageSHAVEsRequirements::NotNeeded;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer&) const override {
+    }
+
+    void serializeDataImpl(BlobSerializer&) const override {
+    }
+};
+
+}  // namespace
+
+Stage StageBuilder::addNoneStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    return model->addNewStage<NoneStage>(
+        name,
+        StageType::None,
+        layer,
+        inputs,
+        outputs);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/norm.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/norm.cpp
new file mode 100644 (file)
index 0000000..6cb2666
--- /dev/null
@@ -0,0 +1,151 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <unordered_set>
+#include <memory>
+#include <set>
+
+#include <precision_utils.h>
+
+namespace vpu {
+
+namespace {
+
+class LRNStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<LRNStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        out[input] = 1.0f;
+        out[output] = 1.0f;
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[output] = input->desc().dimsOrder();
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<StridesRequirement> out;
+
+        // LRN supports both HWC and CHW orders, but requires that input and output have the same stride
+
+        auto reqs = StridesRequirement::compact();
+        if (_type == StageType::LRN &&
+            input->desc().dimsOrder().dimInd(Dim::C) != 0) {
+            reqs.add(1, DimStride::Aligned);
+        }
+
+        out[input] = reqs;
+        out[output] = reqs;
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto size = attrs().get<int>("size");
+        auto k = attrs().get<int>("k");
+        auto alpha = attrs().get<float>("alpha");
+        auto beta = attrs().get<float>("beta");
+
+        serializer.append(static_cast<uint32_t>(size));
+        serializer.append(ie::PrecisionUtils::f32tof16(k));
+        serializer.append(ie::PrecisionUtils::f32tof16(alpha));
+        serializer.append(ie::PrecisionUtils::f32tof16(beta));
+        serializer.append(ie::PrecisionUtils::f32tof16(0));  // for alignment
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeOldBuffer(handle_from_this(), serializer);
+        output->serializeOldBuffer(handle_from_this(), serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseNorm(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto layer = std::dynamic_pointer_cast<ie::NormLayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+
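+    // _isAcrossMaps selects the classic across-maps LRN; otherwise the
+    // within-map (InnerLRN) variant is used.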
+    auto stage = model->addNewStage<LRNStage>(
+        layer->name,
+        layer->_isAcrossMaps ? StageType::LRN : StageType::InnerLRN,
+        layer,
+        inputs,
+        outputs);
+
+    stage->attrs().set<int>("size", layer->_size);
+    stage->attrs().set<int>("k", layer->_k);
+    stage->attrs().set<float>("alpha", layer->_alpha);
+    stage->attrs().set<float>("beta", layer->_beta);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/normalize.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/normalize.cpp
new file mode 100644 (file)
index 0000000..ca3fc11
--- /dev/null
@@ -0,0 +1,147 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <map>
+#include <unordered_set>
+#include <memory>
+#include <set>
+
+namespace vpu {
+
+namespace {
+
+class NormalizeStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<NormalizeStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto scales = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        out[input] = 1.0f;
+        out[scales] = 1.0f;
+        out[output] = 1.0f;
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[output] = input->desc().dimsOrder();
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        return DataMap<StridesRequirement>();
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto acrossSpatial = attrs().get<bool>("acrossSpatial");
+        auto channelShared = attrs().get<bool>("channelShared");
+        auto eps = attrs().get<float>("eps");
+
+        serializer.append(static_cast<int32_t>(acrossSpatial));
+        serializer.append(static_cast<int32_t>(channelShared));
+        serializer.append(static_cast<float>(eps));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto scales = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeOldBuffer(handle_from_this(), serializer);
+        output->serializeOldBuffer(handle_from_this(), serializer);
+        scales->serializeOldBuffer(handle_from_this(), serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseNormalize(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto acrossSpatial = layer->GetParamAsInt("across_spatial", 0);
+    auto channelShared = layer->GetParamAsInt("channel_shared", 0);
+    float eps = layer->GetParamAsFloat("eps", 0.0f);
+
+    auto weightsIt = layer->blobs.find("weights");
+    if (weightsIt == layer->blobs.end()) {
+        VPU_THROW_EXCEPTION << "Missing weights for " << layer->name << " layer";
+    }
+
+    auto weightsBlob = weightsIt->second;
+    IE_ASSERT(weightsBlob != nullptr);
+
+    auto output = outputs[0];
+
+    auto scales = model->addConstData(
+        layer->name + "@scales",
+        DataDesc({weightsBlob->size()}),
+        ieBlobContent(weightsBlob));
+
+    auto stage = model->addNewStage<NormalizeStage>(
+        layer->name,
+        StageType::Normalize,
+        layer,
+        {inputs[0], scales},
+        outputs);
+
+    stage->attrs().set<bool>("acrossSpatial", acrossSpatial);
+    stage->attrs().set<bool>("channelShared", channelShared);
+    stage->attrs().set<float>("eps", eps);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/pad.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/pad.cpp
new file mode 100644 (file)
index 0000000..31be091
--- /dev/null
@@ -0,0 +1,195 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <string>
+#include <vector>
+#include <list>
+#include <set>
+#include <unordered_set>
+#include <memory>
+
+namespace vpu {
+
+namespace {
+
+class PadStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<PadStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        if (step == ScalePropagationStep::Propagate) {
+            out[output] = inputScales.at(input);
+        } else {
+            // Copy can only propagate scaling.
+            out[input] = 1.0f;
+            out[output] = 1.0f;
+        }
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[output] = input->desc().dimsOrder();
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        return DataMap<StridesRequirement>();
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        // TODO: try merge with last dimension
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override {
+        return StageSHAVEsRequirements::CanBeLimited;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+
+        auto perm = input->desc().dimsOrder().toPermutation();
+        IE_ASSERT(perm.size() <= 4);
+
+        auto pad_value = attrs().get<float>("pad_value");
+        auto pad_mode = attrs().get<PadMode>("pad_mode");
+        const auto& pads_begin = attrs().get<DimValues>("pads_begin");
+        const auto& pads_end = attrs().get<DimValues>("pads_end");
+
+        int i = 0;
+        for (; i < perm.size(); ++i) {
+            serializer.append(static_cast<uint32_t>(pads_begin.get(perm[i], 0)));
+            serializer.append(static_cast<uint32_t>(pads_end.get(perm[i], 0)));
+        }
+        for (; i < 4; ++i) {
+            serializer.append(static_cast<uint32_t>(0));
+            serializer.append(static_cast<uint32_t>(0));
+        }
+
+        serializer.append(static_cast<float>(pad_value));
+        serializer.append(static_cast<uint32_t>(pad_mode));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeOldBuffer(handle_from_this(), serializer);
+        output->serializeOldBuffer(handle_from_this(), serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parsePad(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto layer = std::dynamic_pointer_cast<ie::PadLayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+
+    IE_ASSERT(layer->pads_begin.size() == 4);
+    IE_ASSERT(layer->pads_end.size() == 4);
+
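+    // IE stores the 4D pads in NCHW order; map each entry onto the named Dims.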
+    DimValues pads_begin;
+    pads_begin.set(Dim::W, layer->pads_begin[3]);
+    pads_begin.set(Dim::H, layer->pads_begin[2]);
+    pads_begin.set(Dim::C, layer->pads_begin[1]);
+    pads_begin.set(Dim::N, layer->pads_begin[0]);
+
+    DimValues pads_end;
+    pads_end.set(Dim::W, layer->pads_end[3]);
+    pads_end.set(Dim::H, layer->pads_end[2]);
+    pads_end.set(Dim::C, layer->pads_end[1]);
+    pads_end.set(Dim::N, layer->pads_end[0]);
+
+    _stageBuilder->addPadStage(
+        model,
+        layer->name,
+        layer,
+        static_cast<PadMode>(layer->pad_mode),
+        layer->pad_value,
+        pads_begin,
+        pads_end,
+        inputs[0],
+        outputs[0]);
+}
+
+Stage StageBuilder::addPadStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        PadMode padMode,
+        float pad_value,
+        const DimValues& pads_begin,
+        const DimValues& pads_end,
+        const Data& input,
+        const Data& output) {
+    auto stage = model->addNewStage<PadStage>(
+        name,
+        StageType::Pad,
+        layer,
+        {input},
+        {output});
+
+    stage->attrs().set<float>("pad_value", pad_value);
+    stage->attrs().set<PadMode>("pad_mode", padMode);
+    stage->attrs().set<DimValues>("pads_begin", pads_begin);
+    stage->attrs().set<DimValues>("pads_end", pads_end);
+
+    return stage;
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/permute.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/permute.cpp
new file mode 100644 (file)
index 0000000..a67d86e
--- /dev/null
@@ -0,0 +1,160 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <utility>
+#include <unordered_set>
+#include <memory>
+#include <set>
+
+namespace vpu {
+
+namespace {
+
+template <class Cont1, class Cont2>
+std::vector<typename Cont1::value_type> permuteArray(const Cont1& src, const Cont2& permutation) {
+    std::vector<typename Cont1::value_type> out(permutation.size());
+
+    for (int i = 0; i < out.size(); i++) {
+        auto newInd = static_cast<int>(permutation[i]);
+
+        IE_ASSERT(newInd >= 0);
+        IE_ASSERT(newInd < src.size());
+
+        out[i] = src[newInd];
+    }
+
+    return out;
+}
+
+class PermuteStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<PermuteStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        if (step == ScalePropagationStep::Propagate) {
+            out[output] = inputScales.at(input);
+        } else {
+            // Copy can only propagate scaling.
+            out[input] = 1.0f;
+            out[output] = 1.0f;
+        }
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[output] = input->desc().dimsOrder();
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        return DataMap<StridesRequirement>();
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override {
+        return StageSHAVEsRequirements::CanBeLimited;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+
+        const auto& order = attrs().get<std::vector<int>>("order");
+
+        auto perm = input->desc().dimsOrder().toPermutation();
+        auto ind = input->desc().dimsOrder().toIndices();
+
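+        // Re-index the layer's permutation through the data's current dim
+        // permutation, then translate the result into memory-order indices for
+        // the serialized kernel parameters.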
+        auto dimPerm = permuteArray(order, perm);
+        auto memoryOrderPerm = permuteArray(ind.toVector(-1), dimPerm);
+
+        int i = 0;
+        for (; i < memoryOrderPerm.size(); i++) {
+            serializer.append(static_cast<uint32_t>(memoryOrderPerm[i]));
+        }
+        for (; i < MAX_DIMS_32; i++) {
+            serializer.append(static_cast<uint32_t>(-1));
+        }
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeNewBuffer(serializer);
+        output->serializeNewBuffer(serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parsePermute(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto ieOrder = layer->GetParamAsInts("order");
+
+    auto maxIeOrder = *std::max_element(ieOrder.begin(), ieOrder.end());
+
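+    // Convert the IE permutation (indices over dims in major-to-minor order)
+    // into VPU's reversed, minor-to-major indexing: reverse the array and flip
+    // each index against the largest one. Illustrative example: an IE order of
+    // {0, 2, 3, 1} yields vpuOrder = {2, 0, 1, 3, -1, ...}.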
+    std::vector<int> vpuOrder(MAX_DIMS_64, -1);
+    for (size_t i = 0; i < ieOrder.size(); i++) {
+        vpuOrder[i] = maxIeOrder - ieOrder[ieOrder.size() - 1 - i];
+    }
+
+    auto stage = model->addNewStage<PermuteStage>(
+        layer->name,
+        StageType::Permute,
+        layer,
+        inputs,
+        outputs);
+
+    stage->attrs().set<std::vector<int>>("order", vpuOrder);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/pooling.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/pooling.cpp
new file mode 100644 (file)
index 0000000..27fc0b5
--- /dev/null
@@ -0,0 +1,187 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <unordered_set>
+#include <memory>
+#include <set>
+
+#include <ie_layers_internal.hpp>
+
+#include <vpu/compile_env.hpp>
+#include <vpu/stub_stage.hpp>
+
+namespace vpu {
+
+void FrontEnd::parsePooling(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    const auto& env = CompileEnv::get();
+
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto input = inputs[0];
+    auto output = outputs[0];
+
+    if (!(input->desc().numDims() == 3 || input->desc().numDims() == 4)) {
+        VPU_THROW_EXCEPTION << "Pooling supports only 3D or 4D input";
+    }
+    if (output->desc().numDims() != input->desc().numDims()) {
+        VPU_THROW_EXCEPTION << "Pooling requires input and output to have the same number of dimensions";
+    }
+
+    //
+    // Extract parameters
+    //
+
+    auto poolLayer = std::dynamic_pointer_cast<ie::PoolingLayer>(layer);
+    IE_ASSERT(poolLayer != nullptr);
+
+    int kernelSizeX = poolLayer->_kernel_x;
+    int kernelSizeY = poolLayer->_kernel_y;
+
+    int kernelStrideX = poolLayer->_stride_x;
+    int kernelStrideY = poolLayer->_stride_y;
+
+    auto paddings = getPaddings(*poolLayer);
+    int padLeft = paddings.begin.exist(ie::X_AXIS) ? paddings.begin[ie::X_AXIS] : 0;
+    int padRight = paddings.end.exist(ie::X_AXIS) ? paddings.end[ie::X_AXIS] : padLeft;
+    int padTop = paddings.begin.exist(ie::Y_AXIS) ? paddings.begin[ie::Y_AXIS] : 0;
+    int padBottom = paddings.end.exist(ie::Y_AXIS) ? paddings.end[ie::Y_AXIS] : padTop;
+
+    auto poolType = poolLayer->_type;
+
+    auto excludePad = poolLayer->_exclude_pad;
+
+    //
+    // Check if HW is applicable
+    //
+
+    auto stageType = StageType::None;
+    auto tryHW = env.config.hwOptimization;
+
+    if (poolType == ie::PoolingLayer::MAX) {
+        stageType = StageType::StubMaxPool;
+    } else if (poolType == ie::PoolingLayer::AVG) {
+        stageType = StageType::StubAvgPool;
+    } else {
+        VPU_THROW_EXCEPTION << "Pooling Layer " << poolLayer->name << " has unsupported type: " << poolType;
+    }
+
+    // HW restrictions
+    if (kernelStrideX != kernelStrideY) {
+        tryHW = false;
+    }
+
+    // check if HW pooling has correct output size
+    {
+        int iw = input->desc().dim(Dim::W);
+        int ih = input->desc().dim(Dim::H);
+
+        int ow = output->desc().dim(Dim::W);
+        int oh = output->desc().dim(Dim::H);
+
+        // take additional hw paddings into account
+        if ((iw % 2 == 1) && (kernelSizeX % 2 == 0) && (padRight == 0)) iw++;
+        if ((ih % 2 == 1) && (kernelSizeY % 2 == 0) && (padBottom == 0)) ih++;
+
+        int tempX = iw + (padLeft + padRight) - kernelSizeX;
+        int tempY = ih + (padBottom + padTop) - kernelSizeY;
+
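+        // kernelStrideX is used for both axes here: the HW path requires equal
+        // X/Y strides (checked above), so for HW candidates this is equivalent.
+        // Both floor ("WithOutCeil") and ceil roundings are accepted, since the
+        // IR may have been produced with either convention.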
+        int outWidthWithOutCeil = (tempX + kernelStrideX) / kernelStrideX;
+        int outHeightWithOutCeil = (tempY + kernelStrideX) / kernelStrideX;
+
+        int outWidthWithCeil =  static_cast<int>(std::ceil(static_cast<double>(tempX) / kernelStrideX + 1));
+        int outHeightWithCeil = static_cast<int>(std::ceil(static_cast<double>(tempY) / kernelStrideX + 1));
+
+        if ((ow != outWidthWithCeil) && (ow != outWidthWithOutCeil)) {
+            tryHW = false;
+        }
+
+        if ((oh != outHeightWithCeil) && (oh != outHeightWithOutCeil)) {
+            tryHW = false;
+        }
+    }
+
+    // HW restrictions
+    if (kernelSizeX > 15 ||
+        kernelSizeY > 15 ||
+        kernelStrideX > 8) {
+        tryHW = false;
+    }
+
+    // TODO: 3x3s2 Avg pooling is not supported by HW
+    if (kernelSizeX == 3 && kernelSizeY == 3 && kernelStrideX == 2 && poolType == ie::PoolingLayer::AVG) {
+        tryHW = false;
+    }
+
+    // TODO: Avg pooling with even kernel size and odd input is not supported
+    if ((kernelSizeX % 2 == 0 || kernelSizeY % 2 == 0)) {
+        if (input->desc().dim(Dim::W) % 2 == 1 || input->desc().dim(Dim::H) % 2 == 1) {
+            if (poolType == ie::PoolingLayer::PoolType::AVG) {
+                tryHW = false;
+            }
+        }
+    }
+
+    // TODO: 5x5s3 Avg pooling hangs the device
+    if (kernelSizeX == 5 && kernelSizeY == 5 && kernelStrideX == 3 && poolType == ie::PoolingLayer::PoolType::AVG) {
+        tryHW = false;
+    }
+
+    // TODO: 2x2s2 1278x718 HW MAX pool performs worse than the SW version
+    if ((kernelSizeX % 2 == 0 || kernelSizeY % 2 == 0)) {
+        if (input->desc().dim(Dim::W) > 1000 || input->desc().dim(Dim::H) > 700) {
+            tryHW = false;
+        }
+    }
+
+    //  FIX #14949, enable HW AVG pooling, need SW postproc
+    if (excludePad && poolType == ie::PoolingLayer::PoolType::AVG) {
+        if (output->desc().dim(Dim::W) == 5 &&
+            output->desc().dim(Dim::H) == 5 &&
+            kernelSizeX == 5 &&
+            kernelSizeY == 5) {
+            tryHW = false;
+        }
+    }
+
+    if (env.netConfig.hwDisabled(layer->name)) {
+        tryHW = false;
+    }
+
+    //
+    // Create stub stage
+    //
+
+    auto stage = model->addNewStage<StubStage>(
+        layer->name,
+        stageType,
+        layer,
+        {input},
+        {output});
+
+    stage->attrs().set<int>("kernelSizeX", kernelSizeX);
+    stage->attrs().set<int>("kernelSizeY", kernelSizeY);
+
+    stage->attrs().set<int>("kernelStrideX", kernelStrideX);
+    stage->attrs().set<int>("kernelStrideY", kernelStrideY);
+
+    stage->attrs().set<int>("padLeft", padLeft);
+    stage->attrs().set<int>("padRight", padRight);
+    stage->attrs().set<int>("padTop", padTop);
+    stage->attrs().set<int>("padBottom", padBottom);
+
+    stage->attrs().set<bool>("excludePad", excludePad);
+
+    stage->attrs().set<bool>("tryHW", tryHW);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/power.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/power.cpp
new file mode 100644 (file)
index 0000000..dde6e49
--- /dev/null
@@ -0,0 +1,119 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+#include <set>
+#include <string>
+
+#include <vpu/sw/post_op_stage.hpp>
+
+namespace vpu {
+
+void FrontEnd::parsePower(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto layer = std::dynamic_pointer_cast<ie::PowerLayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+
+    _stageBuilder->addPowerStage(
+        model,
+        layer->name,
+        layer,
+        layer->scale,
+        layer->power,
+        layer->offset,
+        inputs[0],
+        outputs[0]);
+}
+
+namespace {
+
+class PowerStage final : public PostOpStage {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<PowerStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto power = attrs().get<float>("power");
+        auto& scale = attrs().get<float>("scale");
+        auto& bias = attrs().get<float>("bias");
+
+        DataMap<float> out;
+
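+        // A non-unit power makes the op non-linear, so scale factors cannot be
+        // propagated through it and both sides are pinned to 1.0f. With
+        // power == 1 the stage is affine (scale * x + bias), and an input scale
+        // can be folded into the stage's own scale/bias parameters below.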
+        if (power != 1.0f) {
+            out[input] = 1.0f;
+            out[output] = 1.0f;
+        } else {
+            auto inputScale = inputScales.at(input);
+
+            out[output] = inputScale;
+
+            if (step == ScalePropagationStep::ScaleInput) {
+                scale *= inputScale;
+            }
+            if (step != ScalePropagationStep::Check) {
+                bias *= inputScale;
+            }
+        }
+
+        return out;
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto scale = attrs().get<float>("scale");
+        auto power = attrs().get<float>("power");
+        auto bias = attrs().get<float>("bias");
+
+        serializer.append(static_cast<float>(bias));
+        serializer.append(static_cast<float>(scale));
+        serializer.append(static_cast<float>(power));
+    }
+};
+
+}  // namespace
+
+Stage StageBuilder::addPowerStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        float scale,
+        float power,
+        float bias,
+        const Data& input,
+        const Data& output) {
+    auto stage = model->addNewStage<PowerStage>(
+        name,
+        StageType::Power,
+        layer,
+        {input},
+        {output});
+
+    stage->attrs().set<float>("scale", scale);
+    stage->attrs().set<float>("power", power);
+    stage->attrs().set<float>("bias", bias);
+
+    return stage;
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/prelu.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/prelu.cpp
new file mode 100644 (file)
index 0000000..0ba04b9
--- /dev/null
@@ -0,0 +1,63 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+
+#include <vpu/sw/post_op_stage.hpp>
+
+namespace vpu {
+
+namespace {
+
+class PReluStage final : public PostOpStage {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<PReluStage>(*this);
+    }
+
+    void serializeParamsImpl(BlobSerializer&) const override {
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parsePReLU(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto weightsIt = layer->blobs.find("weights");
+    if (weightsIt == layer->blobs.end()) {
+        THROW_IE_EXCEPTION << "[VPU] PReLU doesn't have weights";
+    }
+
+    auto weightsBlob = weightsIt->second;
+    IE_ASSERT(weightsBlob != nullptr);
+
+    auto channelShared = layer->GetParamAsInt("channel_shared", 0);
+
+    auto output = outputs[0];
+
+    auto weights = model->addConstData(
+        layer->name + "@weights",
+        DataDesc({output->desc().dim(Dim::C)}),
+        ieBlobContent(
+            weightsBlob,
+            channelShared ? output->desc().dim(Dim::C) : 1));
+
+    model->addNewStage<PReluStage>(
+        layer->name,
+        StageType::PRelu,
+        layer,
+        {inputs[0], weights},
+        outputs);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/priorbox.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/priorbox.cpp
new file mode 100644 (file)
index 0000000..4005663
--- /dev/null
@@ -0,0 +1,224 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <cmath>
+
+#include <algorithm>
+#include <vector>
+#include <memory>
+
+#include <precision_utils.h>
+#include <ie_parallel.hpp>
+
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+namespace {
+
+class PriorBoxContent final : public CalculatedDataContent {
+public:
+    PriorBoxContent(
+            const DataDesc& inDesc0,
+            const DataDesc& inDesc1,
+            const DataDesc& outDesc,
+            const ie::CNNLayerPtr& layer) :
+            _inDesc0(inDesc0), _inDesc1(inDesc1), _outDesc(outDesc),
+            _layer(layer) {
+        IE_ASSERT(layer != nullptr);
+    }
+
+protected:
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>&, void* tempBuf) const override {
+        VPU_PROFILE(PriorBoxContent);
+
+        auto tempPtr = static_cast<fp16_t*>(tempBuf);
+
+        auto _min_sizes = _layer->GetParamAsFloats("min_size", {});
+        auto _max_sizes = _layer->GetParamAsFloats("max_size", {});
+        auto aspect_ratios = _layer->GetParamAsFloats("aspect_ratio");
+        auto _flip = static_cast<bool>(_layer->GetParamAsInt("flip"));
+        auto _clip = static_cast<bool>(_layer->GetParamAsInt("clip"));
+        auto _variance = _layer->GetParamAsFloats("variance");
+        auto _img_h = _layer->GetParamAsInt("img_h", 0);
+        auto _img_w = _layer->GetParamAsInt("img_w", 0);
+        auto _step = _layer->GetParamAsFloat("step", 0);
+        auto _step_h = _layer->GetParamAsFloat("step_h", 0);
+        auto _step_w = _layer->GetParamAsFloat("step_w", 0);
+        auto _offset = _layer->GetParamAsFloat("offset", 0);
+        auto _scale_all_sizes = static_cast<bool>(_layer->GetParamAsInt("scale_all_sizes", 1));
+
+        std::vector<float> _aspect_ratios;
+        _aspect_ratios.reserve(aspect_ratios.size() + 1);
+
+        _aspect_ratios.push_back(1.0f);
+        for (auto aspect_ratio : aspect_ratios) {
+            bool exist = false;
+
+            for (float _aspect_ratio : _aspect_ratios) {
+                if (fabs(aspect_ratio - _aspect_ratio) < 1e-6) {
+                    exist = true;
+                    break;
+                }
+            }
+            if (!exist) {
+                _aspect_ratios.push_back(aspect_ratio);
+                if (_flip) {
+                    _aspect_ratios.push_back(1.0f / aspect_ratio);
+                }
+            }
+        }
+
+        int _num_priors;
+        if (_scale_all_sizes) {
+            _num_priors = static_cast<int>(_aspect_ratios.size() * _min_sizes.size());
+        } else {
+            _num_priors = static_cast<int>(_aspect_ratios.size() + _min_sizes.size() - 1);
+        }
+
+        // Each max_size entry adds one extra (square) prior.
+        _num_priors += static_cast<int>(_max_sizes.size());
+
+        auto W  = _inDesc0.dim(Dim::W);
+        auto H  = _inDesc0.dim(Dim::H);
+        auto IW = _img_w == 0 ? _inDesc1.dim(Dim::W) : _img_w;
+        auto IH = _img_h == 0 ? _inDesc1.dim(Dim::H) : _img_h;
+
+        auto OW = (_outDesc.numDims() >= 4) ? _outDesc.dim(Dim::N) : 1;
+        auto OH = _outDesc.dim(Dim::W);
+
+        float step_x = 0.0f;
+        float step_y = 0.0f;
+
+        if (_step == 0) {
+            step_x = static_cast<float>(IW) / W;
+            step_y = static_cast<float>(IH) / H;
+        } else {
+            step_x = _step;
+            step_y = _step;
+        }
+
+        auto dst_data = tempPtr;
+
+        int dim = H * W * _num_priors * 4;
+        float center_x = 0.0f;
+        float center_y = 0.0f;
+
+        float box_width;
+        float box_height;
+
+        if (_outDesc.dim(Dim::W) != dim || _outDesc.dim(Dim::H) != 2) {
+            THROW_IE_EXCEPTION << "[VPU] PriorBox output has invalid dimensions, expected " << dim << "x2"
+                               << ", got " << _outDesc.dim(Dim::W) << "x" << _outDesc.dim(Dim::H) << ", layer name is: " << _layer->name;
+        }
+
+        int idx = 0;
+        for (int h = 0; h < H; ++h) {
+            for (int w = 0; w < W;  ++w) {
+                for (size_t msIdx = 0; msIdx < _min_sizes.size(); msIdx++) {
+                    if (_step == 0) {
+                        center_x = (w + 0.5f) * step_x;
+                        center_y = (h + 0.5f) * step_y;
+                    } else {
+                        center_x = (_offset + w) * _step;
+                        center_y = (_offset + h) * _step;
+                    }
+
+                    box_width = _min_sizes[msIdx];
+                    box_height = _min_sizes[msIdx];
+
+                    dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x - box_width / 2.0f) / IW);
+                    dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y - box_height / 2.0f) / IH);
+                    dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x + box_width / 2.0f) / IW);
+                    dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y + box_height / 2.0f) / IH);
+
+                    if (_max_sizes.size() > msIdx) {
+                        box_width = box_height = sqrt(_min_sizes[msIdx] * _max_sizes[msIdx]);
+
+                        dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x - box_width / 2.0f) / IW);
+                        dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y - box_height / 2.0f) / IH);
+                        dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x + box_width / 2.0f) / IW);
+                        dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y + box_height / 2.0f) / IH);
+                    }
+
+                    if (_scale_all_sizes || (!_scale_all_sizes && (msIdx == _min_sizes.size() - 1))) {
+                        size_t sIdx = _scale_all_sizes ? msIdx : 0;
+                        for (float ar : _aspect_ratios) {
+                            if (fabs(ar - 1.0f) < 1e-6) {
+                                continue;
+                            }
+
+                            box_width = _min_sizes[sIdx] * sqrt(ar);
+                            box_height = _min_sizes[sIdx] / sqrt(ar);
+
+                            dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x - box_width / 2.0f) / IW);
+                            dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y - box_height / 2.0f) / IH);
+                            dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x + box_width / 2.0f) / IW);
+                            dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y + box_height / 2.0f) / IH);
+                        }
+                    }
+                }
+            }
+        }
+
+        if (_clip) {
+            for (int d = 0; d < dim; ++d) {
+                dst_data[d] = (std::min)((std::max)(dst_data[d], ie::PrecisionUtils::f32tof16(0.0f)), ie::PrecisionUtils::f32tof16(1.0f));
+            }
+        }
+
+        int channel_size = OH * OW;
+
+        dst_data += channel_size;
+
+        if (_variance.size() == 1) {
+            ie::parallel_for(channel_size, [&](int i) {
+                dst_data[i] = ie::PrecisionUtils::f32tof16(_variance[0]);
+            });
+        } else {
+            ie::parallel_for4d(H, W, _num_priors, 4, [&](int h, int w, int i, int j) {
+                dst_data[j + 4 * (i + _num_priors * (w + W * h))] = ie::PrecisionUtils::f32tof16(_variance[j]);
+            });
+        }
+    }
+
+private:
+    DataDesc _inDesc0;
+    DataDesc _inDesc1;
+    DataDesc _outDesc;
+    ie::CNNLayerPtr _layer;
+};
+
+}  // namespace
+
+void FrontEnd::parsePriorBox(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 2);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto input0 = inputs[0];
+    auto input1 = inputs[1];
+    auto output = outputs[0];
+
+    auto resultData = model->addConstData(
+        output->name(),
+        output->desc(),
+        std::make_shared<PriorBoxContent>(input0->desc(), input1->desc(), output->desc(), layer));
+
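+    // If the result is consumed downstream (or is a network output), materialize
+    // it through a copy stage; otherwise bind the precomputed constant directly
+    // in place of the intermediate data.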
+    if (output->usage() == DataUsage::Output || output->numConsumers() > 0) {
+        _stageBuilder->addCopyStage(model, layer->name, layer, resultData, output);
+    } else {
+        IE_ASSERT(output->usage() == DataUsage::Intermediate);
+
+        bindData(resultData, output->origData());
+    }
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/priorbox_clustered.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/priorbox_clustered.cpp
new file mode 100644 (file)
index 0000000..ab4e8e0
--- /dev/null
@@ -0,0 +1,148 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <algorithm>
+#include <vector>
+#include <memory>
+
+#include <precision_utils.h>
+#include <ie_parallel.hpp>
+
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+namespace {
+
+class PriorBoxClusteredContent final : public CalculatedDataContent {
+public:
+    PriorBoxClusteredContent(
+            const DataDesc& inDesc0,
+            const DataDesc& inDesc1,
+            const DataDesc& outDesc,
+            const ie::CNNLayerPtr& layer) :
+            _inDesc0(inDesc0), _inDesc1(inDesc1), _outDesc(outDesc),
+            _layer(layer) {
+        IE_ASSERT(layer != nullptr);
+    }
+
+protected:
+    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>&, void* tempBuf) const override {
+        VPU_PROFILE(PriorBoxClusteredContent);
+
+        auto tempPtr = static_cast<fp16_t*>(tempBuf);
+
+        auto widths_ = _layer->GetParamAsFloats("width");
+        auto heights_ = _layer->GetParamAsFloats("height");
+        auto clip_ = _layer->GetParamAsInt("clip");
+        auto variance_ = _layer->GetParamAsFloats("variance");
+        auto img_h_ = _layer->GetParamAsInt("img_h", 0);
+        auto img_w_ = _layer->GetParamAsInt("img_w", 0);
+        auto step_ = _layer->GetParamAsFloat("step", 0);
+        auto step_h_ = _layer->GetParamAsFloat("step_h", 0);
+        auto step_w_ = _layer->GetParamAsFloat("step_w", 0);
+        auto offset_ = _layer->GetParamAsFloat("offset", 0);
+
+        auto num_priors_ = widths_.size();
+
+        if (variance_.empty()) {
+            variance_.push_back(0.1f);  // default variance when none is specified
+        }
+
+        auto layer_width  = _inDesc0.dim(Dim::W);
+        auto layer_height = _inDesc0.dim(Dim::H);
+
+        auto img_width  = img_w_ == 0 ? _inDesc1.dim(Dim::W) : img_w_;
+        auto img_height = img_h_ == 0 ? _inDesc1.dim(Dim::H) : img_h_;
+
+        auto step_w = step_w_ == 0 ? step_ : step_w_;
+        auto step_h = step_h_ == 0 ? step_ : step_h_;
+        if (step_w == 0 || step_h == 0) {
+            step_w = static_cast<float>(img_width) / layer_width;
+            step_h = static_cast<float>(img_height) / layer_height;
+        }
+
+        auto expected_output_dimx = layer_height * layer_width * num_priors_ * 4;
+        if (_outDesc.dim(Dim::W) != expected_output_dimx || _outDesc.dim(Dim::H) != 2) {
+            THROW_IE_EXCEPTION << "PriorBoxClustered output has invalid dimensions, expected " << expected_output_dimx << "x2"
+                               << ", got " << _outDesc.dim(Dim::W) << "x" << _outDesc.dim(Dim::H) << ", layer name is: " << _layer->name;
+        }
+
+        auto offset = _outDesc.dim(Dim::W);
+        auto var_size = variance_.size();
+
+        auto top_data_0 = tempPtr;
+        auto top_data_1 = top_data_0 + offset;
+
+        ie::parallel_for2d(layer_height, layer_width, [=](int h, int w) {
+            auto center_x = (w + offset_) * step_w;
+            auto center_y = (h + offset_) * step_h;
+
+            for (int s = 0; s < num_priors_; ++s) {
+                auto box_width  = widths_[s];
+                auto box_height = heights_[s];
+
+                auto xmin = (center_x - box_width  / 2.0f) / img_width;
+                auto ymin = (center_y - box_height / 2.0f) / img_height;
+                auto xmax = (center_x + box_width  / 2.0f) / img_width;
+                auto ymax = (center_y + box_height / 2.0f) / img_height;
+
+                if (clip_) {
+                    xmin = std::min(std::max(xmin, 0.0f), 1.0f);
+                    ymin = std::min(std::max(ymin, 0.0f), 1.0f);
+                    xmax = std::min(std::max(xmax, 0.0f), 1.0f);
+                    ymax = std::min(std::max(ymax, 0.0f), 1.0f);
+                }
+
+                top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 0] = ie::PrecisionUtils::f32tof16(xmin);
+                top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 1] = ie::PrecisionUtils::f32tof16(ymin);
+                top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 2] = ie::PrecisionUtils::f32tof16(xmax);
+                top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 3] = ie::PrecisionUtils::f32tof16(ymax);
+
+                for (int j = 0; j < var_size; j++) {
+                    auto index = h * layer_width * num_priors_ * var_size + w * num_priors_ * var_size + s * var_size + j;
+                    top_data_1[index] = ie::PrecisionUtils::f32tof16(variance_[j]);
+                }
+            }
+        });
+    }
+
+private:
+    DataDesc _inDesc0;
+    DataDesc _inDesc1;
+    DataDesc _outDesc;
+    ie::CNNLayerPtr _layer;
+};
+
+}  // namespace
+
+void FrontEnd::parsePriorBoxClustered(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 2);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto input0 = inputs[0];
+    auto input1 = inputs[1];
+    auto output = outputs[0];
+
+    auto resultData = model->addConstData(
+        output->name(),
+        output->desc(),
+        std::make_shared<PriorBoxClusteredContent>(input0->desc(), input1->desc(), output->desc(), layer));
+
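+    // Same binding logic as PriorBox: copy when consumed, bind otherwise.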
+    if (output->usage() == DataUsage::Output || output->numConsumers() > 0) {
+        _stageBuilder->addCopyStage(model, layer->name, layer, resultData, output);
+    } else {
+        IE_ASSERT(output->usage() == DataUsage::Intermediate);
+
+        bindData(resultData, output->origData());
+    }
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/proposal.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/proposal.cpp
new file mode 100644 (file)
index 0000000..a28df94
--- /dev/null
@@ -0,0 +1,224 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <string>
+#include <unordered_set>
+#include <memory>
+#include <set>
+
+#include <details/caseless.hpp>
+
+namespace vpu {
+
+namespace {
+
+class ProposalStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ProposalStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+        auto input2 = _inputEdges[2]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        out[input0] = 1.0f;
+        out[input1] = 1.0f;
+        out[input2] = 1.0f;
+        out[output] = 1.0f;
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+
+        DataMap<DimsOrder> out;
+
+        out[input0] = input0->desc().dimsOrder().createMovedDim(Dim::C, 2);
+        out[input1] = input1->desc().dimsOrder().createMovedDim(Dim::C, 2);
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+        auto input2 = _inputEdges[2]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<StridesRequirement> out;
+
+        out[input0] = StridesRequirement::compact();
+        out[input1] = StridesRequirement::compact();
+        out[input2] = StridesRequirement::compact();
+        out[output] = StridesRequirement::compact();
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto feat_stride = attrs().get<int>("feat_stride");
+        auto base_size = attrs().get<int>("base_size");
+        auto min_size = attrs().get<int>("min_size");
+        auto pre_nms_topn = attrs().get<int>("pre_nms_topn");
+        auto post_nms_topn = attrs().get<int>("post_nms_topn");
+        auto nms_thresh = attrs().get<float>("nms_thresh");
+        auto pre_nms_thresh = attrs().get<float>("pre_nms_thresh");
+        auto box_size_scale = attrs().get<float>("box_size_scale");
+        auto box_coordinate_scale = attrs().get<float>("box_coordinate_scale");
+        auto coordinates_offset = attrs().get<float>("coordinates_offset");
+        auto initial_clip = attrs().get<bool>("initial_clip");
+        auto clip_before_nms = attrs().get<bool>("clip_before_nms");
+        auto clip_after_nms = attrs().get<bool>("clip_after_nms");
+        auto normalize = attrs().get<bool>("normalize");
+
+        auto shift_anchors = attrs().get<bool>("shift_anchors");
+        auto round_ratios = attrs().get<bool>("round_ratios");
+        auto swap_xy = attrs().get<bool>("swap_xy");
+        const auto& scales = attrs().get<std::vector<float>>("scales");
+        const auto& ratios = attrs().get<std::vector<float>>("ratios");
+
+        serializer.append(static_cast<uint32_t>(feat_stride));
+        serializer.append(static_cast<uint32_t>(base_size));
+        serializer.append(static_cast<uint32_t>(min_size));
+        serializer.append(static_cast<int32_t>(pre_nms_topn));
+        serializer.append(static_cast<int32_t>(post_nms_topn));
+        serializer.append(static_cast<float>(nms_thresh));
+        serializer.append(static_cast<float>(pre_nms_thresh));
+        serializer.append(static_cast<float>(box_size_scale));
+        serializer.append(static_cast<float>(box_coordinate_scale));
+        serializer.append(static_cast<float>(coordinates_offset));
+        serializer.append(static_cast<uint32_t>(initial_clip));
+        serializer.append(static_cast<uint32_t>(clip_before_nms));
+        serializer.append(static_cast<uint32_t>(clip_after_nms));
+        serializer.append(static_cast<uint32_t>(normalize));
+        serializer.append(static_cast<uint32_t>(shift_anchors));
+        serializer.append(static_cast<uint32_t>(round_ratios));
+        serializer.append(static_cast<uint32_t>(swap_xy));
+
+        auto serializeVector = [&serializer](const std::vector<float>& array) {
+            serializer.append(static_cast<uint32_t>(array.size()));
+            for (auto elem : array) {
+                serializer.append(static_cast<float>(elem));
+            }
+        };
+
+        serializeVector(scales);
+        serializeVector(ratios);
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.size() == 1);
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+        auto input2 = _inputEdges[2]->input();
+        auto output = _outputEdges[0]->output();
+
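+        // NB: the output buffer is deliberately serialized between input0
+        // and the remaining inputs; the consumer relies on this ordering.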
+        input0->serializeNewBuffer(serializer);
+        output->serializeNewBuffer(serializer);
+        input1->serializeNewBuffer(serializer);
+        input2->serializeNewBuffer(serializer);
+        _tempBufferEdges[0]->tempBuffer()->serializeNewBuffer(serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseProposal(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    ie::details::CaselessEq<std::string> cmp;
+
+    IE_ASSERT(inputs.size() == 3);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto stage = model->addNewStage<ProposalStage>(
+        layer->name,
+        StageType::Proposal,
+        layer,
+        inputs,
+        outputs);
+
+    stage->attrs().set<int>("feat_stride", layer->GetParamAsInt("feat_stride", 16));
+    stage->attrs().set<int>("base_size", layer->GetParamAsInt("base_size", 16));
+    stage->attrs().set<int>("min_size", layer->GetParamAsInt("min_size", 16));
+    stage->attrs().set<int>("pre_nms_topn", layer->GetParamAsInt("pre_nms_topn", 6000));
+    stage->attrs().set<int>("post_nms_topn", layer->GetParamAsInt("post_nms_topn", 300));
+    stage->attrs().set<float>("nms_thresh", layer->GetParamAsFloat("nms_thresh", 0.7f));
+    stage->attrs().set<float>("pre_nms_thresh", layer->GetParamAsFloat("pre_nms_thresh", 0.1f));
+    stage->attrs().set<float>("box_size_scale", layer->GetParamAsFloat("box_size_scale", 1.0f));
+    stage->attrs().set<float>("box_coordinate_scale", layer->GetParamAsFloat("box_coordinate_scale", 1.0f));
+    stage->attrs().set<bool>("clip_before_nms", layer->GetParamAsBool("clip_before_nms", true));
+    stage->attrs().set<bool>("clip_after_nms", layer->GetParamAsBool("clip_after_nms", false));
+    stage->attrs().set<bool>("normalize", layer->GetParamAsBool("normalize", false));
+
+    if (cmp(layer->GetParamAsString("framework", ""), "TensorFlow")) {
+        // Settings for TensorFlow
+        stage->attrs().set<float>("coordinates_offset", 0.0f);
+        stage->attrs().set<bool>("initial_clip", true);
+        stage->attrs().set<bool>("shift_anchors", true);
+        stage->attrs().set<bool>("round_ratios", false);
+        stage->attrs().set<bool>("swap_xy", true);
+    } else {
+        // Settings for Caffe
+
+        stage->attrs().set<float>("coordinates_offset", 1.0f);
+        stage->attrs().set<bool>("initial_clip", false);
+        stage->attrs().set<bool>("shift_anchors", false);
+        stage->attrs().set<bool>("round_ratios", true);
+        stage->attrs().set<bool>("swap_xy", false);
+    }
+
+    auto scales = layer->GetParamAsFloats("scale", {});
+    auto ratios = layer->GetParamAsFloats("ratio", {});
+
+    stage->attrs().set("scales", scales);
+    stage->attrs().set("ratios", ratios);
+
+    int number_of_anchors = ratios.size() * scales.size();
+
+    // Allocate a slightly larger buffer than needed to handle the remainder when work is distributed among SHAVEs
+    int buffer_size = (inputs[0]->desc().dim(Dim::H) + 16) * inputs[0]->desc().dim(Dim::W) * number_of_anchors * 5 * sizeof(float);
+
+    model->addTempBuffer(
+        stage,
+        DataDesc({buffer_size}));
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/psroipooling.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/psroipooling.cpp
new file mode 100644
index 0000000..dd1c564
--- /dev/null
@@ -0,0 +1,131 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <unordered_set>
+#include <memory>
+#include <set>
+
+namespace vpu {
+
+namespace {
+
+class PSROIPoolingStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<PSROIPoolingStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        out[input0] = 1.0f;
+        out[input1] = 1.0f;
+        out[output] = 1.0f;
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[input0] = input0->desc().dimsOrder().createMovedDim(Dim::C, 2);
+        out[output] = output->desc().dimsOrder().createMovedDim(Dim::C, 2);
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<StridesRequirement> out;
+
+        out[input0] = StridesRequirement::compact();
+        out[input1] = StridesRequirement::compact();
+        out[output] = StridesRequirement::compact();
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto group_size = attrs().get<int>("group_size");
+        auto output_dim = attrs().get<int>("output_dim");
+        auto spatial_scale = attrs().get<float>("spatial_scale");
+
+        serializer.append(static_cast<uint32_t>(group_size));
+        serializer.append(static_cast<uint32_t>(output_dim));
+        serializer.append(static_cast<float>(spatial_scale));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        input0->serializeNewBuffer(serializer);
+        output->serializeNewBuffer(serializer);
+        input1->serializeNewBuffer(serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parsePSROIPooling(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 2);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto stage = model->addNewStage<PSROIPoolingStage>(
+        layer->name,
+        StageType::PSROIPooling,
+        layer,
+        inputs,
+        outputs);
+
+    stage->attrs().set<int>("group_size", layer->GetParamAsInt("group_size", 7));
+    stage->attrs().set<int>("output_dim", layer->GetParamAsInt("output_dim", 21));
+    stage->attrs().set<float>("spatial_scale", layer->GetParamAsFloat("spatial_scale", 0.0625f));
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/region_yolo.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/region_yolo.cpp
new file mode 100644
index 0000000..b786240
--- /dev/null
@@ -0,0 +1,144 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <unordered_set>
+#include <memory>
+#include <set>
+
+namespace vpu {
+
+namespace {
+
+class RegionYoloStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<RegionYoloStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        out[input] = 1.0f;
+        out[output] = 1.0f;
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        if (!attrs().get<bool>("doSoftMax")) {
+            out[output] = output->desc().dimsOrder().createMovedDim(Dim::C, 2);  // CHW
+        }
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+
+        DataMap<StridesRequirement> out;
+
+        if (attrs().get<bool>("doSoftMax")) {
+            // Major dimension must be compact.
+            out[input] = StridesRequirement().add(2, DimStride::Compact);
+        }
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto classes = attrs().get<int>("classes");
+        auto coords = attrs().get<int>("coords");
+        auto num = attrs().get<int>("num");
+        auto maskSize = attrs().get<int>("maskSize");
+        auto doSoftMax = attrs().get<bool>("doSoftMax");
+
+        serializer.append(static_cast<int32_t>(classes));
+        serializer.append(static_cast<int32_t>(coords));
+        serializer.append(static_cast<int32_t>(num));
+        serializer.append(static_cast<int32_t>(maskSize));
+        serializer.append(static_cast<int32_t>(doSoftMax));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeOldBuffer(handle_from_this(), serializer);
+        output->serializeOldBuffer(handle_from_this(), serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseRegionYolo(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto mask = layer->GetParamAsInts("mask", {});
+
+    auto stage = model->addNewStage<RegionYoloStage>(
+        layer->name,
+        StageType::RegionYolo,
+        layer,
+        inputs,
+        outputs);
+
+    stage->attrs().set<int>("classes", layer->GetParamAsInt("classes", 20));
+    stage->attrs().set<int>("coords", layer->GetParamAsInt("coords", 4));
+    stage->attrs().set<int>("num", layer->GetParamAsInt("num", 5));
+    stage->attrs().set<int>("maskSize", static_cast<int>(mask.size()));
+    stage->attrs().set<bool>("doSoftMax", layer->GetParamAsInt("do_softmax", 1));
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/relu.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/relu.cpp
new file mode 100644
index 0000000..1d18320
--- /dev/null
@@ -0,0 +1,112 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <cmath>
+
+#include <vector>
+#include <limits>
+#include <memory>
+#include <set>
+#include <string>
+
+#include <vpu/sw/post_op_stage.hpp>
+
+namespace vpu {
+
+namespace {
+
+class ReLUStage final : public PostOpStage {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ReLUStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        if (step == ScalePropagationStep::Propagate) {
+            auto inputScale = inputScales.at(input);
+
+            out[output] = inputScale;
+        } else {
+            // ReLU can only propagate scaling, not generate it.
+            out[input] = 1.0f;
+            out[output] = 1.0f;
+        }
+
+        return out;
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto negativeSlope = attrs().get<float>("negativeSlope");
+
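+        // The first serialized field flags whether a biases input is
+        // attached: addReLUStage adds a second input edge for bias variants.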
+        serializer.append(static_cast<uint32_t>(_inputEdges.size() == 2));
+        serializer.append(negativeSlope);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseReLU(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto layer = std::dynamic_pointer_cast<ie::ReLULayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+
+    _stageBuilder->addReLUStage(model, layer->name, layer, layer->negative_slope, inputs[0], outputs[0]);
+}
+
+Stage StageBuilder::addReLUStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        float negativeSlope,
+        const Data& input,
+        const Data& output,
+        const Data& biases) {
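+    // Pick the concrete stage type: a (near-)zero negative slope selects the
+    // plain ReLU kinds, and a non-null biases Data selects the bias variants.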
+    auto stageType = StageType::__SPECIAL_START__;
+    if (biases == nullptr) {
+        stageType =
+            std::fabs(negativeSlope) < std::numeric_limits<float>::epsilon() ?
+                StageType::Relu :
+                StageType::LeakyRelu;
+    } else {
+        stageType =
+            std::fabs(negativeSlope) < std::numeric_limits<float>::epsilon() ?
+                StageType::BiasRelu :
+                StageType::BiasLeakyRelu;
+    }
+
+    auto stage = model->addNewStage<ReLUStage>(
+        name,
+        stageType,
+        layer,
+        {input},
+        {output});
+
+    if (biases != nullptr) {
+        model->addStageInput(stage, biases);
+    }
+
+    stage->attrs().set<float>("negativeSlope", negativeSlope);
+
+    return stage;
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/reorg_yolo.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/reorg_yolo.cpp
new file mode 100644
index 0000000..763fe55
--- /dev/null
@@ -0,0 +1,140 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <unordered_set>
+#include <memory>
+#include <set>
+
+namespace vpu {
+
+namespace {
+
+class ReorgYoloStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ReorgYoloStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        if (step == ScalePropagationStep::Propagate) {
+            out[output] = inputScales.at(input);
+        } else {
+            // ReorgYolo can only propagate scaling.
+
+            out[input] = 1.0f;
+            out[output] = 1.0f;
+        }
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto inOrder = input->desc().dimsOrder();
+
+        DataMap<DimsOrder> out;
+
+        out[output] = inOrder;
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto inOrder = input->desc().dimsOrder();
+
+        DataMap<StridesRequirement> out;
+
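+        // Compact strides are only required when channels are the innermost
+        // dimension (interleaved layout).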
+        if (inOrder.dimInd(Dim::C) == 0) {
+            out[input] = StridesRequirement::compact();
+            out[output] = StridesRequirement::compact();
+        }
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto stride = attrs().get<int>("stride");
+
+        serializer.append(static_cast<int32_t>(stride));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeOldBuffer(handle_from_this(), serializer);
+        output->serializeOldBuffer(handle_from_this(), serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseReorgYolo(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto stage = model->addNewStage<ReorgYoloStage>(
+        layer->name,
+        StageType::ReorgYolo,
+        layer,
+        inputs,
+        outputs);
+
+    stage->attrs().set<int>("stride", layer->GetParamAsInt("stride", 2));
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/resample.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/resample.cpp
new file mode 100644
index 0000000..7cf682a
--- /dev/null
@@ -0,0 +1,139 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <unordered_set>
+#include <memory>
+#include <set>
+#include <string>
+
+namespace vpu {
+
+VPU_DECLARE_ENUM(ResampleType,
+    Nearest  = 0,
+    Bilinear = 1
+)
+
+namespace {
+
+class ResampleStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ResampleStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        // TODO: check if the stage can propagate scale factors
+
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        out[input] = 1.0f;
+        out[output] = 1.0f;
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[output] = input->desc().dimsOrder();
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        return DataMap<StridesRequirement>();
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<BatchSupport> out;
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+
+        return out;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto antialias = attrs().get<bool>("antialias");
+        auto factor = attrs().get<float>("factor");
+        auto sampleType = attrs().get<ResampleType>("type");
+
+        serializer.append(static_cast<int32_t>(antialias));
+        serializer.append(static_cast<float>(factor));
+        serializer.append(static_cast<uint32_t>(sampleType));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeOldBuffer(handle_from_this(), serializer);
+        output->serializeOldBuffer(handle_from_this(), serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseResample(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    ie::details::CaselessEq<std::string> cmp;
+
+    auto stage = model->addNewStage<ResampleStage>(
+        layer->name,
+        StageType::Resample,
+        layer,
+        inputs,
+        outputs);
+
+    stage->attrs().set<bool>("antialias", layer->GetParamAsInt("antialias", 0));
+    stage->attrs().set<float>("factor", layer->GetParamAsInt("factor", -1.0f));
+
+    auto method = layer->GetParamAsString("type", "caffe.ResampleParameter.NEAREST");
+    if (cmp(method, "caffe.ResampleParameter.NEAREST")) {
+        stage->attrs().set<ResampleType>("type", ResampleType::Nearest);
+    } else {
+        stage->attrs().set<ResampleType>("type", ResampleType::Bilinear);
+    }
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/reshape.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/reshape.cpp
new file mode 100644
index 0000000..17ee81f
--- /dev/null
@@ -0,0 +1,130 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <unordered_set>
+#include <memory>
+#include <set>
+#include <string>
+
+namespace vpu {
+
+namespace {
+
+class ReshapeStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ReshapeStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        if (step == ScalePropagationStep::Propagate) {
+            out[output] = inputScales.at(input);
+        } else {
+            // Reshape can only propagate scaling.
+            out[input] = 1.0f;
+            out[output] = 1.0f;
+        }
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        // Only default order is supported
+        out[input] = DimsOrder::fromNumDims(input->desc().numDims());
+        out[output] = DimsOrder::fromNumDims(output->desc().numDims());
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<StridesRequirement> out;
+
+        out[input] = StridesRequirement::compact();
+        out[output] = StridesRequirement::compact();
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    void finalCheckImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        IE_ASSERT(input->desc().totalDimSize() == output->desc().totalDimSize());
+    }
+
+    void serializeParamsImpl(BlobSerializer&) const override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+
+    void serializeDataImpl(BlobSerializer&) const override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseReshape(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+    _stageBuilder->addReshapeStage(model, layer->name, layer, inputs[0], outputs[0]);
+}
+
+Stage StageBuilder::addReshapeStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        const Data& input,
+        const Data& output) {
+    IE_ASSERT(input->desc().totalDimSize() == output->desc().totalDimSize());
+
+    return model->addNewStage<ReshapeStage>(
+        name,
+        StageType::Reshape,
+        layer,
+        {input},
+        {output});
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/rnn.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/rnn.cpp
new file mode 100644
index 0000000..8502e22
--- /dev/null
@@ -0,0 +1,237 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <string>
+#include <memory>
+#include <set>
+
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+namespace {
+
+class LSTMCellStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<LSTMCellStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        IE_ASSERT(_inputEdges.size() == 5);
+        IE_ASSERT(_outputEdges.size() == 2);
+
+        DataMap<float> out;
+
+        for (const auto& inEdge : _inputEdges) {
+            out[inEdge->input()] = 1.0f;
+        }
+        for (const auto& outEdge : _outputEdges) {
+            out[outEdge->output()] = 1.0f;
+        }
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 5);
+        IE_ASSERT(_outputEdges.size() == 2);
+
+        auto output = _outputEdges[0]->output();
+        auto input = _inputEdges[0]->input();
+
+        DimsOrder inputDimsOrder = input->desc().dimsOrder();
+        DimsOrder outputDimsOrder = output->desc().dimsOrder();
+
+        if (inputDimsOrder.numDims() >= 3) inputDimsOrder.moveDim(Dim::C, 2);  // ->...CHW
+        if (outputDimsOrder.numDims() >= 3) outputDimsOrder.moveDim(Dim::C, 2);
+
+        DataMap<DimsOrder> out;
+        out[input] = inputDimsOrder;
+        out[output] = outputDimsOrder;
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 5);
+        IE_ASSERT(_outputEdges.size() == 2);
+
+        DataMap<StridesRequirement> out;
+
+        for (const auto& inEdge : _inputEdges) {
+            out[inEdge->input()] = StridesRequirement::compact();
+        }
+        for (const auto& outEdge : _outputEdges) {
+            out[outEdge->output()] = StridesRequirement::compact();
+        }
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto RNNForward = attrs().get<bool>("RNNForward");
+        auto nCells = attrs().get<int>("nCells");
+        auto nBatches = attrs().get<int>("nBatches");
+        serializer.append(static_cast<int>(RNNForward));
+        serializer.append(static_cast<int>(nCells));
+        serializer.append(static_cast<int>(nBatches));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 5);
+        IE_ASSERT(_outputEdges.size() == 2);
+
+        int nCells = attrs().get<int>("nCells");
+
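+        // parseRNN allocates a temp buffer only for multi-cell (sequence)
+        // execution, so its presence must be consistent with nCells.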
+        bool useTempBuffer = (nCells > 1);
+        IE_ASSERT((_tempBufferEdges.size() == 1 && useTempBuffer) || !useTempBuffer);
+
+        for (const auto& inEdge : _inputEdges) {
+            inEdge->input()->serializeNewBuffer(serializer);
+        }
+        for (const auto& outEdge : _outputEdges) {
+            outEdge->output()->serializeNewBuffer(serializer);
+        }
+
+        if (useTempBuffer)
+            _tempBufferEdges[0]->tempBuffer()->serializeNewBuffer(serializer);
+    }
+};
+
+}  // namespace
+
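+// Repacks the IE weights blob, stored gate-by-gate as rows of
+// (input_size + state_size) values, into two contiguous matrices:
+// dst0 receives the input weights, dst1 the recurrent (state) weights.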
+static void RNNRelayout(
+                 const fp16_t* src,
+                 fp16_t* dst0,
+                 fp16_t* dst1,
+
+                 const int ngates,
+                 const int state_size,
+                 const int input_size
+                ) {
+    int counter = 0;
+    for (int j = 0; j < ngates * state_size; j++) {
+        for (int i = 0; i < input_size; i++) {
+            dst0[(input_size) * j + i] = src[counter++];
+        }
+        for (int i = 0; i < state_size; i++) {
+            dst1[(state_size) * j + i] = src[counter++];
+        }
+    }
+}
+
+void FrontEnd::parseRNN(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector &inputs,
+        const DataVector &outputs) {
+    IE_ASSERT(inputs.size() == 3);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto layer = std::dynamic_pointer_cast<ie::RNNSequenceLayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+
+    const int ngates = 4;
+
+    Data weights, biases;
+    std::tie(weights, biases) = getWeightsAndBiases(model, layer);
+
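+    // inputs[0] is interpreted as [nBatches (Dim::C) x nCells (Dim::H) x input_size (Dim::W)].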
+    size_t nCells = inputs[0]->desc().dim(Dim::H);
+    size_t nBatches = inputs[0]->desc().dim(Dim::C);
+    IE_ASSERT(nCells >= 1);
+    IE_ASSERT(nBatches >= 1);
+
+    size_t input_size = inputs[0]->desc().dim(Dim::W);
+    IE_ASSERT(input_size == inputs[0]->desc().totalDimSize() / nCells / nBatches);
+
+    size_t state_size = inputs[1]->desc().totalDimSize() / nBatches;
+    size_t cell_state_size = inputs[2]->desc().totalDimSize() / nBatches;
+    IE_ASSERT(state_size == cell_state_size);
+
+    size_t weightsSize = weights->desc().totalDimSize();
+    IE_ASSERT(state_size * (input_size + state_size) * ngates == weightsSize);
+
+    size_t biasesSize = biases->desc().totalDimSize();
+    IE_ASSERT(state_size * ngates == biasesSize);
+
+    // Repack the weights (see RNNRelayout above).
+    auto newWeightsBlob = ie::make_shared_blob<fp16_t>(ie::Precision::FP16, ie::Layout::C, {weightsSize});
+    newWeightsBlob->allocate();
+    auto newWeightsPtr = newWeightsBlob->buffer().as<fp16_t*>();
+    auto content = weights->content();
+    IE_ASSERT(content != nullptr);
+    auto origWeights = content->get<fp16_t>();
+    IE_ASSERT(origWeights != nullptr);
+    RNNRelayout(origWeights,
+                newWeightsPtr,
+                newWeightsPtr + ngates * state_size * input_size,
+
+                ngates,
+                state_size,
+                input_size);
+
+    auto newWeights = model->addConstData(
+        _layer->name + "@weights",
+        weights->desc(),
+        ieBlobContent(newWeightsBlob));
+
+    auto stateCellFinal = model->addFakeData();
+    auto stage = model->addNewStage<LSTMCellStage>(
+        layer->name,
+        StageType::LSTMCell,
+        layer,
+        {inputs[0], inputs[1], inputs[2], newWeights, biases},
+        {outputs[0], stateCellFinal});
+
+    if (nCells > 1)
+        model->addTempBuffer(stage, DataDesc({state_size}));
+
+    bool RNNForward = layer->direction == ie::RNNSequenceLayer::FWD;
+    stage->attrs().set<bool>("RNNForward", RNNForward);
+    stage->attrs().set<int>("nCells", nCells);
+    stage->attrs().set<int>("nBatches", nBatches);
+}
+
+void FrontEnd::parseLSTMCell(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector &inputs,
+        const DataVector &outputs) {
+    IE_ASSERT(inputs.size() == 3);
+    IE_ASSERT(outputs.size() == 2);
+
+    auto layer = std::dynamic_pointer_cast<ie::LSTMCell>(_layer);
+    IE_ASSERT(layer != nullptr);
+
+    Data weights, biases;
+    std::tie(weights, biases) = getWeightsAndBiases(model, layer);
+
+    auto stage = model->addNewStage<LSTMCellStage>(
+            layer->name,
+            StageType::LSTMCell,
+            layer,
+            {inputs[0], inputs[1], inputs[2], weights, biases},
+            outputs);
+    stage->attrs().set<bool>("RNNForward", true);
+    stage->attrs().set<int>("nCells", 1);
+    stage->attrs().set<int>("nBatches", 1);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/roipooling.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/roipooling.cpp
new file mode 100644
index 0000000..68a22cb
--- /dev/null
@@ -0,0 +1,150 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <cstdio>
+
+#include <vector>
+#include <string>
+#include <unordered_set>
+#include <memory>
+#include <set>
+
+namespace vpu {
+
+VPU_DECLARE_ENUM(ROIPoolingMethod,
+    Max = 0,
+    Bilinear = 1
+)
+
+namespace {
+
+class ROIPoolingStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ROIPoolingStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        out[input0] = 1.0f;
+        out[input1] = 1.0f;
+        out[output] = 1.0f;
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[input0] = input0->desc().dimsOrder().createMovedDim(Dim::C, 2);
+        out[output] = output->desc().dimsOrder().createMovedDim(Dim::C, 2);
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<StridesRequirement> out;
+
+        out[input0] = StridesRequirement::compact();
+        out[input1] = StridesRequirement::compact();
+        out[output] = StridesRequirement::compact();
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        auto pooled_w = attrs().get<int>("pooled_w");
+        auto pooled_h = attrs().get<int>("pooled_h");
+        auto spatial_scale = attrs().get<float>("spatial_scale");
+        auto method = attrs().get<ROIPoolingMethod>("method");
+
+        serializer.append(static_cast<uint32_t>(pooled_w));
+        serializer.append(static_cast<uint32_t>(pooled_h));
+        serializer.append(static_cast<float>(spatial_scale));
+        serializer.append(static_cast<uint32_t>(method));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 2);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input0 = _inputEdges[0]->input();
+        auto input1 = _inputEdges[1]->input();
+        auto output = _outputEdges[0]->output();
+
+        input0->serializeNewBuffer(serializer);
+        output->serializeNewBuffer(serializer);
+        input1->serializeNewBuffer(serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseROIPooling(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    ie::details::CaselessEq<std::string> cmp;
+
+    IE_ASSERT(inputs.size() == 2);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto stage = model->addNewStage<ROIPoolingStage>(
+        layer->name,
+        StageType::ROIPooling,
+        layer,
+        inputs,
+        outputs);
+
+    stage->attrs().set<int>("pooled_w", layer->GetParamAsInt("pooled_w", 7));
+    stage->attrs().set<int>("pooled_h", layer->GetParamAsInt("pooled_h", 7));
+    stage->attrs().set<float>("spatial_scale", layer->GetParamAsFloat("spatial_scale", 0.0625f));
+
+    auto method = layer->GetParamAsString("method", "max");
+    if (cmp(method, "bilinear")) {
+        stage->attrs().set("method", ROIPoolingMethod::Bilinear);
+    } else {
+        stage->attrs().set("method", ROIPoolingMethod::Max);
+    }
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/scale.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/scale.cpp
new file mode 100644
index 0000000..14f888c
--- /dev/null
@@ -0,0 +1,108 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+#include <set>
+#include <string>
+
+#include <vpu/sw/post_op_stage.hpp>
+
+namespace vpu {
+
+namespace {
+
+class ScaleStage final : public PostOpStage {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ScaleStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override {
+        IE_ASSERT(_inputEdges.size() == 2 || _inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto scales = _inputEdges[1]->input();
+        auto biases = _inputEdges.size() == 3 ? _inputEdges[2]->input() : nullptr;
+        auto output = _outputEdges[0]->output();
+
+        auto inputScale = inputScales.at(input);
+
+        DataMap<float> out;
+
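+        // On a plain Propagate step the input scale flows through to the
+        // output unchanged; otherwise the scales input absorbs it. Biases
+        // always follow the input scale to stay consistent.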
+        out[scales] = step == ScalePropagationStep::Propagate ? 1.0f : inputScale;
+        if (biases != nullptr) {
+            out[biases] = inputScale;
+        }
+        out[output] = inputScale;
+
+        return out;
+    }
+
+    void serializeParamsImpl(BlobSerializer&) const override {
+    }
+};
+
+}  // namespace
+
+Stage StageBuilder::addScaleStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        const Data& input,
+        const Data& scales,
+        const Data& output) {
+    return model->addNewStage<ScaleStage>(
+        name,
+        StageType::Scale,
+        layer,
+        {input, scales},
+        {output});
+}
+
+void FrontEnd::parseScale(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto layer = std::dynamic_pointer_cast<ie::ScaleShiftLayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+
+    if (layer->_broadcast != 0) {
+        VPU_THROW_EXCEPTION <<
+            "Layer " << layer->name << " doesn't support broadcast param";
+    }
+
+    auto input = inputs[0];
+    auto output = outputs[0];
+
+    Data scales, biases;
+    std::tie(scales, biases) = getWeightsAndBiases(model, layer);
+
+    if (biases->usage() == DataUsage::Fake) {
+        model->addNewStage<ScaleStage>(
+            layer->name,
+            StageType::Scale,
+            layer,
+            {input, scales},
+            {output});
+    } else {
+        model->addNewStage<ScaleStage>(
+            layer->name,
+            StageType::ScaleShift,
+            layer,
+            {input, scales, biases},
+            {output});
+    }
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/shrink.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/shrink.cpp
new file mode 100644
index 0000000..61582fc
--- /dev/null
@@ -0,0 +1,157 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <memory>
+#include <string>
+#include <vector>
+#include <set>
+#include <unordered_set>
+#include <algorithm>
+
+#include <vpu/utils/extra.hpp>
+
+namespace vpu {
+
+namespace {
+
+class ShrinkStage final : public StageNode {
+protected:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ShrinkStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[output] = input->desc().dimsOrder();
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto dimsOrder = input->desc().dimsOrder();
+
+        //
+        // Get smallest Dim over which Shrink is done.
+        //
+
+        auto minShrinkDimInd = dimsOrder.numDims();
+
+        for (const auto& p : input->desc().dims()) {
+            if (output->desc().dim(p.first) != p.second) {
+                minShrinkDimInd = std::min(minShrinkDimInd, dimsOrder.dimInd(p.first));
+            }
+        }
+
+        //
+        // Initial StridesRequirement for inputs and output.
+        //
+
+        auto inputReqs = input->requiredStrides();
+
+        auto outputReqs = inputReqs;
+
+        //
+        // Merge output consumers StridesRequirement.
+        //
+
+        for (const auto& consumer : output->consumers()) {
+            auto consumerInfo = consumer->getDataStridesRequirements();
+
+            auto consumerStrideIt = consumerInfo.find(output);
+            if (consumerStrideIt != consumerInfo.end()) {
+                auto consumerReqs = consumerStrideIt->second;
+
+                for (int i = 0; i < dimsOrder.numDims(); ++i) {
+                    if (inputReqs.get(i) == DimStride::Any) {
+                        if (consumerReqs.get(i) != DimStride::Any) {
+                            inputReqs.add(i, consumerReqs.get(i));
+                            outputReqs.add(i, consumerReqs.get(i));
+                        }
+                    }
+                }
+            }
+        }
+
+        //
+        // Remove extra output StridesRequirement.
+        //
+
+        for (int i = minShrinkDimInd + 1; i < dimsOrder.numDims(); ++i) {
+            outputReqs.remove(i);
+        }
+
+        //
+        // Return merged StridesRequirements.
+        //
+
+        DataMap<StridesRequirement> out;
+
+        out[input] = inputReqs;
+        out[output] = outputReqs;
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer&) const override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+
+    void serializeDataImpl(BlobSerializer&) const override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+};
+
+}  // namespace
+
+Stage StageBuilder::addShrinkStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        const Data& input,
+        const Data& output,
+        const DimValues& offset) {
+    auto stage = model->addNewStage<ShrinkStage>(
+        name,
+        StageType::Shrink,
+        layer,
+        {input},
+        {output});
+
+    stage->attrs().set<DimValues>("offset", offset);
+
+    return stage;
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/sigmoid.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/sigmoid.cpp
new file mode 100644
index 0000000..b663a43
--- /dev/null
@@ -0,0 +1,45 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+#include <set>
+
+#include <vpu/sw/post_op_stage.hpp>
+
+namespace vpu {
+
+namespace {
+
+class SigmoidStage final : public PostOpStage {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<SigmoidStage>(*this);
+    }
+
+    void serializeParamsImpl(BlobSerializer&) const override {
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseSigmoid(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    model->addNewStage<SigmoidStage>(
+        layer->name,
+        StageType::Sigmoid,
+        layer,
+        inputs,
+        outputs);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/softmax.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/softmax.cpp
new file mode 100644
index 0000000..461bdd3
--- /dev/null
@@ -0,0 +1,138 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <unordered_set>
+#include <memory>
+#include <set>
+#include <string>
+
+namespace vpu {
+
+namespace {
+
+class SoftMaxStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<SoftMaxStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>&,
+            ScalePropagationStep) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        out[input] = 1.0f;
+        out[output] = 1.0f;
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<DimsOrder> out;
+
+        out[output] = input->desc().dimsOrder();
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        return DataMap<StridesRequirement>();
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+
+        auto axis = attrs().get<Dim>("axis");
+        auto axisInd = input->desc().dimsOrder().dimInd(axis);
+
+        serializer.append(static_cast<int32_t>(axisInd));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeNewBuffer(serializer);
+        output->serializeNewBuffer(serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseSoftMax(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto input = inputs[0];
+    auto output = outputs[0];
+
+    auto layer = std::dynamic_pointer_cast<ie::SoftMaxLayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+
+    auto layerInput = layer->insData[0].lock();
+    IE_ASSERT(layerInput != nullptr);
+
+    IE_ASSERT(layer->axis < input->desc().numDims());
+
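+    // The IE axis counts from the major (outermost) dimension, while the
+    // permutation lists dims from minor to major, so the index is mirrored.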
+    auto perm = DimsOrder::fromNumDims(input->desc().numDims()).toPermutation();
+    auto axis = perm[input->desc().numDims() - 1 - layer->axis];
+
+    _stageBuilder->addSoftMaxStage(model, layer->name, layer, input, output, axis);
+}
+
+Stage StageBuilder::addSoftMaxStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        const Data& input,
+        const Data& output,
+        Dim axis) {
+    auto stage = model->addNewStage<SoftMaxStage>(
+        name,
+        StageType::SoftMax,
+        layer,
+        {input},
+        {output});
+
+    stage->attrs().set<Dim>("axis", axis);
+
+    return stage;
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/split.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/split.cpp
new file mode 100644
index 0000000..0c5e701
--- /dev/null
@@ -0,0 +1,301 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <algorithm>
+
+namespace vpu {
+
+namespace {
+
+class SplitStage final : public StageNode {
+protected:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<SplitStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(!_outputEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+
+        DataMap<float> out;
+
+        if (step == ScalePropagationStep::Propagate) {
+            auto inputScale = inputScales.at(input);
+
+            for (const auto& outEdge : _outputEdges) {
+                out[outEdge->output()] = inputScale;
+            }
+        } else {
+            // Split can only propagate scaling.
+            out[input] = 1.0f;
+
+            for (const auto& outEdge : _outputEdges) {
+                out[outEdge->output()] = 1.0f;
+            }
+        }
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(!_outputEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+
+        DataMap<DimsOrder> out;
+
+        for (const auto& outEdge : _outputEdges) {
+            out[outEdge->output()] = input->desc().dimsOrder();
+        }
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(!_outputEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+
+        auto dimsOrder = input->desc().dimsOrder();
+
+        //
+        // Get smallest Dim over which Split is done.
+        //
+
+        auto minSplitDimInd = dimsOrder.numDims();
+
+        for (const auto& outEdge : _outputEdges) {
+            auto output = outEdge->output();
+
+            for (const auto& p : input->desc().dims()) {
+                if (output->desc().dim(p.first) != p.second) {
+                    minSplitDimInd = std::min(minSplitDimInd, dimsOrder.dimInd(p.first));
+                }
+            }
+        }
+
+        //
+        // Initial StridesRequirement for inputs and output.
+        //
+
+        auto inputReqs = input->requiredStrides();
+
+        auto outputReqs = inputReqs;
+
+        //
+        // Merge output consumers StridesRequirement.
+        //
+
+        for (const auto& outEdge : _outputEdges) {
+            auto curOutput = outEdge->output();
+
+            for (const auto& consumer : curOutput->consumers()) {
+                auto consumerInfo = consumer->getDataStridesRequirements();
+
+                auto consumerStrideIt = consumerInfo.find(curOutput);
+                if (consumerStrideIt != consumerInfo.end()) {
+                    auto consumerReqs = consumerStrideIt->second;
+
+                    for (int i = 0; i < dimsOrder.numDims(); ++i) {
+                        if (inputReqs.get(i) == DimStride::Any) {
+                            if (consumerReqs.get(i) != DimStride::Any) {
+                                inputReqs.add(i, consumerReqs.get(i));
+                                outputReqs.add(i, consumerReqs.get(i));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        //
+        // Remove extra output StridesRequirement.
+        //
+
+        for (int i = minSplitDimInd + 1; i < dimsOrder.numDims(); ++i) {
+            outputReqs.remove(i);
+        }
+
+        //
+        // Return merged StridesRequirements.
+        //
+
+        DataMap<StridesRequirement> out;
+
+        out[input] = inputReqs;
+        for (const auto& outEdge : _outputEdges) {
+            auto output = outEdge->output();
+            out[output] = outputReqs;
+        }
+
+        return out;
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer&) const override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+
+    void serializeDataImpl(BlobSerializer&) const override {
+        VPU_THROW_EXCEPTION << "Must never be called";
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseSplit(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(!outputs.empty());
+
+    auto layer = std::dynamic_pointer_cast<ie::SplitLayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+
+    auto input = inputs[0];
+
+    auto inDesc = input->desc();
+    auto perm = inDesc.dimsOrder().toPermutation();
+
+    // Check whether this is a Split (copy) or a Slice Caffe layer;
+    // the IE layer type value is not trusted here.
+    bool isSplit = true;
+
+    for (const auto& output : outputs) {
+        for (int i = 0; i < perm.size(); ++i) {
+            if (inDesc.dim(perm[i]) != output->desc().dim(perm[i])) {
+                isSplit = false;
+                break;
+            }
+        }
+
+        if (!isSplit)
+            break;
+    }
+
+    if (isSplit) {
+        // Split is just a re-use of the input Data.
+
+        for (int i = 0; i < outputs.size(); ++i) {
+            auto output = outputs[i];
+
+            IE_ASSERT(output->numConsumers() == 0);
+
+            if (output->usage() == DataUsage::Output) {
+                _stageBuilder->addCopyStage(
+                    model,
+                    formatString("%s@copy=%d/%d", layer->name, i + 1, outputs.size()),
+                    layer,
+                    input,
+                    output);
+            } else {
+                IE_ASSERT(output->usage() == DataUsage::Intermediate);
+
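+                // Intermediate outputs simply alias the input buffer;
+                // no copy is needed.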
+                bindData(input, output->origData());
+            }
+        }
+    } else {
+        // Calculate target axis for slicing.
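+        // The split axis is the dimension whose per-output sizes sum to the
+        // input size; all other dimensions must match exactly (checked below).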
+
+        DimValues sumDims;
+        for (int i = 0; i < perm.size(); ++i) {
+            sumDims.set(perm[i], 0);
+        }
+        for (const auto& output : outputs) {
+            for (auto& p : sumDims) {
+                p.second += output->desc().dim(p.first);
+            }
+        }
+
+        auto axis = Dim::Invalid;
+        for (const auto& p : sumDims) {
+            if (inDesc.dim(p.first) == p.second) {
+                axis = p.first;
+                break;
+            }
+        }
+
+        IE_ASSERT(axis != Dim::Invalid);
+
+        for (int i = 0; i < perm.size(); ++i) {
+            if (perm[i] == axis) {
+                continue;
+            }
+
+            for (const auto& output : outputs) {
+                IE_ASSERT(inDesc.dim(perm[i]) == output->desc().dim(perm[i]));
+            }
+        }
+
+        _stageBuilder->addSplitStage(model, layer->name, layer, axis, input, outputs);
+    }
+}
+
+Stage StageBuilder::addSplitStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        Dim axis,
+        const Data& input,
+        const DataVector& outputs) {
+    std::vector<DimValues> offsets;
+
+    DimValues curOffset({{axis, 0}});
+    for (const auto& output : outputs) {
+        offsets.emplace_back(curOffset);
+        curOffset.set(axis, curOffset[axis] + output->desc().dim(axis));
+    }
+
+    auto stage = addSplitStage(model, name, layer, offsets, input, outputs);
+
+    stage->attrs().set("axis", axis);
+
+    return stage;
+}
+
+Stage StageBuilder::addSplitStage(
+        const Model::Ptr& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        const std::vector<DimValues>& offsets,
+        const Data& input,
+        const DataVector& outputs) {
+    IE_ASSERT(offsets.size() == outputs.size());
+
+    auto stage = model->addNewStage<SplitStage>(
+        name,
+        StageType::Split,
+        layer,
+        {input},
+        outputs);
+
+    stage->attrs().set("offsets", offsets);
+
+    return stage;
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/tanh.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/tanh.cpp
new file mode 100644 (file)
index 0000000..ebdf90d
--- /dev/null
@@ -0,0 +1,45 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+#include <set>
+
+#include <vpu/sw/post_op_stage.hpp>
+
+namespace vpu {
+
+namespace {
+
+class TanHStage final : public PostOpStage {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<TanHStage>(*this);
+    }
+
+    void serializeParamsImpl(BlobSerializer&) const override {
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseTanH(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    model->addNewStage<TanHStage>(
+        layer->name,
+        StageType::Tanh,
+        layer,
+        inputs,
+        outputs);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/tile.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/tile.cpp
new file mode 100644 (file)
index 0000000..21dc8eb
--- /dev/null
@@ -0,0 +1,145 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <memory>
+#include <set>
+#include <utility>
+
+namespace vpu {
+
+namespace {
+
+class TileStage final : public StageNode {
+protected:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<TileStage>(*this);
+    }
+
+    DataMap<float> propagateScaleFactorsImpl(
+            const DataMap<float>& inputScales,
+            ScalePropagationStep step) override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        DataMap<float> out;
+
+        if (step == ScalePropagationStep::Propagate) {
+            auto inputScale = inputScales.at(input);
+
+            out[output] = inputScale;
+        } else {
+            // Tile can only propagate scaling, not generate it.
+
+            out[input] = 1.0f;
+            out[output] = 1.0f;
+        }
+
+        return out;
+    }
+
+    DataMap<DimsOrder> propagateDataOrderImpl() const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto inOrder = input->desc().dimsOrder();
+        auto finalOrder = inOrder;
+
+        DataMap<DimsOrder> out;
+
+        out[input] = finalOrder;
+        out[output] = finalOrder;
+
+        return out;
+    }
+
+    DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
+        return DataMap<StridesRequirement>();
+    }
+
+    DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
+        return DataMap<BatchSupport>();
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override {
+        return StageSHAVEsRequirements::OnlyOne;
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        auto axis = attrs().get<Dim>("axis");
+        auto tiles = attrs().get<int>("tiles");
+
+        auto axisInd = output->desc().dimsOrder().dimInd(axis);
+        IE_ASSERT(axisInd >= 0);
+
+        serializer.append(static_cast<int32_t>(axisInd));
+        serializer.append(static_cast<int32_t>(tiles));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+        IE_ASSERT(_tempBufferEdges.empty());
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        input->serializeNewBuffer(serializer);
+        output->serializeNewBuffer(serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseTile(
+        const Model::Ptr& model,
+        const ie::CNNLayerPtr& _layer,
+        const DataVector& inputs,
+        const DataVector& outputs) {
+    IE_ASSERT(inputs.size() == 1);
+    IE_ASSERT(outputs.size() == 1);
+
+    auto layer = std::dynamic_pointer_cast<ie::TileLayer>(_layer);
+    IE_ASSERT(layer != nullptr);
+
+    auto input = inputs[0];
+    auto output = outputs[0];
+
+    IE_ASSERT(layer->axis < input->desc().numDims());
+
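+    // The IE axis is counted from the outermost dimension, while the
+    // permutation lists dimensions from minor to major, hence the index flip.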
+    auto perm = DimsOrder::fromNumDims(input->desc().numDims()).toPermutation();
+    auto axis = perm[input->desc().numDims() - 1 - layer->axis];
+
+    auto stage = model->addNewStage<TileStage>(
+        layer->name,
+        StageType::Tile,
+        layer,
+        {input},
+        {output});
+
+    stage->attrs().set<Dim>("axis", axis);
+    stage->attrs().set<int>("tiles", layer->tiles);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/stub_stage.cpp b/inference-engine/src/vpu/graph_transformer/src/stub_stage.cpp
new file mode 100644 (file)
index 0000000..cab0d24
--- /dev/null
@@ -0,0 +1,114 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/stub_stage.hpp>
+
+#include <memory>
+
+#include <vpu/model/edges.hpp>
+#include <vpu/model/data.hpp>
+
+namespace vpu {
+
+StagePtr StubStage::cloneImpl() const {
+    return std::make_shared<StubStage>(*this);
+}
+
+DataMap<float> StubStage::propagateScaleFactorsImpl(
+        const DataMap<float>& inputScales,
+        ScalePropagationStep step) {
+    DataMap<float> out;
+
+    if (_type == StageType::StubConv ||
+        _type == StageType::StubFullyConnected ||
+        _type == StageType::StubDeconv) {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto weights = _inputEdges[1]->input();
+        auto biases = _inputEdges[2]->input();
+        auto output = _outputEdges[0]->output();
+
+        IE_ASSERT(weights->usage() == DataUsage::Const);
+        IE_ASSERT(biases->usage() == DataUsage::Const || biases->usage() == DataUsage::Fake);
+
+        auto inputScale = inputScales.at(input);
+
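+        // Const biases always carry the output scale; weights are scaled only
+        // when this stage generates the scale factor rather than propagating it.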
+        out[weights] = step == ScalePropagationStep::Propagate ? 1.0f : inputScale;
+        if (biases->usage() == DataUsage::Const) {
+            out[biases] = inputScale;
+        }
+        out[output] = inputScale;
+    } else {
+        IE_ASSERT(_type == StageType::StubMaxPool || _type == StageType::StubAvgPool);
+
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto output = _outputEdges[0]->output();
+
+        out[output] = inputScales.at(input);
+    }
+
+    return out;
+}
+
+DataMap<DimsOrder> StubStage::propagateDataOrderImpl() const {
+    VPU_THROW_EXCEPTION << "Must be replaced with real stage";
+}
+
+DataMap<StridesRequirement> StubStage::getDataStridesRequirementsImpl() const {
+    VPU_THROW_EXCEPTION << "Must be replaced with real stage";
+}
+
+void StubStage::finalizeDataLayoutImpl() {
+    VPU_THROW_EXCEPTION << "Must be replaced with real stage";
+}
+
+DataMap<BatchSupport> StubStage::getBatchSupportInfoImpl() const {
+    DataMap<BatchSupport> out;
+
+    if (_type == StageType::StubConv ||
+        _type == StageType::StubFullyConnected ||
+        _type == StageType::StubDeconv) {
+        IE_ASSERT(_inputEdges.size() == 3);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        auto input = _inputEdges[0]->input();
+        auto weights = _inputEdges[1]->input();
+        auto biases = _inputEdges[2]->input();
+        auto output = _outputEdges[0]->output();
+
+        IE_ASSERT(weights->usage() == DataUsage::Const);
+        IE_ASSERT(biases->usage() == DataUsage::Const || biases->usage() == DataUsage::Fake);
+
+        out[input] = BatchSupport::Split;
+        out[output] = BatchSupport::Split;
+    } else {
+        IE_ASSERT(_type == StageType::StubMaxPool || _type == StageType::StubAvgPool);
+
+        IE_ASSERT(_inputEdges.size() == 1);
+        IE_ASSERT(_outputEdges.size() == 1);
+
+        // Pooling will support batch by merging it with previous dimension.
+    }
+
+    return out;
+}
+
+void StubStage::finalCheckImpl() const {
+    VPU_THROW_EXCEPTION << "Must never be called";
+}
+
+void StubStage::serializeParamsImpl(BlobSerializer&) const {
+    VPU_THROW_EXCEPTION << "Must never be called";
+}
+
+void StubStage::serializeDataImpl(BlobSerializer&) const {
+    VPU_THROW_EXCEPTION << "Must never be called";
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/sw/post_op_stage.cpp b/inference-engine/src/vpu/graph_transformer/src/sw/post_op_stage.cpp
new file mode 100644 (file)
index 0000000..8bafaa0
--- /dev/null
@@ -0,0 +1,196 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/sw/post_op_stage.hpp>
+
+#include <memory>
+
+#include <vpu/model/edges.hpp>
+#include <vpu/model/data.hpp>
+
+namespace vpu {
+
+DataMap<float> PostOpStage::propagateScaleFactorsImpl(
+        const DataMap<float>&,
+        ScalePropagationStep) {
+    IE_ASSERT(!_inputEdges.empty());
+    IE_ASSERT(_outputEdges.size() == 1);
+
+    auto output = _outputEdges[0]->output();
+
+    DataMap<float> out;
+
+    // By default, assume no scale propagation.
+    for (const auto& inEdge : _inputEdges) {
+        out[inEdge->input()] = 1.0f;
+    }
+    out[output] = 1.0f;
+
+    return out;
+}
+
+DataMap<DimsOrder> PostOpStage::propagateDataOrderImpl() const {
+    IE_ASSERT(!_inputEdges.empty());
+    IE_ASSERT(_outputEdges.size() == 1);
+
+    // Non-zero-port inputs are constant (scales/biases).
+    for (const auto& inEdge : _inputEdges) {
+        if (inEdge->portInd() > 0) {
+            IE_ASSERT(inEdge->input()->usage() == DataUsage::Const);
+        }
+    }
+
+    auto input = _inputEdges[0]->input();
+    auto output = _outputEdges[0]->output();
+
+    DataMap<DimsOrder> out;
+
+    auto inDimsOrder = input->desc().dimsOrder();
+
+    // TODO: support HCW on firmware side
+    if (inDimsOrder.dimInd(Dim::C) == 1) {
+        inDimsOrder = inDimsOrder.createMovedDim(Dim::C, 2);  // CHW
+        out[input] = inDimsOrder;
+    }
+
+    out[output] = inDimsOrder;
+
+    return out;
+}
+
+DataMap<StridesRequirement> PostOpStage::getDataStridesRequirementsImpl() const {
+    IE_ASSERT(!_inputEdges.empty());
+    IE_ASSERT(_outputEdges.size() == 1);
+
+    // Non-zero-port inputs are constant (scales/biases).
+    for (const auto& inEdge : _inputEdges) {
+        if (inEdge->portInd() > 0) {
+            IE_ASSERT(inEdge->input()->usage() == DataUsage::Const);
+        }
+    }
+
+    auto input = _inputEdges[0]->input();
+    auto output = _outputEdges[0]->output();
+
+    DataMap<StridesRequirement> out;
+
+    StridesRequirement reqs;
+
+    // Current PostOp implementation requires Compact major stride.
+    reqs.add(2, DimStride::Compact);
+
+    if (input->desc().dim(Dim::N, 1) > 1) {
+        // To merge batch into previous dimension.
+        reqs.add(input->desc().dimsOrder().dimInd(Dim::N), DimStride::Compact);
+    }
+
+    out[input] = reqs;
+    out[output] = reqs;
+
+    return out;
+}
+
+void PostOpStage::finalizeDataLayoutImpl() {
+}
+
+DataMap<BatchSupport> PostOpStage::getBatchSupportInfoImpl() const {
+    IE_ASSERT(!_inputEdges.empty());
+    IE_ASSERT(_outputEdges.size() == 1);
+
+    // Non-zero-port inputs are constant (scales/biases).
+    for (const auto& inEdge : _inputEdges) {
+        if (inEdge->portInd() > 0) {
+            IE_ASSERT(inEdge->input()->usage() == DataUsage::Const);
+        }
+    }
+
+    auto mainDesc = _inputEdges[0]->input()->desc();
+
+    DataMap<BatchSupport> out;
+
+    // PostOp will support batch by merging it with previous dimension.
+    for (const auto& inEdge : _inputEdges) {
+        auto input = inEdge->input();
+
+        if (inEdge->portInd() == 0)
+            continue;
+
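+        // Constant inputs that hold one value per channel are marked for
+        // replication across the merged batch.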
+        if (input->desc().dimsOrder().dimInd(Dim::C) == input->desc().numDims() - 2) {
+            IE_ASSERT(input->desc().totalDimSize() == input->desc().dim(Dim::C));
+            out[input] = BatchSupport::ReplicateConstContent;
+        }
+    }
+
+    return out;
+}
+
+StageSHAVEsRequirements PostOpStage::getSHAVEsRequirementsImpl() const {
+    // TODO: more SHAVEs lead to a hang on the public MTCNN network with U8 input
+    return StageSHAVEsRequirements::TwoOrOne;
+}
+
+void PostOpStage::finalCheckImpl() const {
+}
+
+void PostOpStage::serializeDataImpl(BlobSerializer& serializer) const {
+    IE_ASSERT(!_inputEdges.empty());
+    IE_ASSERT(_outputEdges.size() == 1);
+    IE_ASSERT(_tempBufferEdges.empty());
+
+    auto input = _inputEdges[0]->input();
+    auto output = _outputEdges[0]->output();
+
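+    // 2D (NC) tensors are serialized as HWC with N remapped to W and a
+    // trivial H dimension.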
+    if (input->desc().dimsOrder() == DimsOrder::NC) {
+        input->serializeOldBuffer(
+            handle_from_this(),
+            serializer,
+            DimsOrder::HWC,
+            {
+                {Dim::W, {Dim::N}},
+                {Dim::C, {Dim::C}}
+            });
+
+        output->serializeOldBuffer(
+            handle_from_this(),
+            serializer,
+            DimsOrder::HWC,
+            {
+                {Dim::W, {Dim::N}},
+                {Dim::C, {Dim::C}}
+            });
+    } else if (input->desc().dim(Dim::N, 1) > 1) {
+        auto perm = input->desc().dimsOrder().toPermutation();
+        IE_ASSERT(perm.size() == 4);
+
+        input->serializeOldBuffer(
+            handle_from_this(),
+            serializer,
+            DimsOrder::HWC,
+            {
+                {Dim::H, {perm[2], perm[3]}},
+                {Dim::W, {perm[1]}},
+                {Dim::C, {perm[0]}}
+            });
+
+        output->serializeOldBuffer(
+            handle_from_this(),
+            serializer,
+            DimsOrder::HWC,
+            {
+                {Dim::H, {perm[2], perm[3]}},
+                {Dim::W, {perm[1]}},
+                {Dim::C, {perm[0]}}
+            });
+    } else {
+        input->serializeOldBuffer(handle_from_this(), serializer);
+
+        output->serializeOldBuffer(handle_from_this(), serializer);
+    }
+
+    for (int i = 1; i < _inputEdges.size(); ++i) {
+        _inputEdges[i]->input()->serializeOldBuffer(handle_from_this(), serializer);
+    }
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/sw/utility.cpp b/inference-engine/src/vpu/graph_transformer/src/sw/utility.cpp
new file mode 100644 (file)
index 0000000..537435c
--- /dev/null
@@ -0,0 +1,61 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/sw/utility.hpp>
+
+#include <memory>
+#include <unordered_set>
+
+#include <vpu/model/model.hpp>
+
+namespace vpu {
+
+//
+// DefaultSwWeightsContent
+//
+
+DefaultSwWeightsContent::DefaultSwWeightsContent(const DataContent::Ptr& origContent) :
+        CalculatedDataContent({origContent}) {
+}
+
+void DefaultSwWeightsContent::fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const {
+    VPU_PROFILE(DefaultSwWeightsContent);
+
+    IE_ASSERT(_desc.type() == DataType::FP16);
+    IE_ASSERT(baseContents.size() == 1);
+
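+    // Repack the FP16 weights from KCHW to HWCK order.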
+    kchw_to_hwck(baseContents[0]->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), _desc);
+}
+
+//
+// getNextStage
+//
+
+Stage getNextStage(
+        const Stage& curStage,
+        const std::unordered_set<StageType, EnumClassHash>& supportedTypes) {
+    IE_ASSERT(curStage->numOutputs() == 1);
+
+    auto output = curStage->output(0);
+
+    IE_ASSERT(output->parentData() == nullptr);
+    IE_ASSERT(output->numChildDatas() == 0);
+
+    if (output->usage() != DataUsage::Intermediate) {
+        return nullptr;
+    }
+
+    if (output->numConsumers() != 1) {
+        return nullptr;
+    }
+
+    auto consumer = output->singleConsumer();
+    if (supportedTypes.count(consumer->type()) != 0) {
+        return consumer;
+    }
+
+    return nullptr;
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/utils/dot_io.cpp b/inference-engine/src/vpu/graph_transformer/src/utils/dot_io.cpp
new file mode 100644 (file)
index 0000000..fbfb977
--- /dev/null
@@ -0,0 +1,112 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/utils/dot_io.hpp>
+
+#include <string>
+#include <iostream>
+#include <algorithm>
+#include <vector>
+
+#include <precision_utils.h>
+
+#include <vpu/utils/any.hpp>
+#include <vpu/utils/attributes_map.hpp>
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+DotLabel::DotLabel(const std::string& caption, DotSerializer& out) : _out(out) {
+    _ostr << "label=\"" << caption << "\\l";
+}
+
+DotLabel::DotLabel(DotLabel& other) : _out(other._out), _parent(&other), _ident(other._ident) {
+    ++_ident;
+    _ostr << "[\\l";
+}
+
+DotLabel::~DotLabel() {
+    if (_parent == nullptr) {
+        _ostr << "\"";
+
+        try {
+            _out.append("%s", _ostr.str());
+        }
+        catch (...) {
+            std::cerr << "ERROR ~DotLabel(): can not append symbols\n";
+        }
+
+    } else {
+        --_ident;
+
+        for (size_t i = 0; i < _ident; ++i)
+            _ostr << "    ";
+
+        _ostr << "]";
+
+        _parent->_ostr << _ostr.str();
+    }
+}
+
+void DotLabel::addIdent() {
+    for (size_t i = 0; i < _ident; ++i)
+        _ostr << "    ";
+}
+
+void printTo(DotLabel& lbl, const Any& any) {
+    any.printImpl(lbl);
+}
+
+void printTo(DotLabel& lbl, const AttributesMap& attrs) {
+    attrs.printImpl(lbl);
+}
+
+void printTo(DotLabel& lbl, const ie::DataPtr& ieData) {
+    IE_ASSERT(ieData != nullptr);
+
+    DotLabel subLbl(lbl);
+    subLbl.appendPair("name", ieData->getName());
+    subLbl.appendPair("precision", ieData->getTensorDesc().getPrecision().name());
+    subLbl.appendPair("dims", ieData->getTensorDesc().getDims());
+    subLbl.appendPair("layout", ieData->getTensorDesc().getLayout());
+}
+
+void printTo(DotLabel& lbl, const ie::Blob::Ptr& ieBlob) {
+    IE_ASSERT(ieBlob != nullptr);
+
+    DotLabel subLbl(lbl);
+    subLbl.appendPair("precision", ieBlob->getTensorDesc().getPrecision().name());
+    subLbl.appendPair("dims", ieBlob->getTensorDesc().getDims());
+    subLbl.appendPair("layout", ieBlob->getTensorDesc().getLayout());
+
+    if (ieBlob->getTensorDesc().getPrecision() == ie::Precision::FP32) {
+        // Show the first few float values, matching the FP16 branch below.
+        auto contentPtr = ieBlob->cbuffer().as<const float*>();
+        auto count = ieBlob->size();
+
+        std::vector<float> temp(
+            contentPtr,
+            contentPtr + std::min<int>(count, 8));
+
+        subLbl.appendPair("content", temp);
+    } else if (ieBlob->getTensorDesc().getPrecision() == ie::Precision::FP16) {
+        auto contentPtr = ieBlob->cbuffer().as<const fp16_t*>();
+        auto count = ieBlob->size();
+
+        std::vector<float> temp(std::min<int>(count, 8));
+        ie::PrecisionUtils::f16tof32Arrays(temp.data(), contentPtr, temp.size());
+
+        subLbl.appendPair("content", temp);
+    }
+}
+
+void printTo(DotLabel& lbl, const ie::CNNLayerPtr& ieLayer) {
+    IE_ASSERT(ieLayer != nullptr);
+
+    DotLabel subLbl(lbl);
+    subLbl.appendPair("name", ieLayer->name);
+    subLbl.appendPair("type", ieLayer->type);
+    subLbl.appendPair("precision", ieLayer->precision.name());
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/utils/enums.cpp b/inference-engine/src/vpu/graph_transformer/src/utils/enums.cpp
new file mode 100644 (file)
index 0000000..1cbc358
--- /dev/null
@@ -0,0 +1,58 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/utils/enums.hpp>
+
+#include <string>
+#include <algorithm>
+#include <unordered_map>
+#include <vector>
+
+#include <vpu/utils/string.hpp>
+
+namespace vpu {
+
+namespace {
+
+void removeCharFromString(std::string& str, char ch) {
+    str.erase(std::remove(str.begin(), str.end(), ch), str.end());
+}
+
+}  // namespace
+
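+// Parses a stringized enum definition into a {value -> name} map.
+// For example, "A, B = 5, C" yields {0: "A", 5: "B", 6: "C"}.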
+std::unordered_map<int32_t, std::string> generateEnumMap(const std::string& strMap) {
+    std::unordered_map<int32_t, std::string> retMap;
+
+    std::string strMapCopy = strMap;
+
+    removeCharFromString(strMapCopy, ' ');
+    removeCharFromString(strMapCopy, '(');
+
+    std::vector<std::string> enumTokens;
+    splitStringList(strMapCopy, enumTokens, ',');
+
+    int32_t inxMap = 0;
+    for (const auto& token : enumTokens) {
+        // Token: [EnumName | EnumName=EnumValue]
+        std::string enumName;
+        if (token.find('=') == std::string::npos) {
+            enumName = token;
+        } else {
+            std::vector<std::string> enumNameValue;
+            splitStringList(token, enumNameValue, '=');
+            IE_ASSERT(enumNameValue.size() == 2);
+
+            enumName = enumNameValue[0];
+            inxMap = std::stoi(enumNameValue[1], nullptr, 0);
+        }
+
+        retMap[inxMap] = enumName;
+
+        ++inxMap;
+    }
+
+    return retMap;
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/utils/file_system.cpp b/inference-engine/src/vpu/graph_transformer/src/utils/file_system.cpp
new file mode 100644 (file)
index 0000000..e50b4fb
--- /dev/null
@@ -0,0 +1,18 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/utils/file_system.hpp>
+
+#include <string>
+
+namespace vpu {
+
+std::string fileNameNoExt(const std::string& filePath) {
+    auto pos = filePath.rfind('.');
+    if (pos == std::string::npos)
+        return filePath;
+    return filePath.substr(0, pos);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/utils/ie_helpers.cpp b/inference-engine/src/vpu/graph_transformer/src/utils/ie_helpers.cpp
new file mode 100644 (file)
index 0000000..5ee43c3
--- /dev/null
@@ -0,0 +1,86 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/utils/ie_helpers.hpp>
+
+#include <precision_utils.h>
+#include <details/ie_exception.hpp>
+#include <blob_transform.hpp>
+#include <blob_factory.hpp>
+
+#include <vpu/utils/extra.hpp>
+#include <vpu/utils/numeric.hpp>
+
+namespace vpu {
+
+ie::Blob::Ptr getBlobFP16(const ie::Blob::Ptr& in) {
+    VPU_PROFILE(getBlobFP16);
+
+    auto inDesc = in->getTensorDesc();
+
+    auto precision = inDesc.getPrecision();
+
+    if (precision == ie::Precision::FP16)
+        return in;
+
+    if (precision != ie::Precision::FP32) {
+        VPU_THROW_EXCEPTION << "Unsupported precision " << precision.name();
+    }
+
+    // TODO: TensorDesc doesn't update internal BlockingDesc and strides when setLayout is called
+    ie::TensorDesc outDesc(inDesc.getPrecision(), inDesc.getDims(), inDesc.getLayout());
+    auto out = make_blob_with_precision(outDesc);
+    out->allocate();
+
+    ie::PrecisionUtils::f32tof16Arrays(out->buffer().as<fp16_t*>(), in->cbuffer().as<float*>(), in->size());
+
+    return out;
+}
+
+ie::Blob::Ptr copyBlob(const ie::Blob::Ptr& in) {
+    return copyBlob(in, in->getTensorDesc().getLayout());
+}
+
+ie::Blob::Ptr copyBlob(const ie::Blob::Ptr& in, ie::Layout outLayout) {
+    auto inDesc = in->getTensorDesc();
+
+    // TODO: TensorDesc doesn't update internal BlockingDesc and strides when setLayout is called
+    ie::TensorDesc outDesc(inDesc.getPrecision(), inDesc.getDims(), outLayout);
+    auto out = make_blob_with_precision(outDesc);
+    out->allocate();
+
+    copyBlob(in, out);
+
+    return out;
+}
+
+void copyBlob(const ie::Blob::Ptr& in, const ie::Blob::Ptr& out) {
+    auto inLayout = in->getTensorDesc().getLayout();
+    auto outLayout = out->getTensorDesc().getLayout();
+
+    if (inLayout != outLayout) {
+        IE_ASSERT(inLayout == ie::Layout::NCHW || inLayout == ie::Layout::NHWC);
+        IE_ASSERT(outLayout == ie::Layout::NCHW || outLayout == ie::Layout::NHWC);
+
+        const auto& dims = out->getTensorDesc().getDims();
+
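+        // If either the leading (N, C) or the trailing (H, W) pair is all
+        // ones, NCHW and NHWC describe the same flat memory layout and the
+        // plain copy below suffices; otherwise do a real layout conversion.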
+        if ((dims[0] != 1 || dims[1] != 1) && (dims[2] != 1 || dims[3] != 1)) {
+            ie::blob_copy(in, out);
+            return;
+        }
+    }
+
+    auto inPtr = in->cbuffer().as<uint8_t *>();
+    IE_ASSERT(inPtr != nullptr);
+
+    auto outPtr = out->buffer().as<uint8_t *>();
+    IE_ASSERT(outPtr != nullptr);
+
+    std::copy_n(inPtr, in->byteSize(), outPtr);
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/utils/io.cpp b/inference-engine/src/vpu/graph_transformer/src/utils/io.cpp
new file mode 100644 (file)
index 0000000..e1d0189
--- /dev/null
@@ -0,0 +1,45 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/utils/io.hpp>
+
+#include <iostream>
+
+#include <vpu/utils/any.hpp>
+#include <vpu/utils/attributes_map.hpp>
+#include <vpu/utils/extra.hpp>
+
+namespace vpu {
+
+void printTo(std::ostream& os, const Any& any) noexcept {
+    any.printImpl(os);
+}
+
+void printTo(std::ostream& os, const AttributesMap& attrs) noexcept {
+    attrs.printImpl(os);
+}
+
+void formatPrint(std::ostream& os, const char* str) noexcept {
+    try {
+        while (*str) {
+            if (*str == '%') {
+                if (*(str + 1) == '%') {
+                    ++str;
+                } else {
+                    throw std::invalid_argument("[VPU] Invalid format string: missing arguments");
+                }
+            }
+
+            os << *str++;
+        }
+    } catch (const std::invalid_argument& e) {
+        std::cerr << e.what() << '\n';
+        std::abort();
+    } catch (...) {
+        std::cerr << "[VPU] Unknown error in formatPrint\n";
+        std::abort();
+    }
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/utils/logger.cpp b/inference-engine/src/vpu/graph_transformer/src/utils/logger.cpp
new file mode 100644 (file)
index 0000000..7c79c72
--- /dev/null
@@ -0,0 +1,126 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/utils/logger.hpp>
+
+#include <mutex>
+#include <string>
+#include <fstream>
+#include <iomanip>
+#include <memory>
+
+namespace vpu {
+
+//
+// OutputStream
+//
+
+namespace {
+
+class ConsoleOutput final : public OutputStream {
+public:
+    std::ostream& get() override { return std::cout; }
+
+    bool supportColors() const override {
+#ifdef _WIN32
+        // TODO: check if Windows supports colors in terminal
+        return false;
+#else
+        return true;
+#endif
+    }
+
+    void lock() override { _mtx.lock(); }
+    void unlock() override { _mtx.unlock(); }
+
+private:
+    std::mutex _mtx;
+};
+
+class FileOutput final : public OutputStream {
+public:
+    explicit FileOutput(const std::string& fileName) : _file(fileName) {
+        if (!_file.is_open()) {
+            std::cerr << "Failed to open LOG file\n";
+            std::abort();
+        }
+    }
+
+    std::ostream& get() override { return _file; }
+
+    bool supportColors() const override { return false; }
+
+    void lock() override { _mtx.lock(); }
+
+    void unlock() override { _mtx.unlock(); }
+
+private:
+    std::ofstream _file;
+    std::mutex _mtx;
+};
+
+}  // namespace
+
+OutputStream::Ptr consoleOutput() {
+    static auto obj = std::make_shared<ConsoleOutput>();
+    return obj;
+}
+
+OutputStream::Ptr fileOutput(const std::string& fileName) {
+    return std::make_shared<FileOutput>(fileName);
+}
+
+//
+// Logger
+//
+
+namespace {
+
+const auto COLOR_RED = "\033[1;31m";
+const auto COLOR_GRN = "\033[1;32m";
+const auto COLOR_YEL = "\033[1;33m";
+const auto COLOR_BLU = "\033[1;34m";
+const auto COLOR_END = "\033[0m";
+
+}  // namespace
+
+void Logger::printHeader(LogLevel msgLevel) const noexcept {
+    try {
+        if (_out->supportColors()) {
+            static const EnumMap<LogLevel, const char *> levelColors{
+                    {LogLevel::Error,   COLOR_RED},
+                    {LogLevel::Warning, COLOR_YEL},
+                    {LogLevel::Info,    COLOR_GRN},
+                    {LogLevel::Debug,   COLOR_BLU},
+            };
+
+            _out->get() << levelColors.at(msgLevel);
+        }
+
+        _out->get() << "[" << std::setw(7) << std::left << msgLevel << "]";
+        _out->get() << "[VPU]";
+        _out->get() << "[" << _name << "] ";
+
+        for (size_t i = 0; i < _ident; ++i) {
+            _out->get() << "    ";
+        }
+    } catch (...) {
+        std::cerr << "[VPU] Cannot print header\n";
+        std::abort();
+    }
+}
+
+void Logger::printFooter() const noexcept {
+    try {
+        if (_out->supportColors()) {
+            _out->get() << COLOR_END;
+        }
+        _out->get() << std::endl;
+    } catch (...) {
+        std::cerr << "[VPU] Cannot print footer\n";
+        std::abort();
+    }
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/utils/perf_report.cpp b/inference-engine/src/vpu/graph_transformer/src/utils/perf_report.cpp
new file mode 100644 (file)
index 0000000..5d5d2f5
--- /dev/null
@@ -0,0 +1,79 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/utils/perf_report.hpp>
+
+#include <vector>
+#include <string>
+#include <map>
+
+namespace vpu {
+
+std::map<std::string, ie::InferenceEngineProfileInfo> parsePerformanceReport(
+        const std::vector<StageMetaInfo>& stagesMeta,
+        const float* deviceTimings,
+        int deviceTimingsCount,
+        PerfReport perfReport,
+        bool printReceiveTensorTime) {
+    IE_ASSERT(deviceTimings != nullptr);
+    IE_ASSERT(deviceTimingsCount > 0);
+
+    std::map<std::string, ie::InferenceEngineProfileInfo> outPerfMap;
+
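+    // The device reports one timing entry per executed stage, in execution
+    // order; walk that array in step with the stage metadata.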
+    int timeIndex = 0;
+    int execIndex = 1;
+
+    for (const auto& stageMeta : stagesMeta) {
+        float timeMS = 0;
+        if (stageMeta.status == ie::InferenceEngineProfileInfo::EXECUTED &&
+            timeIndex < deviceTimingsCount) {
+            timeMS = deviceTimings[timeIndex];
+            timeIndex++;
+        }
+
+        if (stageMeta.stageType == "<Receive-Tensor>" &&
+            !printReceiveTensorTime) {
+            continue;
+        }
+
+        ie::InferenceEngineProfileInfo profInfo = {};
+
+        profInfo.status = stageMeta.status;
+
+        profInfo.cpu_uSec = 0;
+        profInfo.realTime_uSec = static_cast<long long int>(timeMS * 1000);
+
+        // Leave room for the null terminator (profInfo is zero-initialized).
+        stageMeta.layerType.copy(profInfo.layer_type, sizeof(profInfo.layer_type) / sizeof(profInfo.layer_type[0]) - 1, 0);
+        stageMeta.stageType.copy(profInfo.exec_type, sizeof(profInfo.exec_type) / sizeof(profInfo.exec_type[0]) - 1, 0);
+
+        if (stageMeta.stageType == "<Receive-Tensor>") {
+            profInfo.execution_index = 0;
+        } else if (stageMeta.status == ie::InferenceEngineProfileInfo::EXECUTED) {
+            profInfo.execution_index = execIndex;
+            execIndex++;
+        }
+
+        if (perfReport == PerfReport::PerStage) {
+            outPerfMap[stageMeta.stageName] = profInfo;
+        } else if (perfReport == PerfReport::PerLayer) {
+            auto it = outPerfMap.find(stageMeta.layerName);
+            if (it == outPerfMap.end()) {
+                outPerfMap[stageMeta.layerName] = profInfo;
+            } else {
+                auto& prevProfInfo = it->second;
+
+                if (profInfo.status == ie::InferenceEngineProfileInfo::EXECUTED) {
+                    prevProfInfo.status = ie::InferenceEngineProfileInfo::EXECUTED;
+                }
+
+                prevProfInfo.cpu_uSec += profInfo.cpu_uSec;
+                prevProfInfo.realTime_uSec += profInfo.realTime_uSec;
+            }
+        }
+    }
+
+    return outPerfMap;
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/utils/simple_math.cpp b/inference-engine/src/vpu/graph_transformer/src/utils/simple_math.cpp
new file mode 100644 (file)
index 0000000..650ed57
--- /dev/null
@@ -0,0 +1,181 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/utils/simple_math.hpp>
+
+#include <cctype>
+
+#include <string>
+#include <set>
+#include <stack>
+#include <map>
+#include <stdexcept>
+#include <utility>
+#include <functional>
+
+#include <vpu/utils/extra.hpp>
+
+namespace vpu {
+
+namespace {
+
+const std::set<char> whitespaces = {
+    ' ',
+    '\t',
+};
+
+// priority, function
+using Operator = std::pair<int, std::function<int(int, int)>>;
+
+const std::map<char, Operator> operators = {
+    { '+', { 0, std::plus<int>() } },
+    { '-', { 0, std::minus<int>() } },
+    { '*', { 1, std::multiplies<int>() } },
+    { '/', { 1, std::divides<int>()  } },
+    { '%', { 1, std::modulus<int>()  } },
+};
+
+}  // namespace
+
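+// Convert the infix expression to postfix (RPN) using the classic
+// shunting-yard algorithm; evaluate() then executes the RPN on a value stack.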
+void SimpleMathExpression::parse(const std::string& expression) {
+    _parsedTokens.clear();
+
+    std::stack<char> operatorStack;
+
+    // While there are tokens to be read.
+    for (size_t i = 0; i != expression.length(); i++) {
+        // Skip whitespace, guarding against running past the end of the string.
+        while (i < expression.length() &&
+               whitespaces.find(expression[i]) != whitespaces.end()) {
+            i++;
+        }
+        if (i == expression.length()) {
+            break;
+        }
+
+        // Read a token.
+        auto curr = expression[i];
+
+        // If the token is a number, then push it to the output queue.
+        if (std::isdigit(curr)) {
+            size_t len = 0;
+            auto value = std::stoi(expression.substr(i), &len);
+
+            _parsedTokens.emplace_back(Token(Token::Value, value, 0));
+
+            i += (len - 1);
+
+            continue;
+        }
+
+        // If the token is a variable, then push its value to the output queue.
+        if (_vars.find(curr) != _vars.end()) {
+            _parsedTokens.emplace_back(Token(Token::Value, _vars.at(curr), 0));
+
+            continue;
+        }
+
+        // If the token is an operator, then:
+        if (operators.find(curr) != operators.end()) {
+            // While there is an operator at the top of the operator stack with
+            //   greater than or equal to precedence:
+            //     pop operators from the operator stack, onto the output queue;
+            while (!operatorStack.empty() &&
+                   (operators.find(operatorStack.top()) != operators.end()) &&
+                   (operators.at(operatorStack.top()).first >= operators.at(curr).first)) {
+                auto op = operatorStack.top();
+                operatorStack.pop();
+
+                _parsedTokens.emplace_back(Token(Token::Operator, 0, op));
+            }
+
+            //     push the read operator onto the operator stack.
+            operatorStack.push(curr);
+
+            continue;
+        }
+
+        // If the token is a left bracket (i.e. "("), then:
+        //   push it onto the operator stack.
+        if (curr == '(') {
+            operatorStack.push(curr);
+
+            continue;
+        }
+
+        // If the token is a right bracket (i.e. ")"), then:
+        if (curr == ')') {
+            // While the operator at the top of the operator stack is not a left bracket:
+            //   pop operators from the operator stack onto the output queue;
+            while (!operatorStack.empty() &&
+                   operatorStack.top() != '(') {
+                _parsedTokens.emplace_back(Token(Token::Operator, 0, operatorStack.top()));
+
+                operatorStack.pop();
+            }
+
+            //   pop the left bracket from the stack.
+            // If the stack runs out without finding a left bracket, then there are mismatched parentheses.
+            if (!operatorStack.empty() &&
+                operatorStack.top() == '(') {
+                operatorStack.pop();
+            } else {
+                VPU_THROW_EXCEPTION << "Mismatched parentheses in " << expression;
+            }
+
+            continue;
+        }
+
+        // Unknown token
+        VPU_THROW_EXCEPTION << "Unknown token " << curr << " in " << expression;
+    }
+
+    // If there are no more tokens to read:
+    //   while there are still operator tokens on the stack:
+    //     if the operator token on the top of the stack is a bracket, then
+    //       there are mismatched parentheses;
+    //     pop the operator onto the output queue.
+    while (!operatorStack.empty()) {
+        if (operatorStack.top() == '(') {
+            VPU_THROW_EXCEPTION << "Mismatched parentheses in " << expression;
+        }
+
+        _parsedTokens.emplace_back(Token(Token::Operator, 0, operatorStack.top()));
+
+        operatorStack.pop();
+    }
+}
+
+int SimpleMathExpression::evaluate() const {
+    std::stack<int> values;
+    for (const auto& t : _parsedTokens) {
+        switch (t.type) {
+        case Token::Value:
+            values.push(t.value);
+            break;
+        case Token::Operator: {
+            if (values.size() < 2) {
+                VPU_THROW_EXCEPTION << "Illegal expression: not enough values for operator evaluation";
+            }
+
+            // Pop the last two values and apply the operator.
+            auto val2 = values.top();
+            values.pop();
+
+            auto val1 = values.top();
+            values.pop();
+
+            values.push(operators.at(t.op).second(val1, val2));
+
+            break;
+        }
+        default:
+            VPU_THROW_EXCEPTION << "Illegal expression: unhandled token";
+        }
+    }
+
+    if (values.size() != 1) {
+        VPU_THROW_EXCEPTION << "Illegal expression: not enough operators";
+    }
+
+    return values.top();
+}
+
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/myriad_plugin/CMakeLists.txt b/inference-engine/src/vpu/myriad_plugin/CMakeLists.txt
new file mode 100644 (file)
index 0000000..3ddfd97
--- /dev/null
@@ -0,0 +1,41 @@
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set(TARGET_NAME "myriadPlugin")
+
+file(GLOB_RECURSE SOURCES *.cpp *.hpp *.h)
+
+addVersionDefines(myriad_plugin.cpp CI_BUILD_NUMBER)
+
+add_library(${TARGET_NAME} SHARED ${SOURCES})
+
+target_include_directories(${TARGET_NAME}
+    PRIVATE
+        "${CMAKE_CURRENT_SOURCE_DIR}"
+    SYSTEM PRIVATE
+        "${IE_MAIN_SOURCE_DIR}/include"
+        "${IE_MAIN_SOURCE_DIR}/src/inference_engine")
+
+target_compile_definitions(${TARGET_NAME} PRIVATE IMPLEMENT_INFERENCE_ENGINE_PLUGIN)
+
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    target_compile_options(${TARGET_NAME}
+        PRIVATE
+            -Wall)
+endif()
+
+if (ENABLE_MYX_PCIE)
+    target_compile_definitions(${TARGET_NAME} PRIVATE USE_PCIE)
+elseif (ENABLE_MYRIAD_NO_BOOT)
+    target_compile_definitions(${TARGET_NAME} PRIVATE NO_BOOT)
+endif()
+
+target_link_libraries(${TARGET_NAME}
+    PRIVATE
+        ${INTEL_ITT_LIBS} inference_engine vpu_graph_transformer mvnc)
+
+set_target_properties(${TARGET_NAME} PROPERTIES
+    COMPILE_PDB_NAME ${TARGET_NAME})
+
+add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_async_infer_request.cpp b/inference-engine/src/vpu/myriad_plugin/myriad_async_infer_request.cpp
new file mode 100644 (file)
index 0000000..7a1733e
--- /dev/null
@@ -0,0 +1,62 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <memory>
+#include "myriad_async_infer_request.h"
+
+using namespace vpu::MyriadPlugin;
+using namespace InferenceEngine;
+
+MyriadAsyncInferRequest::MyriadAsyncInferRequest(MyriadInferRequest::Ptr request,
+                                                 const InferenceEngine::ITaskExecutor::Ptr &taskExecutorStart,
+                                                 const InferenceEngine::ITaskExecutor::Ptr &taskExecutorGetResult,
+                                                 const InferenceEngine::TaskSynchronizer::Ptr &taskSynchronizer,
+                                                 const InferenceEngine::ITaskExecutor::Ptr &callbackExecutor)
+        : InferenceEngine::AsyncInferRequestThreadSafeDefault(request,
+                                                              taskExecutorStart,
+                                                              taskSynchronizer,
+                                                              callbackExecutor),
+          _request(request), _taskExecutorGetResult(taskExecutorGetResult) {}
+
+
+InferenceEngine::StagedTask::Ptr MyriadAsyncInferRequest::createAsyncRequestTask() {
+    return std::make_shared<StagedTask>([this]() {
+        auto asyncTaskCopy = _asyncTask;
+        try {
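+            // Stages count down from 3: 3 = start inference, 2 = fetch the
+            // result, 1 = mark the request free and run the callback.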
+            switch (asyncTaskCopy->getStage()) {
+                case 3: {
+                    _request->InferAsync();
+                    asyncTaskCopy->stageDone();
+                    _taskExecutorGetResult->startTask(asyncTaskCopy);
+                }
+                    break;
+                case 2: {
+                    _request->GetResult();
+                    asyncTaskCopy->stageDone();
+                    if (_callbackManager.isCallbackEnabled()) {
+                        _callbackManager.startTask(asyncTaskCopy);
+                    } else {
+                        asyncTaskCopy->stageDone();
+                    }
+                }
+                    break;
+                case 1: {
+                    setIsRequestBusy(false);
+                    asyncTaskCopy->stageDone();
+                    _callbackManager.runCallback();
+                }
+                    break;
+                default:
+                    break;
+            }
+        } catch (...) {
+            processAsyncTaskFailure(asyncTaskCopy);
+        }
+    }, 3);
+}
+
+MyriadAsyncInferRequest::~MyriadAsyncInferRequest() {
+    waitAllAsyncTasks();
+}
+
diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_async_infer_request.h b/inference-engine/src/vpu/myriad_plugin/myriad_async_infer_request.h
new file mode 100644 (file)
index 0000000..b2ef554
--- /dev/null
@@ -0,0 +1,30 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "cpp_interfaces/impl/ie_infer_async_request_thread_safe_default.hpp"
+#include "myriad_infer_request.h"
+
+namespace vpu {
+namespace MyriadPlugin {
+
+class MyriadAsyncInferRequest : virtual public InferenceEngine::AsyncInferRequestThreadSafeDefault {
+public:
+    MyriadAsyncInferRequest(MyriadInferRequest::Ptr request,
+                                const InferenceEngine::ITaskExecutor::Ptr &taskExecutorStart,
+                                const InferenceEngine::ITaskExecutor::Ptr &taskExecutorGetResult,
+                                const InferenceEngine::TaskSynchronizer::Ptr &taskSynchronizer,
+                                const InferenceEngine::ITaskExecutor::Ptr &callbackExecutor);
+
+    InferenceEngine::StagedTask::Ptr createAsyncRequestTask() override;
+
+    ~MyriadAsyncInferRequest();
+private:
+    MyriadInferRequest::Ptr _request;
+    InferenceEngine::ITaskExecutor::Ptr _taskExecutorGetResult;
+};
+
+}  // namespace MyriadPlugin
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_config.cpp b/inference-engine/src/vpu/myriad_plugin/myriad_config.cpp
new file mode 100644 (file)
index 0000000..001b38e
--- /dev/null
@@ -0,0 +1,52 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <string>
+#include <map>
+#include <unordered_map>
+#include <unordered_set>
+#include <vpu/vpu_plugin_config.hpp>
+#include <cpp_interfaces/exception2status.hpp>
+
+#include "myriad_config.h"
+
+using namespace vpu;
+using namespace vpu::MyriadPlugin;
+
+MyriadConfig::MyriadConfig(const std::map<std::string, std::string> &config, ConfigMode mode) : ParsedConfig(mode)  {
+    configure(parse(config));
+
+    platform = UNKNOWN_PLATFORM;
+    const std::unordered_map<std::string, ncDevicePlatform_t> platforms = {
+        { VPU_CONFIG_VALUE(2450), MYRIAD_2 },
+        { VPU_CONFIG_VALUE(2480), MYRIAD_X }
+    };
+
+    setOption(platform, platforms, config, VPU_CONFIG_KEY(PLATFORM));
+
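+    // YES maps to the default 1000 ms watchdog ping interval, NO maps to 0.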
+    const std::unordered_map<std::string, int> switches = {
+        { CONFIG_VALUE(YES), 1000 },
+        { CONFIG_VALUE(NO), 0 }
+    };
+
+    setOption(watchdogInterval, switches, config, VPU_CONFIG_KEY(WATCHDOG));
+
+#ifndef NDEBUG
+    if (auto envVar = std::getenv("IE_VPU_WATCHDOG_INTERVAL")) {
+        watchdogInterval = std::stoi(envVar);
+    }
+#endif
+}
+
+void MyriadConfig::checkInvalidValues(const std::map<std::string, std::string> &config) const {
+    ParsedConfig::checkInvalidValues(config);
+    checkSupportedValues({{VPU_CONFIG_KEY(PLATFORM), {VPU_CONFIG_VALUE(2450), VPU_CONFIG_VALUE(2480)}}}, config);
+}
+
+std::unordered_set<std::string> MyriadConfig::getRuntimeOptions() const {
+    auto runtimeOptions = ParsedConfig::getRuntimeOptions();
+    runtimeOptions.insert({VPU_CONFIG_KEY(PLATFORM)});
+    runtimeOptions.insert({VPU_CONFIG_KEY(WATCHDOG)});
+    return runtimeOptions;
+}
diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_config.h b/inference-engine/src/vpu/myriad_plugin/myriad_config.h
new file mode 100644 (file)
index 0000000..ad297f2
--- /dev/null
@@ -0,0 +1,30 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <unordered_set>
+
+#include <vpu/parsed_config.hpp>
+
+#include <mvnc.h>
+
+namespace vpu {
+namespace MyriadPlugin {
+
+struct MyriadConfig final : ParsedConfig {
+    ncDevicePlatform_t platform;
+    int  watchdogInterval = 1000;
+    explicit MyriadConfig(const std::map<std::string, std::string> &config = std::map<std::string, std::string>(),
+                          ConfigMode mode = ConfigMode::DEFAULT_MODE);
+
+private:
+    std::unordered_set<std::string> getRuntimeOptions() const final;
+    void checkInvalidValues(const std::map<std::string, std::string> &config) const final;
+};
+
+}  // namespace MyriadPlugin
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.cpp b/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.cpp
new file mode 100644 (file)
index 0000000..b03f2b1
--- /dev/null
@@ -0,0 +1,130 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <algorithm>
+#include <utility>
+
+#include <myriad_executable_network.h>
+#include <vpu/blob_reader.hpp>
+#include <net_pass.h>
+
+using namespace InferenceEngine;
+
+namespace vpu {
+namespace MyriadPlugin {
+
+ExecutableNetwork::ExecutableNetwork(ICNNNetwork &network, std::vector<DevicePtr> &devicePool,
+                                     const std::map<std::string, std::string> &config) {
+    _config = std::make_shared<MyriadConfig>(config);
+
+    _log = std::make_shared<Logger>("MyriadPlugin", _config->logLevel, consoleOutput());
+
+    _executor = std::make_shared<MyriadExecutor>(_config->forceReset, _config->vpuLogLevel, _log);
+    _device = _executor->openDevice(devicePool, _config);
+
+    // Ignore the hardware optimization config for MYRIAD2; it is always disabled.
+    if (_device->_platform == MYRIAD_2) {
+        _config->compileConfig.hwOptimization = false;
+    }
+
+    // Try to combine RNN sequences first; otherwise fall back to unrolling TensorIterator.
+    bool tiProcessedOk = NetPass::CombineRNNSeq(network) || NetPass::UnrollTI(network);
+    if (!tiProcessedOk)
+        THROW_IE_EXCEPTION << "Plugin doesn't support Tensor Iterator in pure form. "
+                              "No TI optimization pattern could be applied successfully";
+
+    auto compiledGraph = compileNetwork(
+        network,
+        static_cast<Platform>(_device->_platform),
+        _config->compileConfig,
+        std::make_shared<Logger>("GraphCompiler", _config->logLevel, consoleOutput()));
+
+    _graphBlob = std::move(compiledGraph->blob);
+    _stagesMetaData = std::move(compiledGraph->stagesMeta);
+
+    _inputInfo  = std::move(compiledGraph->inputInfo);
+    _outputInfo = std::move(compiledGraph->outputInfo);
+
+    if (!_device->isBooted()) {
+        return;
+    }
+
+    char networkName[1024] = {};
+    network.getName(networkName, sizeof(networkName));
+    _executor->allocateGraph(_device, _graphDesc, _graphBlob, compiledGraph->blobHeader, compiledGraph->numActiveStages, networkName);
+    if (_config->exclusiveAsyncRequests) {
+        ExecutorManager *executorManager = ExecutorManager::getInstance();
+        _taskExecutor = executorManager->getExecutor(
+                TargetDeviceInfo::name(TargetDevice::eMYRIAD));
+    }
+
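+    // Pre-generate the pool of executor IDs later handed out for GetResult
+    // task executors (see getNextTaskExecutor()).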
+    for (size_t i = 0; i < _maxTaskExecutorGetResultCount; i++) {
+        std::stringstream idStream;
+        idStream << networkName << "_TaskExecutorGetResult" << i;
+        _taskExecutorGetResultIds.emplace(idStream.str());
+    }
+}
+
+ExecutableNetwork::ExecutableNetwork(const std::string &blobFilename,
+                           std::vector<DevicePtr> &devicePool,
+                           const std::map<std::string, std::string> &config) {
+    _config = std::make_shared<MyriadConfig>(config, ConfigMode::RUNTIME_MODE);
+
+    _log = std::make_shared<Logger>("MyriadPlugin", _config->logLevel, consoleOutput());
+
+    _executor = std::make_shared<MyriadExecutor>(_config->forceReset, _config->vpuLogLevel, _log);
+    _device = _executor->openDevice(devicePool, _config);
+
+    // Ignore the hardware optimization config for MYRIAD2; it is always disabled.
+    if (_device->_platform == MYRIAD_2) {
+        _config->compileConfig.hwOptimization = false;
+    }
+
+    std::ifstream blobFile(blobFilename, std::ios::binary);
+    std::ostringstream blobContentStream;
+    blobContentStream << blobFile.rdbuf();
+    const std::string& blobContentString = blobContentStream.str();
+    std::copy(blobContentString.begin(), blobContentString.end(), std::back_inserter(_graphBlob));
+
+    if (!_device->isBooted()) {
+        return;
+    }
+
+    // TODO: better name
+    char networkName[1024] = "importedNetwork";
+
+    BlobReader blobReader;
+    blobReader.parse(_graphBlob);
+
+    this->_networkInputs  = blobReader.getNetworkInputs();
+    this->_networkOutputs = blobReader.getNetworkOutputs();
+    std::size_t numStages = blobReader.getStageCount();
+    auto blobHeader = blobReader.getHeader();
+
+    _inputInfo  = blobReader.getInputInfo();
+    _outputInfo = blobReader.getOutputInfo();
+
+    _executor->allocateGraph(_device, _graphDesc, _graphBlob, blobHeader, numStages, networkName);
+
+    _stagesMetaData.resize(numStages);
+    for (auto &meta : _stagesMetaData) {
+        meta.stageName = meta.stageType = meta.layerName = meta.layerType = "UNKNOWN";
+        meta.status = InferenceEngineProfileInfo::LayerStatus::EXECUTED;
+    }
+
+    if (_config->exclusiveAsyncRequests) {
+        ExecutorManager *executorManager = ExecutorManager::getInstance();
+        _taskExecutor = executorManager->getExecutor(
+                TargetDeviceInfo::name(TargetDevice::eMYRIAD));
+    }
+
+    for (size_t i = 0; i < _maxTaskExecutorGetResultCount; i++) {
+        std::stringstream idStream;
+        idStream << networkName << "_TaskExecutorGetResult" << i;
+        _taskExecutorGetResultIds.emplace(idStream.str());
+    }
+}
+
+}  // namespace MyriadPlugin
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.h b/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.h
new file mode 100644 (file)
index 0000000..f981112
--- /dev/null
@@ -0,0 +1,125 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include <map>
+#include <queue>
+#include <sstream>
+#include <fstream>
+
+#include <ie_common.h>
+#include <cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp>
+#include <cpp_interfaces/ie_executor_manager.hpp>
+
+#include <vpu/graph_transformer.hpp>
+#include <vpu/parsed_config.hpp>
+
+#include "myriad_executor.h"
+#include "myriad_executable_network.h"
+#include "myriad_infer_request.h"
+#include "myriad_async_infer_request.h"
+#include "myriad_config.h"
+
+namespace vpu {
+namespace MyriadPlugin {
+
+class ExecutableNetwork : public InferenceEngine::ExecutableNetworkThreadSafeDefault {
+public:
+    typedef std::shared_ptr<ExecutableNetwork> Ptr;
+
+    explicit ExecutableNetwork(InferenceEngine::ICNNNetwork &network,
+                               std::vector<DevicePtr> &devicePool,
+                               const std::map<std::string, std::string> &config);
+
+    explicit ExecutableNetwork(const std::string &blobFilename,
+                               std::vector<DevicePtr> &devicePool,
+                               const std::map<std::string, std::string> &config);
+
+    virtual ~ExecutableNetwork() {
+        try {
+            _executor->deallocateGraph(_device, _graphDesc);
+        }
+        catch (...) {
+            std::cerr << "ERROR ~ExecutableNetwork():\n"
+                      << "An error occurred while calling the deallocateGraph() method";
+        }
+    }
+
+    InferenceEngine::InferRequestInternal::Ptr CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
+                                                                      InferenceEngine::OutputsDataMap networkOutputs) override {
+        return std::make_shared<MyriadInferRequest>(_graphDesc, networkInputs, networkOutputs,
+                                                    _inputInfo, _outputInfo,
+                                                    _stagesMetaData, _config, _log, _executor);
+    }
+
+    void CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) override {
+        if (!_device->isBooted()) {
+            THROW_IE_EXCEPTION << "Cannot create infer request: there are no available devices with platform "
+                               << _device->_platform;
+        }
+
+        auto syncRequestImpl = std::make_shared<MyriadInferRequest>(_graphDesc, _networkInputs, _networkOutputs,
+                                                                    _inputInfo, _outputInfo,
+                                                                    _stagesMetaData, _config, _log,
+                                                                    _executor);
+        syncRequestImpl->setPointerToExecutableNetworkInternal(shared_from_this());
+        auto taskExecutorGetResult = getNextTaskExecutor();
+        auto asyncThreadSafeImpl = std::make_shared<MyriadAsyncInferRequest>(
+                syncRequestImpl, _taskExecutor, taskExecutorGetResult, _taskSynchronizer, _callbackExecutor);
+        asyncRequest.reset(new InferenceEngine::InferRequestBase<InferenceEngine::AsyncInferRequestThreadSafeDefault>(
+                           asyncThreadSafeImpl),
+                           [](InferenceEngine::IInferRequest *p) { p->Release(); });
+        asyncThreadSafeImpl->SetPointerToPublicInterface(asyncRequest);
+    }
+
+    void Export(const std::string &modelFileName) override {
+        std::ofstream modelFile(modelFileName, std::ios::out | std::ios::binary);
+
+        if (modelFile.is_open()) {
+            modelFile.write(_graphBlob.data(), _graphBlob.size());
+        } else {
+            THROW_IE_EXCEPTION << "The " << modelFileName << " file cannot be opened for export";
+        }
+    }
+
+    void GetMappedTopology(
+            std::map<std::string, std::vector<InferenceEngine::PrimitiveInfo::Ptr>> &deployedTopology) override {
+        THROW_IE_EXCEPTION << "GetMappedTopology is not implemented\n";
+    }
+
+private:
+    Logger::Ptr _log;
+    MyriadExecutorPtr _executor;
+    std::vector<char> _graphBlob;
+    GraphDesc _graphDesc;
+    DevicePtr _device;
+    std::vector<StageMetaInfo> _stagesMetaData;
+    std::shared_ptr<MyriadConfig> _config;
+
+    DataInfo _inputInfo;
+    DataInfo _outputInfo;
+
+    const size_t _maxTaskExecutorGetResultCount = 1;
+    std::queue<std::string> _taskExecutorGetResultIds;
+
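+    // Rotate the pre-generated executor IDs round-robin: take the front ID
+    // and push it back so requests are spread evenly across the pool.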
+    InferenceEngine::ITaskExecutor::Ptr getNextTaskExecutor() {
+        std::string id = _taskExecutorGetResultIds.front();
+
+        _taskExecutorGetResultIds.pop();
+        _taskExecutorGetResultIds.push(id);
+
+        InferenceEngine::ExecutorManager *executorManager = InferenceEngine::ExecutorManager::getInstance();
+        InferenceEngine::ITaskExecutor::Ptr taskExecutor = executorManager->getExecutor(id);
+
+        return taskExecutor;
+    }
+};
+
+}  // namespace MyriadPlugin
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_executor.cpp b/inference-engine/src/vpu/myriad_plugin/myriad_executor.cpp
new file mode 100644 (file)
index 0000000..1f6de05
--- /dev/null
@@ -0,0 +1,413 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <mutex>
+#include <map>
+#include <algorithm>
+#include <utility>
+
+#include <mvnc.h>
+#include <ie_common.h>
+#include <thread>
+
+#include <vpu/vpu_plugin_config.hpp>
+#include <vpu/utils/extra.hpp>
+#include <vpu/utils/logger.hpp>
+
+#include "myriad_executor.h"
+#include "myriad_config.h"
+
+#ifndef _WIN32
+# include <libgen.h>
+# include <dlfcn.h>
+#endif
+
+using namespace vpu::MyriadPlugin;
+using namespace InferenceEngine;
+using namespace InferenceEngine::VPUConfigParams;
+using namespace std;
+using namespace vpu;
+
+static std::mutex device_mutex;
+
+MyriadExecutor::MyriadExecutor(bool forceReset, const LogLevel& vpuLogLevel, const Logger::Ptr& log) : _log(log) {
+    int ncLogLevel;
+    switch (vpuLogLevel) {
+        case LogLevel::Warning:
+            ncLogLevel = 2;
+            break;
+        case LogLevel::Info:
+            ncLogLevel = 1;
+            break;
+        case LogLevel::Debug:
+            ncLogLevel = 0;
+            break;
+        default:
+            ncLogLevel = 3;
+            break;
+    }
+
+    int reset_all = forceReset;
+    char * tmp = std::getenv("VPU_FORCE_RESET");
+    if (tmp) {
+        std::string env = tmp;
+        if (env == "0")
+            reset_all = 0;
+        else if (env == "1")
+            reset_all = 1;
+    }
+
+    auto status = ncGlobalSetOption(NC_RW_RESET_ALL, &reset_all, sizeof(reset_all));
+    if (status != NC_OK) {
+        _log->warning("failed to set RESET_ALL flag: %d with error: %s\n",
+                    reset_all,
+                    ncStatusToStr(nullptr, status));
+    }
+
+    status = ncGlobalSetOption(NC_RW_LOG_LEVEL, &ncLogLevel, sizeof(ncLogLevel));
+    if (status != NC_OK) {
+        _log->warning("failed to set log level: %d with error: %s\n",
+                    ncLogLevel,
+                    ncStatusToStr(nullptr, status));
+    }
+}
+
+/*
+ * @brief Boot the next available device
+ */
+ncStatus_t MyriadExecutor::bootNextDevice(std::vector<DevicePtr> &devicePool,
+                                          const ncDevicePlatform_t &configPlatform,
+                                          int watchdogInterval) {
+// #-17972, #-16790
+#if defined(USE_PCIE) || defined(NO_BOOT)
+    if (!devicePool.empty()) {
+        _log->info("PCIe and NO_BOOT support only one device");
+        return NC_DEVICE_NOT_FOUND;
+    }
+#endif
+    int lastDeviceIdx = devicePool.empty() ? -1 : devicePool.back()->_deviceIdx;
+
+    ncStatus_t statusOpen = NC_ERROR;
+
+    DeviceDesc device;
+
+    char* dirName = nullptr;
+
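+    // On non-Windows hosts, derive the firmware directory from the location
+    // of the plugin shared library, obtained via dladdr().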
+#if !defined(_WIN32)
+    Dl_info info;
+    dladdr(&device_mutex, &info);
+    char* dli_fname = nullptr;
+
+    if (info.dli_fname != nullptr) {
+        dli_fname = strdup(info.dli_fname);
+        dirName = dirname(dli_fname);
+    }
+#endif
+
+    // Open new device with specific path to FW folder
+    statusOpen = ncDeviceOpen(&device._deviceHandle, configPlatform, watchdogInterval, dirName);
+
+#if !defined(_WIN32)
+    if (info.dli_fname != nullptr) {
+        free(dli_fname);
+    }
+#endif
+
+    if (statusOpen != NC_OK) {
+        ncDeviceClose(&device._deviceHandle);
+        return statusOpen;
+    }
+
+    unsigned int dataLength;
+
+    // Get device platform
+    ncStatus_t status = ncDeviceGetOption(device._deviceHandle, NC_RO_DEVICE_PLATFORM,
+                               reinterpret_cast<void*>(&device._platform), &dataLength);
+    if (status != NC_OK || dataLength != sizeof(device._platform)) {
+        _log->warning("Failed to get device platform");
+        ncDeviceClose(&device._deviceHandle);
+        return status != NC_OK ? status : NC_ERROR;     // for dataLength error
+    }
+
+    // Get device max executors
+    status = ncDeviceGetOption(device._deviceHandle, NC_RO_DEVICE_MAX_GRAPH_NUM,
+                               reinterpret_cast<void*>(&device._maxExecutors), &dataLength);
+    if (status != NC_OK || dataLength != sizeof(device._maxExecutors)) {
+        _log->warning("Failed to get maximum supported number of graphs");
+        ncDeviceClose(&device._deviceHandle);
+        return status != NC_OK ? status : NC_ERROR;     // for dataLength error
+    }
+
+    /* TODO: what should we do if the maximum number of graphs is unknown? What if we get a number <= 0? */
+    device._executors = 1;
+    device._deviceIdx = lastDeviceIdx + 1;
+    devicePool.push_back(std::make_shared<DeviceDesc>(device));
+    return NC_OK;
+}
+
+DevicePtr MyriadExecutor::openDevice(std::vector<DevicePtr> &devicePool,
+                                     const std::shared_ptr<MyriadConfig> &config) {
+    std::lock_guard<std::mutex> lock(device_mutex);
+
+    auto firstBootedButEmptyDevice = std::find_if(devicePool.begin(), devicePool.end(),
+        [&config](const DevicePtr &device) {
+            bool isFromConfig = config->platform == UNKNOWN_PLATFORM ? true : device->_platform == config->platform;
+            return device->isBooted() && device->isEmpty() && isFromConfig;
+        });
+
+    if (firstBootedButEmptyDevice != devicePool.end()) {
+        auto &device = *firstBootedButEmptyDevice;
+        device->_executors = 1;
+        return device;
+    }
+
+    ncStatus_t booted = bootNextDevice(devicePool, config->platform, config->watchdogInterval);
+
+    // TODO: Are there any tests for this case?
+    // If there is no other unbooted device, reuse an already booted one with the minimum number of executors
+    if (booted != NC_OK) {
+        std::vector<DevicePtr> availableDevices;
+        // Get all suitable devices
+        std::copy_if(devicePool.begin(), devicePool.end(), std::back_inserter(availableDevices),
+            [&config](const DevicePtr &device) {
+                bool isFromConfig = config->platform == UNKNOWN_PLATFORM ? true : device->_platform == config->platform;
+                return !device->isEmpty() && device->isAvailable() && isFromConfig;
+            });
+
+        // Return a mock device. If inference is attempted on it, an exception will be thrown
+        if (availableDevices.empty() && config->platform != UNKNOWN_PLATFORM) {
+            DeviceDesc device;
+            device._platform = config->platform;
+            return std::make_shared<DeviceDesc>(device);
+        } else if (availableDevices.empty()) {
+            THROW_IE_EXCEPTION << "Cannot init USB device: " << ncStatusToStr(nullptr, booted);
+        }
+
+        auto deviceWithMinExecutors = std::min_element(availableDevices.begin(), availableDevices.end(),
+            [](const DevicePtr &lhs, const DevicePtr &rhs) { return lhs->_executors < rhs->_executors; });
+
+        auto &device = *deviceWithMinExecutors;
+        device->_executors++;
+        return device;
+    }
+
+    _log->info("Device #%d %s allocated", devicePool.size() - 1,
+        devicePool.back()->_platform == MYRIAD_X ? "MYRIAD-X" : "MYRIAD-2");
+
+    return devicePool.back();
+}
+
+VPU_PACKED(bin_header {
+    int32_t  magic;
+    uint32_t frequency;
+};)
+
+void MyriadExecutor::closeDevices(std::vector<DevicePtr> &devicePool) {
+    std::lock_guard<std::mutex> lock(device_mutex);
+    for (auto &device : devicePool) {
+        if (device->_deviceHandle != nullptr) {
+            auto res = ncDeviceClose(&(device->_deviceHandle));
+            if (res != NC_OK)
+                printf("ncDeviceClose failed (%d)\n", static_cast<int>(res));
+            device->_deviceHandle = nullptr;
+        }
+    }
+}
+
+void MyriadExecutor::allocateGraph(DevicePtr &device, GraphDesc &graphDesc,
+                                   const std::vector<char> &graphFileContent,
+                                   const std::pair<const char*, size_t> &graphHeaderDesc,
+                                   size_t numStages, const char* networkName) {
+    _numStages = numStages;
+    if (device->_deviceHandle == nullptr) {
+        THROW_IE_EXCEPTION << "Failed to allocate graph: MYRIAD device is not opened.";
+    }
+
+    ncStatus_t status;
+
+    status = ncGraphCreate(networkName, &graphDesc._graphHandle);
+    if (status != NC_OK) {
+        THROW_IE_EXCEPTION << "Failed to init graph: " << ncStatusToStr(nullptr, status);
+    }
+
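+    // MYRIAD X can run a graph on two executors in parallel; MYRIAD 2 supports only one.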
+    int executors = device->_platform == MYRIAD_X ? 2 : 1;
+    status = ncGraphSetOption(graphDesc._graphHandle, NC_RW_GRAPH_EXECUTORS_NUM, &executors, sizeof(executors));
+    if (status != NC_OK) {
+        THROW_IE_EXCEPTION << "Failed to set graph executors: " << ncStatusToStr(nullptr, status);
+    }
+
+    status = ncGraphAllocate(device->_deviceHandle,
+                             graphDesc._graphHandle,
+                             graphFileContent.data(),
+                             static_cast<unsigned int>(graphFileContent.size()),
+                             graphHeaderDesc.first,
+                             graphHeaderDesc.second);
+    if (status != NC_OK) {
+        THROW_IE_EXCEPTION << "Failed to allocate graph: " << ncStatusToStr(nullptr, status);
+    }
+
+    unsigned int dataLength = sizeof(int);
+
+    int numInputs = 0;
+    status = ncGraphGetOption(graphDesc._graphHandle, NC_RO_GRAPH_INPUT_COUNT, &numInputs, &dataLength);
+    if (status != NC_OK) {
+        THROW_IE_EXCEPTION << "Failed to get number of inputs: " << ncStatusToStr(graphDesc._graphHandle, status);
+    }
+    if (numInputs != 1) {
+        THROW_IE_EXCEPTION << "Unsupported number of inputs: " << numInputs;
+    }
+
+    int numOutputs = 0;
+    status = ncGraphGetOption(graphDesc._graphHandle, NC_RO_GRAPH_OUTPUT_COUNT, &numOutputs, &dataLength);
+    if (status != NC_OK) {
+        THROW_IE_EXCEPTION << "Failed to get number of outputs: " << ncStatusToStr(graphDesc._graphHandle, status);
+    }
+    if (numOutputs != 1) {
+        THROW_IE_EXCEPTION << "Unsupported number of outputs: " << numOutputs;
+    }
+
+    dataLength = sizeof(ncTensorDescriptor_t);
+    status = ncGraphGetOption(graphDesc._graphHandle, NC_RO_GRAPH_INPUT_TENSOR_DESCRIPTORS, &graphDesc._inputDesc,
+                              &dataLength);
+    if (status != NC_OK) {
+        THROW_IE_EXCEPTION << "Failed to get input description: " << ncStatusToStr(graphDesc._graphHandle, status);
+    }
+
+    status = ncGraphGetOption(graphDesc._graphHandle, NC_RO_GRAPH_OUTPUT_TENSOR_DESCRIPTORS, &graphDesc._outputDesc,
+                              &dataLength);
+    if (status != NC_OK) {
+        THROW_IE_EXCEPTION << "Failed to get output description: " << ncStatusToStr(graphDesc._graphHandle, status);
+    }
+
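+    // Each host-side FIFO is allocated with room for four tensors.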
+    unsigned int fifo_elements = 4;
+
+    status = ncFifoCreate("input", NC_FIFO_HOST_WO, &graphDesc._inputFifoHandle);
+    if (status != NC_OK) {
+        THROW_IE_EXCEPTION << "Failed to init input FIFO: " << ncStatusToStr(graphDesc._graphHandle, status);
+    }
+
+    status = ncFifoAllocate(graphDesc._inputFifoHandle, device->_deviceHandle, &graphDesc._inputDesc, fifo_elements);
+    if (status != NC_OK) {
+        THROW_IE_EXCEPTION << "Failed to create input FIFO: " << ncStatusToStr(graphDesc._graphHandle, status);
+    }
+
+    status = ncFifoCreate("output", NC_FIFO_HOST_RO, &graphDesc._outputFifoHandle);
+    if (status != NC_OK) {
+        THROW_IE_EXCEPTION << "Failed to init output FIFO: " << ncStatusToStr(graphDesc._graphHandle, status);
+    }
+
+    status = ncFifoAllocate(graphDesc._outputFifoHandle, device->_deviceHandle, &graphDesc._outputDesc, fifo_elements);
+    if (status != NC_OK) {
+        THROW_IE_EXCEPTION << "Failed to create output FIFO: " << ncStatusToStr(graphDesc._graphHandle, status);
+    }
+}
+
+void MyriadExecutor::queueInference(GraphDesc &graphDesc, void *input_data, size_t input_bytes,
+                    void *result_data, size_t result_bytes) {
+#ifndef NDEBUG
+    if (auto dumpFileName = std::getenv("IE_VPU_DUMP_INPUT_FILE_NAME")) {
+        std::ofstream file(dumpFileName, std::ios_base::binary | std::ios_base::out);
+        if (!file.is_open()) {
+            THROW_IE_EXCEPTION << "[VPU] Cannot open file " << dumpFileName << " for writing";
+        }
+        file.write(static_cast<const char*>(input_data), input_bytes);
+    }
+#endif
+
+    if (graphDesc._inputDesc.totalSize != input_bytes) {
+        THROW_IE_EXCEPTION << "Input has unexpected size " << input_bytes << ", expected "
+                           << graphDesc._inputDesc.totalSize;
+    }
+
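+    // Push the input tensor into the input FIFO and queue the inference in a single call.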
+    ncStatus_t status = ncGraphQueueInferenceWithFifoElem(graphDesc._graphHandle,
+                                graphDesc._inputFifoHandle, graphDesc._outputFifoHandle,
+                                input_data, &graphDesc._inputDesc.totalSize, nullptr);
+    if (status != NC_OK) {
+        THROW_IE_EXCEPTION << "Failed to queue inference: " << ncStatusToStr(graphDesc._graphHandle, status);
+    }
+
+    if (result_data != nullptr && result_bytes != 0) {
+        getResult(graphDesc, result_data, result_bytes);
+    }
+}
+
+void MyriadExecutor::getResult(GraphDesc &graphDesc, void *result_data, unsigned int result_bytes) {
+    ncStatus_t status;
+    void *userParam = nullptr;
+    status = ncFifoReadElem(graphDesc._outputFifoHandle, result_data, &result_bytes, &userParam);
+    if (status != NC_OK) {
+        THROW_IE_EXCEPTION << "Failed to read output from FIFO: " << ncStatusToStr(graphDesc._graphHandle, status);
+    }
+}
+
+void MyriadExecutor::deallocateGraph(DevicePtr &device, GraphDesc &graphDesc) {
+    std::lock_guard<std::mutex> lock(device_mutex);
+
+    if (graphDesc._inputFifoHandle != nullptr) {
+        auto res = ncFifoDestroy(&graphDesc._inputFifoHandle);
+        if (res != NC_OK)
+            _log->warning("ncFifoDestroy result %s", ncStatusToStr(nullptr, res));
+        graphDesc._inputFifoHandle = nullptr;
+    }
+    if (graphDesc._outputFifoHandle != nullptr) {
+        auto res = ncFifoDestroy(&graphDesc._outputFifoHandle);
+        if (res != NC_OK)
+            _log->warning("ncFifoDestroy result %s", ncStatusToStr(nullptr, res));
+        graphDesc._outputFifoHandle = nullptr;
+    }
+    if (graphDesc._graphHandle != nullptr) {
+        auto res = ncGraphDestroy(&graphDesc._graphHandle);
+        if (res != NC_OK)
+            _log->warning("Deallocate Graph result %s.", ncStatusToStr(nullptr, res));
+        graphDesc._graphHandle = nullptr;
+    }
+    if (device->_deviceHandle != nullptr) {
+        device->_executors -= 1;
+    }
+}
+
+std::string MyriadExecutor::ncStatusToStr(ncGraphHandle_t *graphHandle, ncStatus_t status) {
+#define MVNC_STATUS_TO_STR(E) case E: return #E;
+    switch (status) {
+        MVNC_STATUS_TO_STR(NC_OK)
+        MVNC_STATUS_TO_STR(NC_BUSY)
+        MVNC_STATUS_TO_STR(NC_ERROR)
+        MVNC_STATUS_TO_STR(NC_OUT_OF_MEMORY)
+        MVNC_STATUS_TO_STR(NC_DEVICE_NOT_FOUND)
+        MVNC_STATUS_TO_STR(NC_INVALID_PARAMETERS)
+        MVNC_STATUS_TO_STR(NC_TIMEOUT)
+        MVNC_STATUS_TO_STR(NC_MVCMD_NOT_FOUND)
+        MVNC_STATUS_TO_STR(NC_NOT_ALLOCATED)
+        MVNC_STATUS_TO_STR(NC_UNAUTHORIZED)
+        MVNC_STATUS_TO_STR(NC_UNSUPPORTED_FEATURE)
+        MVNC_STATUS_TO_STR(NC_UNSUPPORTED_GRAPH_FILE)
+        MVNC_STATUS_TO_STR(NC_UNSUPPORTED_CONFIGURATION_FILE)
+        case NC_MYRIAD_ERROR: {
+            if (graphHandle == nullptr) {
+                return "NC_MYRIAD_ERROR";
+            } else {
+                auto debugInfo = getGraphInfo<char>(graphHandle, NC_RO_GRAPH_DEBUG_INFO, NC_DEBUG_BUFFER_SIZE);
+                if (debugInfo.empty()) {
+                    return "NC_MYRIAD_ERROR";
+                } else {
+                    return std::string(debugInfo.begin(), debugInfo.end());
+                }
+            }
+        }
+        default:
+            return "UNKNOWN MVNC STATUS";
+    }
+#undef MVNC_STATUS_TO_STR
+}
+
+void MyriadExecutor::printThrottlingStatus() {
+// TODO: enable when needed
+}
+
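+// Per-stage timings plus two extra entries appended by the firmware
+// (presumably the receive-tensor and total times).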
+std::vector<float> MyriadExecutor::getPerfTimeInfo(ncGraphHandle_t *graphHandle) {
+    return getGraphInfo<float>(graphHandle, NC_RO_GRAPH_TIME_TAKEN, _numStages + 2);
+}
diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_executor.h b/inference-engine/src/vpu/myriad_plugin/myriad_executor.h
new file mode 100644 (file)
index 0000000..c8201b9
--- /dev/null
@@ -0,0 +1,114 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <memory>
+#include <map>
+#include <iomanip>
+#include <utility>
+
+#include <mvnc.h>
+
+#include <myriad_config.h>
+
+namespace vpu {
+namespace MyriadPlugin {
+
+struct GraphDesc {
+    ncGraphHandle_t *_graphHandle = nullptr;
+
+    ncTensorDescriptor_t _inputDesc = {};
+    ncTensorDescriptor_t _outputDesc = {};
+
+    ncFifoHandle_t *_inputFifoHandle = nullptr;
+    ncFifoHandle_t *_outputFifoHandle = nullptr;
+};
+
+struct DeviceDesc {
+    int _executors = 0;
+    int _maxExecutors = 0;
+    ncDevicePlatform_t _platform = UNKNOWN_PLATFORM;
+    int _deviceIdx = -1;
+    ncDeviceHandle_t *_deviceHandle = nullptr;
+
+    bool isBooted() const {
+        return _deviceHandle != nullptr;
+    }
+    bool isEmpty() const {
+        return _executors == 0;
+    }
+    bool isAvailable() const {
+        return _executors < _maxExecutors;
+    }
+};
+
+typedef std::shared_ptr<DeviceDesc> DevicePtr;
+
+class MyriadExecutor {
+    Logger::Ptr _log;
+    unsigned int _numStages = 0;
+
+public:
+    MyriadExecutor(bool forceReset, const LogLevel& vpuLogLevel, const Logger::Ptr& log);
+    ~MyriadExecutor() = default;
+
+    /**
+     * @brief Get myriad device
+     * @return Already booted and empty device or new booted device
+     */
+    DevicePtr openDevice(std::vector<DevicePtr> &devicePool, const std::shared_ptr<MyriadConfig> &config);
+
+    static void closeDevices(std::vector<DevicePtr> &devicePool);
+
+    void allocateGraph(DevicePtr &device,
+                       GraphDesc &graphDesc,
+                       const std::vector<char> &graphFileContent,
+                       const std::pair<const char*, size_t> &graphHeaderDesc,
+                       size_t numStages,
+                       const char* networkName);
+
+    void deallocateGraph(DevicePtr &device, GraphDesc &graphDesc);
+
+    void queueInference(GraphDesc &graphDesc, void *input_data, size_t input_bytes,
+                        void *result_data, size_t result_bytes);
+
+    void getResult(GraphDesc &graphDesc, void *result_data, unsigned int result_bytes);
+
+    std::string ncStatusToStr(ncGraphHandle_t *graphHandle, ncStatus_t status);
+
+    std::vector<float> getPerfTimeInfo(ncGraphHandle_t *graphHandle);
+
+    void printThrottlingStatus();
+
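+    // Generic wrapper over ncGraphGetOption(): returns numElems values of T,
+    // or an empty vector if the query fails.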
+    template<typename T>
+    std::vector<T> getGraphInfo(
+            ncGraphHandle_t* graphHandle,
+            int graphOption,
+            int numElems) {
+        std::vector<T> out(numElems);
+
+        unsigned int infoByteSize = numElems * sizeof(T);
+        if (ncGraphGetOption(graphHandle, graphOption, out.data(), &infoByteSize) != NC_OK) {
+            out.clear();
+        }
+
+        return out;
+    }
+
+private:
+    /**
+     * @brief Try to boot any available device
+     * @param configPlatform Boot the selected platform
+     */
+    ncStatus_t bootNextDevice(std::vector<DevicePtr> &devicePool, const ncDevicePlatform_t &configPlatform, int watchdogInterval);
+};
+
+typedef std::shared_ptr<MyriadExecutor> MyriadExecutorPtr;
+
+}  // namespace MyriadPlugin
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_infer_request.cpp b/inference-engine/src/vpu/myriad_plugin/myriad_infer_request.cpp
new file mode 100644 (file)
index 0000000..82f8bbc
--- /dev/null
@@ -0,0 +1,192 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#define NOMINMAX
+#include <ie_blob.h>
+#include <ie_plugin.hpp>
+#include <description_buffer.hpp>
+#include <debug.h>
+#include <ie_layouts.h>
+#include <precision_utils.h>
+
+#include <vpu/utils/perf_report.hpp>
+#include <vpu/utils/ie_helpers.hpp>
+
+#include "myriad_executable_network.h"
+#include "myriad_infer_request.h"
+
+using namespace vpu;
+using namespace vpu::MyriadPlugin;
+using namespace InferenceEngine;
+
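+// Byte-wise copy helper: std::copy_n over raw pointers behaves like memcpy here.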
+#define MEMCPY(dst, src, bytes) std::copy_n((src), (bytes), (dst))
+
+MyriadInferRequest::MyriadInferRequest(GraphDesc &graphDesc,
+                                        InferenceEngine::InputsDataMap networkInputs,
+                                        InferenceEngine::OutputsDataMap networkOutputs,
+                                        DataInfo& inputInfo,
+                                        DataInfo& outputInfo,
+                                        const std::vector<StageMetaInfo> &blobMetaData,
+                                        const std::shared_ptr<MyriadConfig> &myriadConfig,
+                                        const Logger::Ptr &log,
+                                        const MyriadExecutorPtr &executor) :
+        InferRequestInternal(networkInputs, networkOutputs), _executor(executor),
+        _log(log), _stagesMetaData(blobMetaData), _config(myriadConfig),
+        _inputInfo(inputInfo), _outputInfo(outputInfo),
+        _graphDesc(graphDesc) {
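+    // Choose the layout the device expects: NCHW when HW optimization is on,
+    // NHWC otherwise; an explicit forceLayout setting overrides both.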
+    _deviceLayout = _config->compileConfig.hwOptimization ? NCHW : NHWC;
+    if (_config->compileConfig.forceLayout == ComputeLayout::NCHW)
+        _deviceLayout = NCHW;
+    if (_config->compileConfig.forceLayout == ComputeLayout::NHWC)
+        _deviceLayout = NHWC;
+    // allocate inputs
+    for (auto &networkInput : _networkInputs) {
+        // TODO: use TensorDesc instead of deprecated methods
+        SizeVector dims      = networkInput.second->getDims();
+        Precision  precision = networkInput.second->getInputPrecision();
+        Layout     layout    = networkInput.second->getTensorDesc().getLayout();
+
+        if (precision != Precision::FP32 &&
+            precision != Precision::FP16 &&
+            precision != Precision::U8) {
+            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported input precision: "
+                                   << precision << "! Supported precisions: FP32, FP16 and U8";
+        }
+        Blob::Ptr inputBlob = make_blob_with_precision(precision, layout, dims);
+
+        // allocate the input blob
+        // TODO: We are allocating a temporary input buffer of sufficient size. Wrap this buffer in blobs
+        inputBlob->allocate();
+        _inputs[networkInput.first] = inputBlob;
+    }
+    // allocate outputs
+    for (auto &networkOutput : _networkOutputs) {
+        SizeVector dims      = networkOutput.second->dims;
+        Precision  precision = networkOutput.second->getPrecision();
+        Layout     layout    = networkOutput.second->layout;
+
+        if (precision != Precision::FP32 &&
+            precision != Precision::FP16) {
+            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported output precision: "
+                                << precision << "! Supported precisions: FP32, FP16";
+        }
+        Blob::Ptr outputBlob = make_blob_with_precision(precision, layout, dims);
+        // allocate the output blob
+        outputBlob->allocate();
+        _outputs[networkOutput.first] = outputBlob;
+    }
+
+    inputBuffer .resize(inputInfo.totalSize);
+    resultBuffer.resize(outputInfo.totalSize);
+
+    if (_networkOutputs.empty() || _networkInputs.empty()) {
+        THROW_IE_EXCEPTION << "Internal error: no information about network's output/input";
+    }
+}
+
+void MyriadInferRequest::InferImpl() {
+    InferAsync();
+    GetResult();
+}
+
+void MyriadInferRequest::InferAsync() {
+    for (auto input : _inputs) {
+        auto const inputBlobPtr = input.second;
+        if (inputBlobPtr->precision() != Precision::FP16
+            && inputBlobPtr->precision() != Precision::FP32
+            && inputBlobPtr->precision() != Precision::U8)
+            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported input blob precision";
+    }
+    for (auto output : _outputs) {
+        auto const outputBlobPtr = output.second;
+        if (outputBlobPtr->precision() != Precision::FP16
+            && outputBlobPtr->precision() != Precision::FP32)
+            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported output blob precision";
+    }
+
+    // execute input pre-processing
+    execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
+
+    Blob::Ptr tmpBlob;
+
+    void* inputPtr = nullptr;
+    size_t inputSize = _inputInfo.totalSize;
+
+    if (_inputs.size() > 1) {
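+        // Gather all inputs into one contiguous buffer at the precomputed
+        // offsets, converting the layout on the fly where needed.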
+        for (auto&& input : _inputs) {
+            auto inputBlob = input.second;
+            size_t byteSize = inputBlob->byteSize();
+            Layout layout = inputBlob->getTensorDesc().getLayout();
+            if (layout != _deviceLayout && (layout == NCHW || layout == NHWC)) {
+                // TODO: copyBlob allocates new memory, but we already have an allocated buffer of sufficient size
+                inputBlob = copyBlob(inputBlob, _deviceLayout);
+            }
+
+            const auto input_offset_it = _inputInfo.offset.find(input.first);
+            if (input_offset_it != _inputInfo.offset.end()) {
+                size_t required_buff_size = checked_cast<size_t>(input_offset_it->second) + byteSize;
+                IE_ASSERT(required_buff_size <= inputBuffer.size());
+                MEMCPY(&inputBuffer[input_offset_it->second], inputBlob->buffer().as<uint8_t*>(), byteSize);
+            }
+        }
+
+        inputPtr = inputBuffer.data();
+    } else {
+        auto dataName = _networkInputs.begin()->first;
+        auto foundInputBlob = _inputs.find(dataName);
+        if (foundInputBlob == _inputs.end())
+            THROW_IE_EXCEPTION << "Error: input [" << dataName << "] is not provided.";
+
+        tmpBlob = foundInputBlob->second;
+        Layout layout = tmpBlob->getTensorDesc().getLayout();
+        if (layout != _deviceLayout && (layout == NCHW || layout == NHWC)) {
+            // TODO: copyBlob allocates new memory, but we already have an allocated buffer of sufficient size
+            tmpBlob = copyBlob(tmpBlob, _deviceLayout);
+        }
+
+        inputPtr = tmpBlob->buffer();
+    }
+
+    _executor->queueInference(_graphDesc, inputPtr, inputSize, nullptr, 0);
+}
+
+void MyriadInferRequest::GetResult() {
+    _executor->getResult(_graphDesc, resultBuffer.data(), resultBuffer.size());
+
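+    // Scatter the contiguous device output into the per-output blobs using
+    // the offsets recorded in the compiled graph.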
+    for (auto pp : _outputs) {
+        const auto offset_it = _outputInfo.offset.find(pp.first);
+
+        if (offset_it !=  _outputInfo.offset.end()) {
+            size_t resultOffset = checked_cast<size_t>(offset_it->second);
+            if (resultOffset > resultBuffer.size()) {
+                THROW_IE_EXCEPTION << "Unexpected result data size";
+            }
+
+            auto outputBlob = pp.second;
+            auto outDesc = outputBlob->getTensorDesc();
+
+            // TODO: TensorDesc doesn't update internal BlockingDesc and strides when setLayout is called
+            auto vpuLayout = (outDesc.getLayout() == NCHW || outDesc.getLayout() == NHWC) ? _deviceLayout : outDesc.getLayout();
+            ie::TensorDesc tempTensorDesc(outDesc.getPrecision(), outDesc.getDims(), vpuLayout);
+            auto tmpBlob = make_blob_with_precision(tempTensorDesc, resultBuffer.data() + resultOffset);
+
+            copyBlob(tmpBlob, outputBlob);
+        }
+    }
+}
+
+void MyriadInferRequest::GetPerformanceCounts(std::map<std::string, InferenceEngineProfileInfo> &perfMap) const {
+    auto perfInfo = _executor->getPerfTimeInfo(_graphDesc._graphHandle);
+
+    if (_log->level() >= LogLevel::Info) {
+        if (!perfInfo.empty()) {
+            _log->info("** Device execution time %f **", perfInfo[perfInfo.size() - 1]);
+        }
+    }
+
+    perfMap = vpu::parsePerformanceReport(
+        _stagesMetaData,
+        perfInfo.data(), perfInfo.size(),
+        _config->perfReport, _config->printReceiveTensorTime);
+}
diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_infer_request.h b/inference-engine/src/vpu/myriad_plugin/myriad_infer_request.h
new file mode 100644 (file)
index 0000000..b9e56e9
--- /dev/null
@@ -0,0 +1,60 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+#include <memory>
+
+#include <ie_common.h>
+#include <cpp_interfaces/impl/ie_infer_request_internal.hpp>
+#include <cpp_interfaces/impl/ie_executable_network_internal.hpp>
+
+#include <vpu/utils/logger.hpp>
+
+#include "myriad_executor.h"
+#include "myriad_config.h"
+
+namespace vpu {
+namespace MyriadPlugin {
+
+class MyriadInferRequest : public InferenceEngine::InferRequestInternal {
+    MyriadExecutorPtr _executor;
+    InferenceEngine::Layout _deviceLayout;
+    Logger::Ptr _log;
+    std::vector<StageMetaInfo> _stagesMetaData;
+    std::shared_ptr<MyriadConfig> _config;
+
+    const DataInfo _inputInfo;
+    const DataInfo _outputInfo;
+
+    GraphDesc _graphDesc;
+    std::vector<uint8_t> resultBuffer;
+    std::vector<uint8_t> inputBuffer;
+
+public:
+    typedef std::shared_ptr<MyriadInferRequest> Ptr;
+
+    explicit MyriadInferRequest(GraphDesc &_graphDesc,
+                                InferenceEngine::InputsDataMap networkInputs,
+                                InferenceEngine::OutputsDataMap networkOutputs,
+                                DataInfo& inputInfo,
+                                DataInfo& outputInfo,
+                                const std::vector<StageMetaInfo> &blobMetaData,
+                                const std::shared_ptr<MyriadConfig> &myriadConfig,
+                                const Logger::Ptr &log,
+                                const MyriadExecutorPtr &executor);
+
+    void InferImpl() override;
+    void InferAsync();
+    void GetResult();
+
+    void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) const override;
+};
+
+}  // namespace MyriadPlugin
+}  // namespace vpu
diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_plugin.cpp b/inference-engine/src/vpu/myriad_plugin/myriad_plugin.cpp
new file mode 100644 (file)
index 0000000..2395d7c
--- /dev/null
@@ -0,0 +1,115 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <memory>
+#include <vector>
+
+#include <inference_engine.hpp>
+#include <cpp_interfaces/base/ie_plugin_base.hpp>
+#include <cpp_interfaces/impl/ie_executable_network_internal.hpp>
+
+#include <vpu/vpu_plugin_config.hpp>
+#include <vpu/parsed_config.hpp>
+
+#include "myriad_plugin.h"
+
+using namespace InferenceEngine;
+using namespace vpu::MyriadPlugin;
+
+ExecutableNetworkInternal::Ptr Engine::LoadExeNetworkImpl(ICNNNetwork &network,
+                                                          const std::map<std::string, std::string> &config) {
+    if (network.getPrecision() != Precision::FP16 &&
+        network.getPrecision() != Precision::FP32) {
+        THROW_IE_EXCEPTION << "The plugin does not support networks with " << network.getPrecision() << " format.\n"
+                           << "Supported formats: FP32 and FP16.";
+    }
+
+    InputsDataMap networkInputs;
+    OutputsDataMap networkOutputs;
+
+    network.getInputsInfo(networkInputs);
+    network.getOutputsInfo(networkOutputs);
+
+    auto specifiedDevice = network.getTargetDevice();
+    auto supportedDevice = InferenceEngine::TargetDevice::eMYRIAD;
+    if (specifiedDevice != InferenceEngine::TargetDevice::eDefault && specifiedDevice != supportedDevice) {
+        THROW_IE_EXCEPTION << "The plugin doesn't support target device: " << getDeviceName(specifiedDevice) << ".\n" <<
+                           "Supported target device: " << getDeviceName(supportedDevice);
+    }
+
+    for (auto networkInput : networkInputs) {
+        auto input_precision = networkInput.second->getInputPrecision();
+
+        if (input_precision != Precision::FP16
+            && input_precision != Precision::FP32
+            && input_precision != Precision::U8) {
+            THROW_IE_EXCEPTION << "Input image format " << input_precision << " is not supported yet.\n"
+                               << "Supported formats: FP16, FP32 and U8.";
+        }
+    }
+
+    // override what was set globally for plugin, otherwise - override default config without touching config for plugin
+    auto configCopy = _config;
+    for (auto &&entry : config) {
+        configCopy[entry.first] = entry.second;
+    }
+
+    return std::make_shared<ExecutableNetwork>(network, _devicePool, configCopy);
+}
+
+void Engine::SetConfig(const std::map<std::string, std::string> &userConfig) {
+    MyriadConfig myriadConfig(userConfig);
+
+    for (auto &&entry : userConfig) {
+        _config[entry.first] = entry.second;
+    }
+}
+
+void Engine::QueryNetwork(const ICNNNetwork& network, QueryNetworkResult& res) const {
+    QueryNetwork(network, {}, res);
+}
+
+void Engine::QueryNetwork(const ICNNNetwork& network, const std::map<std::string, std::string>& config,
+                          QueryNetworkResult& res) const {
+    auto layerNames = getSupportedLayers(
+        network,
+        Platform::MYRIAD_2,
+        CompilationConfig(),
+        std::make_shared<Logger>("GraphCompiler", LogLevel::None, consoleOutput()));
+
+    res.supportedLayers.insert(layerNames.begin(), layerNames.end());
+}
+
+Engine::Engine() {
+    MyriadConfig config;
+    _config = config.getDefaultConfig();
+}
+
+// TODO: ImportNetwork and LoadNetwork handle the config parameter in different ways.
+// ImportNetwork gets a config provided by the user. LoadNetwork takes the plugin config and merges it with the user's config.
+// Need to find a common way to handle configs
+IExecutableNetwork::Ptr Engine::ImportNetwork(const std::string &modelFileName, const std::map<std::string, std::string> &config) {
+    std::ifstream blobFile(modelFileName, std::ios::binary);
+
+    if (!blobFile.is_open()) {
+        THROW_IE_EXCEPTION << details::as_status << NETWORK_NOT_READ;
+    }
+
+    IExecutableNetwork::Ptr executableNetwork;
+    // Use the config provided by the user, ignoring the default config
+    executableNetwork.reset(new ExecutableNetworkBase<ExecutableNetworkInternal>(
+                                std::make_shared<ExecutableNetwork>(modelFileName, _devicePool, config)), [](details::IRelease *p) {p->Release();});
+
+    return executableNetwork;
+}
+
+INFERENCE_PLUGIN_API(StatusCode) CreatePluginEngine(IInferencePlugin *&plugin, ResponseDesc *resp) noexcept {
+    try {
+        plugin = make_ie_compatible_plugin({1, 6, CI_BUILD_NUMBER, "myriadPlugin"}, std::make_shared<Engine>());
+        return OK;
+    }
+    catch (std::exception &ex) {
+        return DescriptionBuffer(GENERAL_ERROR, resp) << ex.what();
+    }
+}
diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_plugin.h b/inference-engine/src/vpu/myriad_plugin/myriad_plugin.h
new file mode 100644 (file)
index 0000000..499797c
--- /dev/null
@@ -0,0 +1,54 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "inference_engine.hpp"
+#include "description_buffer.hpp"
+#include "myriad_executable_network.h"
+#include <memory>
+#include <string>
+#include <vector>
+#include <map>
+#include <cpp_interfaces/impl/ie_plugin_internal.hpp>
+
+namespace vpu {
+namespace MyriadPlugin {
+
+class Engine : public InferenceEngine::InferencePluginInternal {
+public:
+    Engine();
+
+    InferenceEngine::ExecutableNetworkInternal::Ptr LoadExeNetworkImpl(InferenceEngine::ICNNNetwork &network,
+                                                                       const std::map<std::string, std::string> &config) override;
+
+    /**
+     * @brief The Myriad plugin runs reshape internally, so it needs a reshapable network
+     * @param network Network to keep reshapable
+     * @return The same network, with const layers left in place
+     */
+    InferenceEngine::ICNNNetwork&  RemoveConstLayers(InferenceEngine::ICNNNetwork &network) override {
+        return network;
+    }
+
+    void SetConfig(const std::map<std::string, std::string> &config) override;
+    /**
+     * @deprecated Use the version with the config parameter
+     */
+    void QueryNetwork(const InferenceEngine::ICNNNetwork& network, InferenceEngine::QueryNetworkResult& res) const override;
+    void QueryNetwork(const InferenceEngine::ICNNNetwork& network,
+                      const std::map<std::string, std::string>& config, InferenceEngine::QueryNetworkResult& res) const override;
+
+    InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName, const std::map<std::string, std::string> &config) override;
+
+    ~Engine() {
+        MyriadExecutor::closeDevices(_devicePool);
+    }
+
+private:
+    std::vector<DevicePtr> _devicePool;
+};
+
+}  // namespace MyriadPlugin
+}  // namespace vpu
index 9c1e197..b5a2deb 100644 (file)
@@ -45,12 +45,15 @@ if(WIN32)
     target_include_directories(${TARGET_NAME} PUBLIC "${IE_MAIN_SOURCE_DIR}/samples/common")
 endif()
 
-target_link_libraries(${TARGET_NAME} PUBLIC inference_engine ${PUGI})
+target_link_libraries(${TARGET_NAME} PUBLIC ${PUGI})
 
-target_include_directories(${TARGET_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}" "${gtest_SOURCE_DIR}/include"
-                                                 "${IE_MAIN_SOURCE_DIR}/src" "${IE_MAIN_SOURCE_DIR}/thirdparty/pugixml/src"
+target_include_directories(${TARGET_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}"
+                                                 $<TARGET_PROPERTY:inference_engine,INTERFACE_INCLUDE_DIRECTORIES>
+                                                 "${gtest_SOURCE_DIR}/include"
+                                                 "${IE_MAIN_SOURCE_DIR}/src"
+                                                 "${IE_MAIN_SOURCE_DIR}/thirdparty/pugixml/src"
                                                  "${gmock_SOURCE_DIR}/include"
-                                                                                                     PRIVATE "${CMAKE_CURRENT_BINARY_DIR}")
+                                         PRIVATE "${CMAKE_CURRENT_BINARY_DIR}")
 
 # TODO: eliminate dependency on samples
 target_include_directories(${TARGET_NAME} PUBLIC 
index 3679d29..90266d2 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright 2017-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include "ir_gen_helper.hpp"
index 6c3f3be..0dfa802 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright 2017-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #pragma once
index 88a8815..5a1397a 100644 (file)
@@ -1,7 +1,7 @@
 // Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-// dllmain.cpp : Defines the entry point for the DLL application.
+
 #ifdef _WIN32
 #define _WINSOCKAPI_
 #include <windows.h>
index 008e9a0..68c5fe7 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include <random>
index 9c0f539..d9fdaf9 100644 (file)
@@ -62,6 +62,26 @@ if (ENABLE_MKL_DNN)
     source_group("mkldnn" FILES ${MKLDNN_TESTS} ${MKLDNN_TESTS_INCLUDE})
 endif ()
 
+if (ENABLE_VPU)
+    # Disable MVNC Unit tests for PCIE
+    if (NOT ENABLE_MYX_PCIE)
+        file(GLOB VPU_MVNC_TESTS engines/vpu/mvnc/*cpp)
+    endif()
+    file(GLOB
+            VPU_TESTS
+            engines/vpu/*cpp
+            )
+    include_directories(
+            ${IE_MAIN_SOURCE_DIR}/thirdparty/movidius
+            ${IE_MAIN_SOURCE_DIR}/thirdparty/movidius/XLink/shared
+            ${IE_MAIN_SOURCE_DIR}/thirdparty/movidius/watchdog
+            ${IE_MAIN_SOURCE_DIR}/thirdparty/movidius/WinPthread
+    )
+
+    list(APPEND TEST_SRC ${VPU_TESTS})
+    source_group("vpu" FILES ${VPU_TESTS})
+endif()
+
 file(GLOB
         TEST_INCLUDE
         shape_infer/*.hpp)
@@ -88,6 +108,10 @@ set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME})
 target_compile_options(${TARGET_NAME} PRIVATE $<$<CXX_COMPILER_ID:Clang>: -Wno-inconsistent-missing-override >)
 target_compile_options(${TARGET_NAME} PRIVATE $<$<CXX_COMPILER_ID:AppleClang>: -Wno-inconsistent-missing-override >)
 
+if (ENABLE_VPU)
+    target_link_libraries(${TARGET_NAME} PRIVATE mvnc vpu_graph_transformer_test_static)
+endif ()
+
 target_link_libraries(${TARGET_NAME} PRIVATE
     gtest
     gtest_main
index 40e1595..d07fd5b 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index d912b26..00cf92f 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 3c2ba90..f332936 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 1905096..d5de764 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index c098bd6..f56ea15 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index a8e7bf5..58001b9 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index e636be9..9c308a3 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -92,7 +92,7 @@ TEST_F(DetectionOutputLayerBuilderTest, cannotCreateLayerWithWrongNMSThreshold)
     layer.setNumClasses(2);
     layer.setShareLocation(true);
     layer.setBackgroudLabelId(-1);
-    layer.setNMSThreshold(0);  // here
+    layer.setNMSThreshold(-0.02);  // here
     layer.setTopK(400);
     layer.setCodeType("caffe.PriorBoxParameter.CENTER_SIZE");
     layer.setVariantEncodedInTarget(false);
@@ -112,6 +112,6 @@ TEST_F(DetectionOutputLayerBuilderTest, cannotCreateLayerWithWrongConfidenceThre
     layer.setCodeType("caffe.PriorBoxParameter.CENTER_SIZE");
     layer.setVariantEncodedInTarget(false);
     layer.setKeepTopK(200);
-    layer.setConfidenceThreshold(0);  // here
+    layer.setConfidenceThreshold(-0.1);  // here
     ASSERT_THROW(network.addLayer(layer), InferenceEngine::details::InferenceEngineException);
-}
\ No newline at end of file
+}
index d85595a..780f57d 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -74,13 +74,13 @@ TEST_F(EltwiseLayerBuilderTest, cannotCreateLayerWithOneInputPort) {
     ASSERT_THROW(net.addLayer(layer), InferenceEngine::details::InferenceEngineException);
 }
 
-TEST_F(EltwiseLayerBuilderTest, cannotCreateLayerWithThreeInputPort) {
+TEST_F(EltwiseLayerBuilderTest, canCreateLayerWithThreeInputPort) {
     Builder::Network net("network");
     Builder::EltwiseLayer layer("Eltwise layer");
 
     layer.setInputPorts({Port({1, 2, 3, 4}), Port({1, 2, 3, 4}), Port({1, 2, 3, 4})});   // here
     layer.setOutputPort(Port({1, 2, 3, 4}));
-    ASSERT_THROW(net.addLayer(layer), InferenceEngine::details::InferenceEngineException);
+    ASSERT_NO_THROW(net.addLayer(layer));
 }
 
 TEST_F(EltwiseLayerBuilderTest, cannotCreateLayerWithDifferentInputPorts) {
index 4ddbda3..5943b82 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 01cf448..5b24520 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 72f2581..8c8996d 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 809f2b1..fdd7281 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index dc2b91b..a77d62f 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index a0e9340..00cab75 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index a05a5d9..e7d437a 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 0e37aa5..4258ff8 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 2ae9968..ebfce47 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index d06687e..fc9b808 100644 (file)
@@ -1,34 +1,7 @@
-/*
-* INTEL CONFIDENTIAL
-* Copyright (C) 2018-2019 Intel Corporation.
-*
-* The source code contained or described herein and all documents
-* related to the source code ("Material") are owned by Intel Corporation
-* or its suppliers or licensors. Title to the Material remains with
-* Intel Corporation or its suppliers and licensors. The Material may
-* contain trade secrets and proprietary and confidential information
-* of Intel Corporation and its suppliers and licensors, and is protected
-* by worldwide copyright and trade secret laws and treaty provisions.
-* No part of the Material may be used, copied, reproduced, modified,
-* published, uploaded, posted, transmitted, distributed, or disclosed
-* in any way without Intel's prior express written permission.
-*
-* No license under any patent, copyright, trade secret or other
-* intellectual property right is granted to or conferred upon you by
-* disclosure or delivery of the Materials, either expressly, by implication,
-* inducement, estoppel or otherwise. Any license under such intellectual
-* property rights must be express and approved by Intel in writing.
-*
-* Include any supplier copyright notices as supplier requires Intel to use.
-*
-* Include supplier trademarks or logos as supplier requires Intel to use,
-* preceded by an asterisk. An asterisked footnote can be added as follows:
-* *Third Party trademarks are the property of their respective owners.
-*
-* Unless otherwise agreed by Intel in writing, you may not remove or alter
-* this notice or any other notice embedded in Materials by Intel or Intel's
-* suppliers or licensors in any way.
-*/
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
 #include <gtest/gtest.h>
 #include <xml_net_builder.hpp>
 #include <inference_engine/cnn_network_impl.hpp>
index 2de6472..af283c0 100644 (file)
@@ -1,36 +1,8 @@
-#include <utility>
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
-/*
-* INTEL CONFIDENTIAL
-* Copyright (C) 2018-2019 Intel Corporation.
-*
-* The source code contained or described herein and all documents
-* related to the source code ("Material") are owned by Intel Corporation
-* or its suppliers or licensors. Title to the Material remains with
-* Intel Corporation or its suppliers and licensors. The Material may
-* contain trade secrets and proprietary and confidential information
-* of Intel Corporation and its suppliers and licensors, and is protected
-* by worldwide copyright and trade secret laws and treaty provisions.
-* No part of the Material may be used, copied, reproduced, modified,
-* published, uploaded, posted, transmitted, distributed, or disclosed
-* in any way without Intel's prior express written permission.
-*
-* No license under any patent, copyright, trade secret or other
-* intellectual property right is granted to or conferred upon you by
-* disclosure or delivery of the Materials, either expressly, by implication,
-* inducement, estoppel or otherwise. Any license under such intellectual
-* property rights must be express and approved by Intel in writing.
-*
-* Include any supplier copyright notices as supplier requires Intel to use.
-*
-* Include supplier trademarks or logos as supplier requires Intel to use,
-* preceded by an asterisk. An asterisked footnote can be added as follows:
-* *Third Party trademarks are the property of their respective owners.
-*
-* Unless otherwise agreed by Intel in writing, you may not remove or alter
-* this notice or any other notice embedded in Materials by Intel or Intel's
-* suppliers or licensors in any way.
-*/
+#include <utility>
 
 #include <gtest/gtest.h>
 #include <tests_common.hpp>
index 45420d6..4514225 100644 (file)
@@ -1,34 +1,6 @@
-/*
-* INTEL CONFIDENTIAL
-* Copyright (C) 2018-2019 Intel Corporation.
-*
-* The source code contained or described herein and all documents
-* related to the source code ("Material") are owned by Intel Corporation
-* or its suppliers or licensors. Title to the Material remains with
-* Intel Corporation or its suppliers and licensors. The Material may
-* contain trade secrets and proprietary and confidential information
-* of Intel Corporation and its suppliers and licensors, and is protected
-* by worldwide copyright and trade secret laws and treaty provisions.
-* No part of the Material may be used, copied, reproduced, modified,
-* published, uploaded, posted, transmitted, distributed, or disclosed
-* in any way without Intel's prior express written permission.
-*
-* No license under any patent, copyright, trade secret or other
-* intellectual property right is granted to or conferred upon you by
-* disclosure or delivery of the Materials, either expressly, by implication,
-* inducement, estoppel or otherwise. Any license under such intellectual
-* property rights must be express and approved by Intel in writing.
-*
-* Include any supplier copyright notices as supplier requires Intel to use.
-*
-* Include supplier trademarks or logos as supplier requires Intel to use,
-* preceded by an asterisk. An asterisked footnote can be added as follows:
-* *Third Party trademarks are the property of their respective owners.
-*
-* Unless otherwise agreed by Intel in writing, you may not remove or alter
-* this notice or any other notice embedded in Materials by Intel or Intel's
-* suppliers or licensors in any way.
-*/
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
 #include <iostream>
 #include <map>
index 87198f6..a020769 100644
@@ -1,34 +1,6 @@
-/*
-* INTEL CONFIDENTIAL
-* Copyright (C) 2018-2019 Intel Corporation.
-*
-* The source code contained or described herein and all documents
-* related to the source code ("Material") are owned by Intel Corporation
-* or its suppliers or licensors. Title to the Material remains with
-* Intel Corporation or its suppliers and licensors. The Material may
-* contain trade secrets and proprietary and confidential information
-* of Intel Corporation and its suppliers and licensors, and is protected
-* by worldwide copyright and trade secret laws and treaty provisions.
-* No part of the Material may be used, copied, reproduced, modified,
-* published, uploaded, posted, transmitted, distributed, or disclosed
-* in any way without Intel's prior express written permission.
-*
-* No license under any patent, copyright, trade secret or other
-* intellectual property right is granted to or conferred upon you by
-* disclosure or delivery of the Materials, either expressly, by implication,
-* inducement, estoppel or otherwise. Any license under such intellectual
-* property rights must be express and approved by Intel in writing.
-*
-* Include any supplier copyright notices as supplier requires Intel to use.
-*
-* Include supplier trademarks or logos as supplier requires Intel to use,
-* preceded by an asterisk. An asterisked footnote can be added as follows:
-* *Third Party trademarks are the property of their respective owners.
-*
-* Unless otherwise agreed by Intel in writing, you may not remove or alter
-* this notice or any other notice embedded in Materials by Intel or Intel's
-* suppliers or licensors in any way.
-*/
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
 #ifndef SHAPES_H
 #define SHAPES_H
index cb6e800..5f82bcc 100644
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include <vector>
index 70229c6..37fdd4d 100644
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include <vector>
index faf574e..c8a5345 100644
@@ -1,19 +1,6 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
-//
-
 
 #include <vector>
 #include <gtest/gtest.h>
index d83c1c3..8d21d59 100644
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include "gna_plugin/gna_allocator.hpp"
index 2dfd288..963ab78 100644
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 0223fc0..24ddf8d 100644
@@ -1,24 +1,6 @@
-//*****************************************************************************
-//
-// INTEL CONFIDENTIAL
 // Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
-// The source code contained or described herein and all documents related
-// to the source code ("Material") are owned by Intel Corporation or its suppliers
-// or licensors. Title to the Material remains with Intel Corporation or its suppliers
-// and licensors. The Material contains trade secrets and proprietary
-// and confidential information of Intel or its suppliers and licensors.
-// The Material is protected by worldwide copyright and trade secret laws and treaty
-// provisions. No part of the Material may be used, copied, reproduced, modified,
-// published, uploaded, posted, transmitted, distributed, or disclosed in any way
-// without Intel's prior express written permission.
-//
-// No license under any patent, copyright, trade secret or other intellectual
-// property right is granted to or conferred upon you by disclosure or delivery
-// of the Materials, either expressly, by implication, inducement, estoppel
-// or otherwise. Any license under such intellectual property rights must
-// be express and approved by Intel in writing.
-//*****************************************************************************
 
 #define INTEL_GNA_DLLEXPORT 1
 #include <gna-api.h>
index de937d2..3ff8aa5 100644
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 0add255..dbf883a 100644
@@ -1,35 +1,6 @@
-/*
- * INTEL CONFIDENTIAL
- * Copyright (C) 2018-2019 Intel Corporation.
- *
- * The source code contained or described herein and all documents
- * related to the source code ("Material") are owned by Intel Corporation
- * or its suppliers or licensors. Title to the Material remains with
- * Intel Corporation or its suppliers and licensors. The Material may
- * contain trade secrets and proprietary and confidential information
- * of Intel Corporation and its suppliers and licensors, and is protected
- * by worldwide copyright and trade secret laws and treaty provisions.
- * No part of the Material may be used, copied, reproduced, modified,
- * published, uploaded, posted, transmitted, distributed, or disclosed
- * in any way without Intel's prior express written permission.
- *
- * No license under any patent, copyright, trade secret or other
- * intellectual property right is granted to or conferred upon you by
- * disclosure or delivery of the Materials, either expressly, by implication,
- * inducement, estoppel or otherwise. Any license under such intellectual
- * property rights must be express and approved by Intel in writing.
- *
- * Include any supplier copyright notices as supplier requires Intel to use.
- *
- * Include supplier trademarks or logos as supplier requires Intel to use,
- * preceded by an asterisk. An asterisked footnote can be added as follows:
- * *Third Party trademarks are the property of their respective owners.
- *
- * Unless otherwise agreed by Intel in writing, you may not remove or alter
- * this notice or any other notice embedded in Materials by Intel or Intel's
- * suppliers or licensors in any way.
- */
-
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
 #include <vector>
 #include <gtest/gtest.h>
index c9f4bce..3b91a9e 100644
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include <vector>
index d776c03..716306b 100644
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include <gtest/gtest.h>
index 016ae35..e453748 100644
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include <mock_icnn_network.hpp>
index cd3680c..707ccdf 100644
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #pragma once
index 3c46c50..52f82c0 100644
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include <vector>
index 20a60c7..a6e98a0 100644
@@ -1,17 +1,5 @@
-//
-// Copyright 2017-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #pragma once
index 7373c98..2ebf521 100644
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include <vector>
index 865649f..2469ed2 100644
@@ -1,35 +1,6 @@
-/*
- * INTEL CONFIDENTIAL
- * Copyright (C) 2018-2019 Intel Corporation.
- *
- * The source code contained or described herein and all documents
- * related to the source code ("Material") are owned by Intel Corporation
- * or its suppliers or licensors. Title to the Material remains with
- * Intel Corporation or its suppliers and licensors. The Material may
- * contain trade secrets and proprietary and confidential information
- * of Intel Corporation and its suppliers and licensors, and is protected
- * by worldwide copyright and trade secret laws and treaty provisions.
- * No part of the Material may be used, copied, reproduced, modified,
- * published, uploaded, posted, transmitted, distributed, or disclosed
- * in any way without Intel's prior express written permission.
- *
- * No license under any patent, copyright, trade secret or other
- * intellectual property right is granted to or conferred upon you by
- * disclosure or delivery of the Materials, either expressly, by implication,
- * inducement, estoppel or otherwise. Any license under such intellectual
- * property rights must be express and approved by Intel in writing.
- *
- * Include any supplier copyright notices as supplier requires Intel to use.
- *
- * Include supplier trademarks or logos as supplier requires Intel to use,
- * preceded by an asterisk. An asterisked footnote can be added as follows:
- * *Third Party trademarks are the property of their respective owners.
- *
- * Unless otherwise agreed by Intel in writing, you may not remove or alter
- * this notice or any other notice embedded in Materials by Intel or Intel's
- * suppliers or licensors in any way.
- */
-
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
 #include <vector>
 #include <gtest/gtest.h>
index 27725d6..57b0394 100644
@@ -1,35 +1,6 @@
-/*
- * INTEL CONFIDENTIAL
- * Copyright (C) 2018-2019 Intel Corporation.
- *
- * The source code contained or described herein and all documents
- * related to the source code ("Material") are owned by Intel Corporation
- * or its suppliers or licensors. Title to the Material remains with
- * Intel Corporation or its suppliers and licensors. The Material may
- * contain trade secrets and proprietary and confidential information
- * of Intel Corporation and its suppliers and licensors, and is protected
- * by worldwide copyright and trade secret laws and treaty provisions.
- * No part of the Material may be used, copied, reproduced, modified,
- * published, uploaded, posted, transmitted, distributed, or disclosed
- * in any way without Intel's prior express written permission.
- *
- * No license under any patent, copyright, trade secret or other
- * intellectual property right is granted to or conferred upon you by
- * disclosure or delivery of the Materials, either expressly, by implication,
- * inducement, estoppel or otherwise. Any license under such intellectual
- * property rights must be express and approved by Intel in writing.
- *
- * Include any supplier copyright notices as supplier requires Intel to use.
- *
- * Include supplier trademarks or logos as supplier requires Intel to use,
- * preceded by an asterisk. An asterisked footnote can be added as follows:
- * *Third Party trademarks are the property of their respective owners.
- *
- * Unless otherwise agreed by Intel in writing, you may not remove or alter
- * this notice or any other notice embedded in Materials by Intel or Intel's
- * suppliers or licensors in any way.
- */
-
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
 #include <vector>
 #include <gtest/gtest.h>
index cf42599..9b9cf52 100644
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include <vector>
index db64350..83aa061 100644
@@ -1,34 +1,6 @@
-/*
- * INTEL CONFIDENTIAL
- * Copyright (C) 2018-2019 Intel Corporation.
- *
- * The source code contained or described herein and all documents
- * related to the source code ("Material") are owned by Intel Corporation
- * or its suppliers or licensors. Title to the Material remains with
- * Intel Corporation or its suppliers and licensors. The Material may
- * contain trade secrets and proprietary and confidential information
- * of Intel Corporation and its suppliers and licensors, and is protected
- * by worldwide copyright and trade secret laws and treaty provisions.
- * No part of the Material may be used, copied, reproduced, modified,
- * published, uploaded, posted, transmitted, distributed, or disclosed
- * in any way without Intel's prior express written permission.
- *
- * No license under any patent, copyright, trade secret or other
- * intellectual property right is granted to or conferred upon you by
- * disclosure or delivery of the Materials, either expressly, by implication,
- * inducement, estoppel or otherwise. Any license under such intellectual
- * property rights must be express and approved by Intel in writing.
- *
- * Include any supplier copyright notices as supplier requires Intel to use.
- *
- * Include supplier trademarks or logos as supplier requires Intel to use,
- * preceded by an asterisk. An asterisked footnote can be added as follows:
- * *Third Party trademarks are the property of their respective owners.
- *
- * Unless otherwise agreed by Intel in writing, you may not remove or alter
- * this notice or any other notice embedded in Materials by Intel or Intel's
- * suppliers or licensors in any way.
- */
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
 #pragma once
 
index 4c32f33..72beaff 100644
@@ -1,34 +1,6 @@
-/*
- * INTEL CONFIDENTIAL
- * Copyright (C) 2018-2019 Intel Corporation.
- *
- * The source code contained or described herein and all documents
- * related to the source code ("Material") are owned by Intel Corporation
- * or its suppliers or licensors. Title to the Material remains with
- * Intel Corporation or its suppliers and licensors. The Material may
- * contain trade secrets and proprietary and confidential information
- * of Intel Corporation and its suppliers and licensors, and is protected
- * by worldwide copyright and trade secret laws and treaty provisions.
- * No part of the Material may be used, copied, reproduced, modified,
- * published, uploaded, posted, transmitted, distributed, or disclosed
- * in any way without Intel's prior express written permission.
- *
- * No license under any patent, copyright, trade secret or other
- * intellectual property right is granted to or conferred upon you by
- * disclosure or delivery of the Materials, either expressly, by implication,
- * inducement, estoppel or otherwise. Any license under such intellectual
- * property rights must be express and approved by Intel in writing.
- *
- * Include any supplier copyright notices as supplier requires Intel to use.
- *
- * Include supplier trademarks or logos as supplier requires Intel to use,
- * preceded by an asterisk. An asterisked footnote can be added as follows:
- * *Third Party trademarks are the property of their respective owners.
- *
- * Unless otherwise agreed by Intel in writing, you may not remove or alter
- * this notice or any other notice embedded in Materials by Intel or Intel's
- * suppliers or licensors in any way.
- */
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
 #pragma once
 #include "nnet_base_matcher.hpp"
index e2bb023..3c95f54 100644
@@ -1,34 +1,6 @@
-/*
- * INTEL CONFIDENTIAL
- * Copyright (C) 2018-2019 Intel Corporation.
- *
- * The source code contained or described herein and all documents
- * related to the source code ("Material") are owned by Intel Corporation
- * or its suppliers or licensors. Title to the Material remains with
- * Intel Corporation or its suppliers and licensors. The Material may
- * contain trade secrets and proprietary and confidential information
- * of Intel Corporation and its suppliers and licensors, and is protected
- * by worldwide copyright and trade secret laws and treaty provisions.
- * No part of the Material may be used, copied, reproduced, modified,
- * published, uploaded, posted, transmitted, distributed, or disclosed
- * in any way without Intel's prior express written permission.
- *
- * No license under any patent, copyright, trade secret or other
- * intellectual property right is granted to or conferred upon you by
- * disclosure or delivery of the Materials, either expressly, by implication,
- * inducement, estoppel or otherwise. Any license under such intellectual
- * property rights must be express and approved by Intel in writing.
- *
- * Include any supplier copyright notices as supplier requires Intel to use.
- *
- * Include supplier trademarks or logos as supplier requires Intel to use,
- * preceded by an asterisk. An asterisked footnote can be added as follows:
- * *Third Party trademarks are the property of their respective owners.
- *
- * Unless otherwise agreed by Intel in writing, you may not remove or alter
- * this notice or any other notice embedded in Materials by Intel or Intel's
- * suppliers or licensors in any way.
- */
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
 #pragma once
 #include"gna-api.h"
index d46ab30..caac874 100644
@@ -1,36 +1,8 @@
-/*
- * INTEL CONFIDENTIAL
- * Copyright (C) 2018-2019 Intel Corporation.
- *
- * The source code contained or described herein and all documents
- * related to the source code ("Material") are owned by Intel Corporation
- * or its suppliers or licensors. Title to the Material remains with
- * Intel Corporation or its suppliers and licensors. The Material may
- * contain trade secrets and proprietary and confidential information
- * of Intel Corporation and its suppliers and licensors, and is protected
- * by worldwide copyright and trade secret laws and treaty provisions.
- * No part of the Material may be used, copied, reproduced, modified,
- * published, uploaded, posted, transmitted, distributed, or disclosed
- * in any way without Intel's prior express written permission.
- *
- * No license under any patent, copyright, trade secret or other
- * intellectual property right is granted to or conferred upon you by
- * disclosure or delivery of the Materials, either expressly, by implication,
- * inducement, estoppel or otherwise. Any license under such intellectual
- * property rights must be express and approved by Intel in writing.
- *
- * Include any supplier copyright notices as supplier requires Intel to use.
- *
- * Include supplier trademarks or logos as supplier requires Intel to use,
- * preceded by an asterisk. An asterisked footnote can be added as follows:
- * *Third Party trademarks are the property of their respective owners.
- *
- * Unless otherwise agreed by Intel in writing, you may not remove or alter
- * this notice or any other notice embedded in Materials by Intel or Intel's
- * suppliers or licensors in any way.
- */
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
- #pragma once
+#pragma once
 
 
 class OutputFiller : public ::testing::MatcherInterface<const intel_nnet_type_t*> {
index f45f9ee..5c6169d 100644
@@ -1,37 +1,8 @@
-#include <utility>
-
-/*
- * INTEL CONFIDENTIAL
- * Copyright (C) 2018-2019 Intel Corporation.
- *
- * The source code contained or described herein and all documents
- * related to the source code ("Material") are owned by Intel Corporation
- * or its suppliers or licensors. Title to the Material remains with
- * Intel Corporation or its suppliers and licensors. The Material may
- * contain trade secrets and proprietary and confidential information
- * of Intel Corporation and its suppliers and licensors, and is protected
- * by worldwide copyright and trade secret laws and treaty provisions.
- * No part of the Material may be used, copied, reproduced, modified,
- * published, uploaded, posted, transmitted, distributed, or disclosed
- * in any way without Intel's prior express written permission.
- *
- * No license under any patent, copyright, trade secret or other
- * intellectual property right is granted to or conferred upon you by
- * disclosure or delivery of the Materials, either expressly, by implication,
- * inducement, estoppel or otherwise. Any license under such intellectual
- * property rights must be express and approved by Intel in writing.
- *
- * Include any supplier copyright notices as supplier requires Intel to use.
- *
- * Include supplier trademarks or logos as supplier requires Intel to use,
- * preceded by an asterisk. An asterisked footnote can be added as follows:
- * *Third Party trademarks are the property of their respective owners.
- *
- * Unless otherwise agreed by Intel in writing, you may not remove or alter
- * this notice or any other notice embedded in Materials by Intel or Intel's
- * suppliers or licensors in any way.
- */
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
+#include <utility>
 #pragma once
 
 #include <gmock/gmock-matchers.h>
index 267777c..7f15023 100644
@@ -1,34 +1,6 @@
-/*
- * INTEL CONFIDENTIAL
- * Copyright (C) 2018-2019 Intel Corporation.
- *
- * The source code contained or described herein and all documents
- * related to the source code ("Material") are owned by Intel Corporation
- * or its suppliers or licensors. Title to the Material remains with
- * Intel Corporation or its suppliers and licensors. The Material may
- * contain trade secrets and proprietary and confidential information
- * of Intel Corporation and its suppliers and licensors, and is protected
- * by worldwide copyright and trade secret laws and treaty provisions.
- * No part of the Material may be used, copied, reproduced, modified,
- * published, uploaded, posted, transmitted, distributed, or disclosed
- * in any way without Intel's prior express written permission.
- *
- * No license under any patent, copyright, trade secret or other
- * intellectual property right is granted to or conferred upon you by
- * disclosure or delivery of the Materials, either expressly, by implication,
- * inducement, estoppel or otherwise. Any license under such intellectual
- * property rights must be express and approved by Intel in writing.
- *
- * Include any supplier copyright notices as supplier requires Intel to use.
- *
- * Include supplier trademarks or logos as supplier requires Intel to use,
- * preceded by an asterisk. An asterisked footnote can be added as follows:
- * *Third Party trademarks are the property of their respective owners.
- *
- * Unless otherwise agreed by Intel in writing, you may not remove or alter
- * this notice or any other notice embedded in Materials by Intel or Intel's
- * suppliers or licensors in any way.
- */
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
 #pragma once
 
index e9b6ae9..c932063 100644
@@ -1,34 +1,6 @@
-/*
- * INTEL CONFIDENTIAL
- * Copyright (C) 2018-2019 Intel Corporation.
- *
- * The source code contained or described herein and all documents
- * related to the source code ("Material") are owned by Intel Corporation
- * or its suppliers or licensors. Title to the Material remains with
- * Intel Corporation or its suppliers and licensors. The Material may
- * contain trade secrets and proprietary and confidential information
- * of Intel Corporation and its suppliers and licensors, and is protected
- * by worldwide copyright and trade secret laws and treaty provisions.
- * No part of the Material may be used, copied, reproduced, modified,
- * published, uploaded, posted, transmitted, distributed, or disclosed
- * in any way without Intel's prior express written permission.
- *
- * No license under any patent, copyright, trade secret or other
- * intellectual property right is granted to or conferred upon you by
- * disclosure or delivery of the Materials, either expressly, by implication,
- * inducement, estoppel or otherwise. Any license under such intellectual
- * property rights must be express and approved by Intel in writing.
- *
- * Include any supplier copyright notices as supplier requires Intel to use.
- *
- * Include supplier trademarks or logos as supplier requires Intel to use,
- * preceded by an asterisk. An asterisked footnote can be added as follows:
- * *Third Party trademarks are the property of their respective owners.
- *
- * Unless otherwise agreed by Intel in writing, you may not remove or alter
- * this notice or any other notice embedded in Materials by Intel or Intel's
- * suppliers or licensors in any way.
- */
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
 #pragma once
 
index 1d04fad..c4aca1d 100644
@@ -1,34 +1,6 @@
-/*
- * INTEL CONFIDENTIAL
- * Copyright (C) 2018-2019 Intel Corporation.
- *
- * The source code contained or described herein and all documents
- * related to the source code ("Material") are owned by Intel Corporation
- * or its suppliers or licensors. Title to the Material remains with
- * Intel Corporation or its suppliers and licensors. The Material may
- * contain trade secrets and proprietary and confidential information
- * of Intel Corporation and its suppliers and licensors, and is protected
- * by worldwide copyright and trade secret laws and treaty provisions.
- * No part of the Material may be used, copied, reproduced, modified,
- * published, uploaded, posted, transmitted, distributed, or disclosed
- * in any way without Intel's prior express written permission.
- *
- * No license under any patent, copyright, trade secret or other
- * intellectual property right is granted to or conferred upon you by
- * disclosure or delivery of the Materials, either expressly, by implication,
- * inducement, estoppel or otherwise. Any license under such intellectual
- * property rights must be express and approved by Intel in writing.
- *
- * Include any supplier copyright notices as supplier requires Intel to use.
- *
- * Include supplier trademarks or logos as supplier requires Intel to use,
- * preceded by an asterisk. An asterisked footnote can be added as follows:
- * *Third Party trademarks are the property of their respective owners.
- *
- * Unless otherwise agreed by Intel in writing, you may not remove or alter
- * this notice or any other notice embedded in Materials by Intel or Intel's
- * suppliers or licensors in any way.
- */
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
 #pragma once
 #include "nnet_base_matcher.hpp"
index 1efba3c..d8064a7 100644
@@ -1,34 +1,6 @@
-/*
- * INTEL CONFIDENTIAL
- * Copyright (C) 2018-2019 Intel Corporation.
- *
- * The source code contained or described herein and all documents
- * related to the source code ("Material") are owned by Intel Corporation
- * or its suppliers or licensors. Title to the Material remains with
- * Intel Corporation or its suppliers and licensors. The Material may
- * contain trade secrets and proprietary and confidential information
- * of Intel Corporation and its suppliers and licensors, and is protected
- * by worldwide copyright and trade secret laws and treaty provisions.
- * No part of the Material may be used, copied, reproduced, modified,
- * published, uploaded, posted, transmitted, distributed, or disclosed
- * in any way without Intel's prior express written permission.
- *
- * No license under any patent, copyright, trade secret or other
- * intellectual property right is granted to or conferred upon you by
- * disclosure or delivery of the Materials, either expressly, by implication,
- * inducement, estoppel or otherwise. Any license under such intellectual
- * property rights must be express and approved by Intel in writing.
- *
- * Include any supplier copyright notices as supplier requires Intel to use.
- *
- * Include supplier trademarks or logos as supplier requires Intel to use,
- * preceded by an asterisk. An asterisked footnote can be added as follows:
- * *Third Party trademarks are the property of their respective owners.
- *
- * Unless otherwise agreed by Intel in writing, you may not remove or alter
- * this notice or any other notice embedded in Materials by Intel or Intel's
- * suppliers or licensors in any way.
- */
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
 #pragma once
 #include "nnet_base_matcher.hpp"
index c55cad8..3f0fa92 100644
@@ -1,34 +1,6 @@
-/*
- * INTEL CONFIDENTIAL
- * Copyright (C) 2018-2019 Intel Corporation.
- *
- * The source code contained or described herein and all documents
- * related to the source code ("Material") are owned by Intel Corporation
- * or its suppliers or licensors. Title to the Material remains with
- * Intel Corporation or its suppliers and licensors. The Material may
- * contain trade secrets and proprietary and confidential information
- * of Intel Corporation and its suppliers and licensors, and is protected
- * by worldwide copyright and trade secret laws and treaty provisions.
- * No part of the Material may be used, copied, reproduced, modified,
- * published, uploaded, posted, transmitted, distributed, or disclosed
- * in any way without Intel's prior express written permission.
- *
- * No license under any patent, copyright, trade secret or other
- * intellectual property right is granted to or conferred upon you by
- * disclosure or delivery of the Materials, either expressly, by implication,
- * inducement, estoppel or otherwise. Any license under such intellectual
- * property rights must be express and approved by Intel in writing.
- *
- * Include any supplier copyright notices as supplier requires Intel to use.
- *
- * Include supplier trademarks or logos as supplier requires Intel to use,
- * preceded by an asterisk. An asterisked footnote can be added as follows:
- * *Third Party trademarks are the property of their respective owners.
- *
- * Unless otherwise agreed by Intel in writing, you may not remove or alter
- * this notice or any other notice embedded in Materials by Intel or Intel's
- * suppliers or licensors in any way.
- */
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
 #pragma once
 #include <cmath>
index 3c50f85..356c68b 100644
@@ -1,34 +1,6 @@
-/*
- * INTEL CONFIDENTIAL
- * Copyright (C) 2018-2019 Intel Corporation.
- *
- * The source code contained or described herein and all documents
- * related to the source code ("Material") are owned by Intel Corporation
- * or its suppliers or licensors. Title to the Material remains with
- * Intel Corporation or its suppliers and licensors. The Material may
- * contain trade secrets and proprietary and confidential information
- * of Intel Corporation and its suppliers and licensors, and is protected
- * by worldwide copyright and trade secret laws and treaty provisions.
- * No part of the Material may be used, copied, reproduced, modified,
- * published, uploaded, posted, transmitted, distributed, or disclosed
- * in any way without Intel's prior express written permission.
- *
- * No license under any patent, copyright, trade secret or other
- * intellectual property right is granted to or conferred upon you by
- * disclosure or delivery of the Materials, either expressly, by implication,
- * inducement, estoppel or otherwise. Any license under such intellectual
- * property rights must be express and approved by Intel in writing.
- *
- * Include any supplier copyright notices as supplier requires Intel to use.
- *
- * Include supplier trademarks or logos as supplier requires Intel to use,
- * preceded by an asterisk. An asterisked footnote can be added as follows:
- * *Third Party trademarks are the property of their respective owners.
- *
- * Unless otherwise agreed by Intel in writing, you may not remove or alter
- * this notice or any other notice embedded in Materials by Intel or Intel's
- * suppliers or licensors in any way.
- */
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
 #pragma once
 #include"gna-api.h"
index 0ab9a07..eb7ce28 100644
@@ -1,34 +1,6 @@
-/*
- * INTEL CONFIDENTIAL
- * Copyright (C) 2018-2019 Intel Corporation.
- *
- * The source code contained or described herein and all documents
- * related to the source code ("Material") are owned by Intel Corporation
- * or its suppliers or licensors. Title to the Material remains with
- * Intel Corporation or its suppliers and licensors. The Material may
- * contain trade secrets and proprietary and confidential information
- * of Intel Corporation and its suppliers and licensors, and is protected
- * by worldwide copyright and trade secret laws and treaty provisions.
- * No part of the Material may be used, copied, reproduced, modified,
- * published, uploaded, posted, transmitted, distributed, or disclosed
- * in any way without Intel's prior express written permission.
- *
- * No license under any patent, copyright, trade secret or other
- * intellectual property right is granted to or conferred upon you by
- * disclosure or delivery of the Materials, either expressly, by implication,
- * inducement, estoppel or otherwise. Any license under such intellectual
- * property rights must be express and approved by Intel in writing.
- *
- * Include any supplier copyright notices as supplier requires Intel to use.
- *
- * Include supplier trademarks or logos as supplier requires Intel to use,
- * preceded by an asterisk. An asterisked footnote can be added as follows:
- * *Third Party trademarks are the property of their respective owners.
- *
- * Unless otherwise agreed by Intel in writing, you may not remove or alter
- * this notice or any other notice embedded in Materials by Intel or Intel's
- * suppliers or licensors in any way.
- */
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
 #include "test_irs.hpp"
 
index c0194dc..1019b33 100644
@@ -1,34 +1,6 @@
-/*
- * INTEL CONFIDENTIAL
- * Copyright (C) 2018-2019 Intel Corporation.
- *
- * The source code contained or described herein and all documents
- * related to the source code ("Material") are owned by Intel Corporation
- * or its suppliers or licensors. Title to the Material remains with
- * Intel Corporation or its suppliers and licensors. The Material may
- * contain trade secrets and proprietary and confidential information
- * of Intel Corporation and its suppliers and licensors, and is protected
- * by worldwide copyright and trade secret laws and treaty provisions.
- * No part of the Material may be used, copied, reproduced, modified,
- * published, uploaded, posted, transmitted, distributed, or disclosed
- * in any way without Intel's prior express written permission.
- *
- * No license under any patent, copyright, trade secret or other
- * intellectual property right is granted to or conferred upon you by
- * disclosure or delivery of the Materials, either expressly, by implication,
- * inducement, estoppel or otherwise. Any license under such intellectual
- * property rights must be express and approved by Intel in writing.
- *
- * Include any supplier copyright notices as supplier requires Intel to use.
- *
- * Include supplier trademarks or logos as supplier requires Intel to use,
- * preceded by an asterisk. An asterisked footnote can be added as follows:
- * *Third Party trademarks are the property of their respective owners.
- *
- * Unless otherwise agreed by Intel in writing, you may not remove or alter
- * this notice or any other notice embedded in Materials by Intel or Intel's
- * suppliers or licensors in any way.
- */
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
 
 #pragma once
 
index 25ec76c..6d22f2b 100644
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include <gtest/gtest.h>
index 0fc2eff..27e2760 100644
@@ -1,17 +1,5 @@
-//
-// Copyright 2016-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #include <gtest/gtest.h>
diff --git a/inference-engine/tests/unit/engines/vpu/adjust_data_location_tests.cpp b/inference-engine/tests/unit/engines/vpu/adjust_data_location_tests.cpp
new file mode 100644
index 0000000..3a29937
--- /dev/null
@@ -0,0 +1,92 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/allocator.hpp>
+#include <vpu/hw/mx_stage.hpp>
+#include <vpu/hw/utility.hpp>
+#include <vpu/utils/numeric.hpp>
+
+#include "graph_transformer_tests.hpp"
+
+using VPU_AdjustDataLocationTest = VPU_GraphTransformerTest;
+
+//                                            -> [Data 2] -> (4/SW) -> [Output 1]
+//                               -> (2/Split)
+//                                            -> [Data 3] -> (5/SW) -> [Output 2]
+// [Input] -> (1/HW) -> [Data 1]
+//                                            -> [Data 4] -> (6/SW) -> [Output 3]
+//                               -> (3/Split)
+//                                            -> [Data 5] -> (7/SW) -> [Output 4]
+//
+// In order to allocate SHAVEs for SW Stages we need to move [Data 1] to DDR and redirect its consumers.
+//
+
+TEST_F(VPU_AdjustDataLocationTest, FlushCMX_TwoSpecialConsumers) {
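+    // A single SHAVE and a single CMX slice leave no room to serve [Data 1]'s
+    // SW consumers from CMX, so adjustDataLocation must flush it to DDR
+    // through an inserted Copy stage (asserted below).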
+    config.numSHAVEs = 1;
+    config.numCMXSlices = 1;
+    InitCompileEnv();
+
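+    // FP16 descriptors sized against the slice: for dataDesc1,
+    // W * C * sizeof(fp16) == CMX_SLICE_SIZE; dataDesc2 comes to half of that.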
+    vpu::DataDesc dataDesc1(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {vpu::CMX_SLICE_SIZE / (2 * 2), 1, 2, 1});
+    vpu::DataDesc dataDesc2(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {vpu::CMX_SLICE_SIZE / (2 * 2), 1, 1, 1});
+
+    auto model = CreateModel();
+
+    auto input = model->addInputData("Input", dataDesc1);
+    model->attrs().set<int>("numInputs", 1);
+
+    auto output1 = model->addOutputData("Output 1", dataDesc2);
+    auto output2 = model->addOutputData("Output 2", dataDesc2);
+    auto output3 = model->addOutputData("Output 3", dataDesc2);
+    auto output4 = model->addOutputData("Output 4", dataDesc2);
+    model->attrs().set<int>("numOutputs", 4);
+
+    auto data1 = model->addNewData("Data 1", dataDesc1);
+    auto data2 = model->addNewData("Data 2", dataDesc2);
+    auto data3 = model->addNewData("Data 3", dataDesc2);
+    auto data4 = model->addNewData("Data 4", dataDesc2);
+    auto data5 = model->addNewData("Data 5", dataDesc2);
+
+    auto fake = model->addFakeData();
+
+    auto hwStage = model->addNewStage<vpu::MyriadXHwStage>(
+        "1/HW",
+        vpu::StageType::MyriadXHwOp,
+        nullptr,
+        {input, fake, fake, fake},
+        {data1});
+    hwStage->attrs().set<vpu::HwOpType>("hwOpType", vpu::HwOpType::POOL);
+
+    stageBuilder->addSplitStage(model, "2/Split", nullptr, vpu::Dim::C, data1, {data2, data3});
+    stageBuilder->addSplitStage(model, "3/Split", nullptr, vpu::Dim::C, data1, {data4, data5});
+
+    stageBuilder->addSoftMaxStage(model, "4/SW", nullptr, data2, output1, vpu::Dim::W);
+    stageBuilder->addSoftMaxStage(model, "5/SW", nullptr, data3, output2, vpu::Dim::W);
+    stageBuilder->addSoftMaxStage(model, "6/SW", nullptr, data4, output3, vpu::Dim::W);
+    stageBuilder->addSoftMaxStage(model, "7/SW", nullptr, data5, output4, vpu::Dim::W);
+
+    vpu::PassSet pipeline;
+    pipeline.addPass(passManager->dumpModel("initial"));
+    pipeline.addPass(passManager->adjustDataLayout());
+    pipeline.addPass(passManager->dumpModel("adjustDataLayout"));
+    pipeline.addPass(passManager->processSpecialStages());
+    pipeline.addPass(passManager->dumpModel("processSpecialStages"));
+    pipeline.addPass(passManager->adjustDataLocation());
+    pipeline.addPass(passManager->dumpModel("adjustDataLocation"));
+    pipeline.addPass(passManager->finalCheck());
+
+    pipeline.run(model);
+
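+    // Expected layout after the passes: [Data 1] stays in CMX with the
+    // inserted Copy stage as its only consumer; the copy's BSS (DDR) output
+    // becomes the parent of all four Split children.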
+    ASSERT_EQ(data1->location(), vpu::DataLocation::CMX);
+    ASSERT_EQ(data1->numConsumers(), 1);
+
+    auto data1Consumer = data1->singleConsumer();
+    auto data1ConsumerOutput = data1Consumer->output(0);
+    ASSERT_EQ(data1Consumer->type(), vpu::StageType::Copy);
+    ASSERT_EQ(data1ConsumerOutput->location(), vpu::DataLocation::BSS);
+    ASSERT_EQ(data1ConsumerOutput->numChildDatas(), 4);
+    ASSERT_TRUE(contains(data1ConsumerOutput->childDataEdges(), [data2](const vpu::SharedAllocation& e) { return e->child() == data2; }));
+    ASSERT_TRUE(contains(data1ConsumerOutput->childDataEdges(), [data3](const vpu::SharedAllocation& e) { return e->child() == data3; }));
+    ASSERT_TRUE(contains(data1ConsumerOutput->childDataEdges(), [data4](const vpu::SharedAllocation& e) { return e->child() == data4; }));
+    ASSERT_TRUE(contains(data1ConsumerOutput->childDataEdges(), [data5](const vpu::SharedAllocation& e) { return e->child() == data5; }));
+}
diff --git a/inference-engine/tests/unit/engines/vpu/containers_tests.cpp b/inference-engine/tests/unit/engines/vpu/containers_tests.cpp
new file mode 100644
index 0000000..eda07b8
--- /dev/null
@@ -0,0 +1,231 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <algorithm>
+#include <chrono>
+#include <iostream>
+#include <unordered_map>
+#include <memory>
+
+#include <gtest/gtest.h>
+
+#include <vpu/utils/containers.hpp>
+#include <vpu/utils/range.hpp>
+#include <vpu/utils/handle.hpp>
+
+using namespace testing;
+
+namespace {
+
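+// Element type for the list tests: one object can sit on two independent
+// IntrusivePtrList chains at once through its node1 and node2 members.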
+struct TestStruct final : public vpu::EnableHandleFromThis<TestStruct> {
+    int val = 0;
+    vpu::IntrusivePtrListNode<TestStruct> node1;
+    vpu::IntrusivePtrListNode<TestStruct> node2;
+    explicit TestStruct(int val) : val(val), node1(this), node2(this) {}
+};
+
+}
+
+TEST(VPU_Containers, SmallVector_API) {
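+    // Mirror the same operations on std::vector and vpu::SmallVector<int, 5>;
+    // the trailing push_backs grow the SmallVector past its inline capacity.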
+    std::vector<int> vec1;
+    vpu::SmallVector<int, 5> vec2;
+
+    for (size_t i = 0; i < 5; ++i) {
+        vec1.push_back(i);
+        vec2.push_back(i);
+    }
+
+    for (size_t i = 0; i < vec1.size(); ++i) {
+        ASSERT_EQ(vec1.at(i), vec2.at(i));
+    }
+
+    vec1.clear();
+    vec2.clear();
+
+    for (size_t i = 0; i < 5; ++i) {
+        vec1.push_back(i);
+    }
+    vec2.insert(vec2.end(), vec1.begin(), vec1.end());
+
+    auto it1 = std::find(vec1.begin(), vec1.end(), 2);
+    auto it2 = std::find(vec2.begin(), vec2.end(), 2);
+
+    ASSERT_NE(it1, vec1.end());
+    ASSERT_NE(it2, vec2.end());
+
+    vec1.erase(it1);
+    vec2.erase(it2);
+
+    for (size_t i = 0; i < vec1.size(); ++i) {
+        ASSERT_EQ(vec1.at(i), vec2.at(i));
+    }
+
+    vec1.push_back(15);
+    vec1.push_back(16);
+
+    vec2.push_back(15);
+    vec2.push_back(16);
+
+    for (size_t i = 0; i < vec1.size(); ++i) {
+        ASSERT_EQ(vec1.at(i), vec2.at(i));
+    }
+}
+
+TEST(VPU_Containers, SmallVector_Equal) {
+    vpu::SmallVector<int, 5> vec1;
+    vpu::SmallVector<int, 5> vec2;
+    vpu::SmallVector<int, 5> vec3;
+
+    for (size_t i = 0; i < 5; ++i) {
+        vec1.push_back(i);
+        vec2.push_back(i);
+        vec3.push_back(i + 1);
+    }
+
+    ASSERT_EQ(vec1, vec2);
+    ASSERT_NE(vec1, vec3);
+}
+
+TEST(VPU_Containers, SmallVector_Swap) {
+    vpu::SmallVector<int, 5> vec1;
+    vpu::SmallVector<int, 5> vec2;
+
+    for (size_t i = 0; i < 5; ++i) {
+        vec1.push_back(i);
+        vec2.push_back(5 - i);
+    }
+
+    vec1.swap(vec2);
+
+    for (size_t i = 0; i < 5; ++i) {
+        ASSERT_EQ(vec1[i], 5 - i);
+        ASSERT_EQ(vec2[i], i);
+    }
+}
+
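+// Helper: fills any push_back-compatible container with 0..contSize-1.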
+template <class Cont>
+Cont buildTestVector(int contSize) {
+    Cont vec;
+
+    for (int i = 0; i < contSize; ++i) {
+        vec.push_back(i);
+    }
+
+    return vec;
+}
+
+TEST(VPU_Containers, IntrusivePtrList) {
+    const int count = 5;
+    int gold = 0;
+
+    std::vector<std::shared_ptr<TestStruct>> base;
+    for (int i = 0; i < count; ++i) {
+        base.push_back(std::make_shared<TestStruct>(i));
+    }
+
+    vpu::IntrusivePtrList<TestStruct> list1(&TestStruct::node1);
+    vpu::IntrusivePtrList<TestStruct> list2(&TestStruct::node2);
+
+    for (int i = 0; i < count; ++i) {
+        list1.push_back(base[i]);
+    }
+
+    ASSERT_FALSE(list1.empty());
+    ASSERT_TRUE(list2.empty());
+
+    gold = 0;
+    for (const auto& ptr1 : list1) {
+        ASSERT_NE(ptr1, nullptr);
+        ASSERT_EQ(ptr1->val, gold);
+        ASSERT_EQ(ptr1.get(), base[ptr1->val].get());
+        ++gold;
+    }
+    ASSERT_EQ(gold, count);
+
+    for (int i = 0; i < count / 2; ++i) {
+        list2.push_back(base[i]);
+    }
+
+    ASSERT_FALSE(list2.empty());
+
+    gold = 0;
+    for (const auto& ptr2 : list2) {
+        ASSERT_NE(ptr2, nullptr);
+        ASSERT_EQ(ptr2->val, gold);
+        ASSERT_EQ(ptr2.get(), base[ptr2->val].get());
+
+        list1.erase(ptr2);
+
+        ++gold;
+    }
+    ASSERT_EQ(gold, count / 2);
+
+    gold = count / 2;
+    for (const auto& ptr1 : list1) {
+        ASSERT_NE(ptr1, nullptr);
+        ASSERT_EQ(ptr1->val, gold);
+        ASSERT_EQ(ptr1.get(), base[ptr1->val].get());
+        ++gold;
+    }
+    ASSERT_EQ(gold, count);
+}
+
+TEST(VPU_Containers, IntrusivePtrList_MoveFromOneListToAnother) {
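+    // Both lists are bound to the same node field (node1); each element is
+    // erased from list1 before being appended to list2.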
+    const int count = 5;
+
+    std::list<std::shared_ptr<TestStruct>> base;
+
+    vpu::IntrusivePtrList<TestStruct> list1(&TestStruct::node1);
+    vpu::IntrusivePtrList<TestStruct> list2(&TestStruct::node1);
+
+    for (int i = 0; i < count; ++i) {
+        auto ptr = std::make_shared<TestStruct>(i);
+        base.push_back(ptr);
+        list1.push_back(ptr);
+    }
+
+    ASSERT_EQ(list1.size(), base.size());
+    ASSERT_TRUE(list2.empty());
+
+    for (const auto& item : list1) {
+        list1.erase(item);
+        list2.push_back(item);
+    }
+
+    ASSERT_TRUE(list1.empty());
+    ASSERT_EQ(list2.size(), base.size());
+}
+
+TEST(VPU_Containers, IntrusivePtrList_ReleaseOrigObject) {
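+    // The list does not own its elements: releasing the last shared_ptr to an
+    // element (base.pop_front/pop_back) shrinks the list as well.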
+    const int count = 5;
+    int gold = 0;
+
+    std::list<std::shared_ptr<TestStruct>> base;
+
+    vpu::IntrusivePtrList<TestStruct> list(&TestStruct::node1);
+
+    for (int i = 0; i < count; ++i) {
+        auto ptr = std::make_shared<TestStruct>(i);
+        base.push_back(ptr);
+        list.push_back(ptr);
+    }
+
+    ASSERT_EQ(list.size(), base.size());
+
+    base.pop_front();
+    ASSERT_EQ(list.size(), base.size());
+
+    base.pop_back();
+    ASSERT_EQ(list.size(), base.size());
+
+    list.clear();
+    ASSERT_TRUE(list.empty());
+
+    gold = 0;
+    for (const auto& item : base) {
+        ASSERT_EQ(item->val, gold + 1);
+        ++gold;
+    }
+    ASSERT_EQ(gold, count - 2);
+}
diff --git a/inference-engine/tests/unit/engines/vpu/eliminate_copy_tests.cpp b/inference-engine/tests/unit/engines/vpu/eliminate_copy_tests.cpp
new file mode 100644 (file)
index 0000000..cb4c3f6
--- /dev/null
@@ -0,0 +1,92 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/hw/mx_stage.hpp>
+#include <vpu/hw/utility.hpp>
+
+#include "graph_transformer_tests.hpp"
+
+using VPU_EliminateCopyTest = VPU_GraphTransformerTest;
+
+TEST_F(VPU_EliminateCopyTest, OneInputTwoConcats) {
+    InitCompileEnv();
+
+    auto model = CreateModel();
+
+    auto input = model->addInputData(
+        "Input",
+        vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {16, 16, 3, 1}));
+    model->attrs().set<int>("numInputs", 1);
+
+    auto output1 = model->addOutputData(
+        "Output1",
+        vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {16, 16, 4, 1}));
+    auto output2 = model->addOutputData(
+        "Output2",
+        vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {16, 16, 4, 1}));
+    model->attrs().set<int>("numOutputs", 2);
+
+    auto outputCopy1 = model->duplicateData(output1, "copy");
+    auto outputCopy2 = model->duplicateData(output2, "copy");
+    stageBuilder->addCopyStage(model, outputCopy1->name(), nullptr, outputCopy1, output1);
+    stageBuilder->addCopyStage(model, outputCopy2->name(), nullptr, outputCopy2, output2);
+
+    auto data1 = model->addNewData(
+        "data1",
+        vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {16, 16, 3, 1}));
+
+    auto fake = model->addFakeData();
+
+    auto hwStage = model->addNewStage<vpu::MyriadXHwStage>(
+        "HW",
+        vpu::StageType::MyriadXHwOp,
+        nullptr,
+        {input, fake, fake, fake},
+        {data1});
+    hwStage->attrs().set<vpu::HwOpType>("hwOpType", vpu::HwOpType::POOL);
+
+    auto data2 = model->addNewData(
+        "data2",
+        vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {16, 16, 1, 1}));
+    auto data3 = model->addNewData(
+        "data3",
+        vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {16, 16, 1, 1}));
+
+    stageBuilder->addConcatStage(
+        model,
+        "Concat1",
+        nullptr,
+        vpu::Dim::C,
+        {data1, data2},
+        outputCopy1);
+    stageBuilder->addConcatStage(
+        model,
+        "Concat2",
+        nullptr,
+        vpu::Dim::C,
+        {data1, data3},
+        outputCopy2);
+
+    vpu::PassSet pipeline;
+    pipeline.addPass(passManager->dumpModel("initial"));
+    pipeline.addPass(passManager->adjustDataLayout());
+    pipeline.addPass(passManager->dumpModel("adjustDataLayout"));
+    pipeline.addPass(passManager->processSpecialStages());
+    pipeline.addPass(passManager->dumpModel("processSpecialStages"));
+    pipeline.addPass(passManager->adjustDataLocation());
+    pipeline.addPass(passManager->dumpModel("adjustDataLocation"));
+    pipeline.addPass(passManager->eliminateCopyStages());
+    pipeline.addPass(passManager->dumpModel("eliminateCopyStages"));
+    pipeline.addPass(passManager->finalCheck());
+
+    pipeline.run(model);
+
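+    // After elimination the HW stage should write directly into outputCopy1
+    // (that copy stage is gone), while the second branch still goes through
+    // a Copy stage - hence one Concat and one Copy consumer below.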
+    const auto& hwOutput = hwStage->output(0);
+    ASSERT_NE(hwOutput->parentDataEdge(), nullptr);
+    ASSERT_EQ(hwOutput->parentData(), outputCopy1);
+
+    ASSERT_EQ(hwOutput->numConsumers(), 2);
+    ASSERT_TRUE(contains(hwOutput->consumers(), [](const vpu::Stage& stage) { return stage->type() == vpu::StageType::Concat; }));
+    ASSERT_TRUE(contains(hwOutput->consumers(), [](const vpu::Stage& stage) { return stage->type() == vpu::StageType::Copy; }));
+}
diff --git a/inference-engine/tests/unit/engines/vpu/find_subgraphs_tests.cpp b/inference-engine/tests/unit/engines/vpu/find_subgraphs_tests.cpp
new file mode 100644 (file)
index 0000000..95a11d2
--- /dev/null
@@ -0,0 +1,149 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/hw/mx_stage.hpp>
+#include <vpu/hw/utility.hpp>
+#include <vpu/utils/containers.hpp>
+#include <vpu/utils/range.hpp>
+
+#include "graph_transformer_tests.hpp"
+
+using namespace InferenceEngine;
+
+class VPU_FindSubGraphsTest : public VPU_GraphTransformerTest {
+protected:
+    vpu::PassSet pipeline;
+    vpu::Model::Ptr model;
+
+public:
+    void initSimpleModel() {
+        InitCompileEnv();
+
+        model = CreateModel();
+
+        auto input = model->addInputData(
+                "Input",
+                vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {16, 16, 3, 1}));
+        model->attrs().set<int>("numInputs", 1);
+
+        auto output = model->addOutputData(
+                "Output1",
+                vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {16, 16, 3, 1}));
+        model->attrs().set<int>("numOutputs", 1);
+
+        auto data1 = model->addNewData(
+                "data1",
+                vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {16, 16, 3, 1}));
+
+        auto data2 = model->addNewData(
+                "data2",
+                vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {16, 16, 3, 1}));
+
+        auto data3 = model->addNewData(
+                "data3",
+                vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {16, 16, 3, 1}));
+
+        auto data4 = model->addNewData(
+                "data4",
+                vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {16, 16, 3, 1}));
+
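+        // Chain five Power stages end-to-end so the pass can split the graph
+        // into sub-graphs of a configurable size.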
+        stageBuilder->addPowerStage(model, input->name(), nullptr, 0.0, 1.0, 0.0, input, data1);
+        stageBuilder->addPowerStage(model, data1->name(), nullptr, 0.0, 1.0, 0.0, data1, data2);
+        stageBuilder->addPowerStage(model, data2->name(), nullptr, 0.0, 1.0, 0.0, data2, data3);
+        stageBuilder->addPowerStage(model, data3->name(), nullptr, 0.0, 1.0, 0.0, data3, data4);
+        stageBuilder->addPowerStage(model, data4->name(), nullptr, 0.0, 1.0, 0.0, data4, output);
+
+        pipeline.addPass(passManager->dumpModel("initial"));
+
+        pipeline.addPass(passManager->findSubGraphs());
+        pipeline.addPass(passManager->dumpModel("findSubGraphs"));
+    }
+};
+
+TEST_F(VPU_FindSubGraphsTest, canCallfindSubGraphsPass) {
+    config.numberOfNodesInOneSubGraph = 1;
+    initSimpleModel();
+
+    ASSERT_NO_THROW(pipeline.run(model));
+}
+
+TEST_F(VPU_FindSubGraphsTest, canMergeAllStagesInOneSubGraph) {
+    config.numberOfNodesInOneSubGraph = 5;
+    initSimpleModel();
+
+    int maxSubGraphs = 1;
+
+    ASSERT_NO_THROW(pipeline.run(model));
+
+    auto curMaxSubGraphs = model->numberOfSubGraphs();
+    ASSERT_EQ(curMaxSubGraphs, maxSubGraphs);
+}
+
+TEST_F(VPU_FindSubGraphsTest, canSplitGraphToTwoSubGraphs) {
+    config.numberOfNodesInOneSubGraph = 3;
+    initSimpleModel();
+
+    ASSERT_NO_THROW(pipeline.run(model));
+
+    auto curMaxSubGraphs = model->numberOfSubGraphs();
+    ASSERT_EQ(curMaxSubGraphs, 2);
+
+    for (int i = 0; i < curMaxSubGraphs; i++) {
+        auto subGraph = vpu::toVector(model->getSubGraphStages(i));
+        ASSERT_TRUE(subGraph.size() <= 3);
+
+        for (const auto& stage : subGraph) {
+            auto curSubGraph = stage->subGraphNumber();
+            ASSERT_EQ(curSubGraph, i);
+        }
+    }
+}
+
+TEST_F(VPU_FindSubGraphsTest, canGetNextStagesWithCondition) {
+    config.numberOfNodesInOneSubGraph = 3;
+    initSimpleModel();
+
+    ASSERT_NO_THROW(pipeline.run(model));
+
+    auto curMaxSubGraphs = model->numberOfSubGraphs();
+    ASSERT_EQ(curMaxSubGraphs, 2);
+
+    auto subGraph0 = vpu::toVector(model->getSubGraphStages(0));
+    auto stage0 = subGraph0[0];
+
+    auto alwaysTrue = [](const vpu::Stage a) noexcept {
+        return true;
+    };
+    auto res = vpu::toVector(stage0->nextStages(alwaysTrue));
+    auto ref = vpu::toVector(stage0->nextStages());
+
+    ASSERT_EQ(res.size(), ref.size());
+    for (size_t i = 0; i < res.size(); i++) {
+        ASSERT_EQ(res[i]->name(), ref[i]->name());
+    }
+}
+
+TEST_F(VPU_FindSubGraphsTest, canGetPrevStagesWithCondition) {
+    config.numberOfNodesInOneSubGraph = 3;
+    initSimpleModel();
+
+    ASSERT_NO_THROW(pipeline.run(model));
+
+    auto curMaxSubGraphs = model->numberOfSubGraphs();
+    ASSERT_EQ(curMaxSubGraphs, 2);
+
+    auto subGraph1 = vpu::toVector(model->getSubGraphStages(1));
+    auto stage1 = subGraph1[0];
+
+    auto alwaysTrue = [](const vpu::Stage a)noexcept {
+        return true;
+    };
+    auto res = vpu::toVector(stage1->prevStages(alwaysTrue));
+    auto ref = vpu::toVector(stage1->prevStages());
+
+    ASSERT_EQ(res.size(), ref.size());
+    for (size_t i = 0; i < res.size(); i++) {
+        ASSERT_EQ(res[i]->name(), ref[i]->name());
+    }
+}
diff --git a/inference-engine/tests/unit/engines/vpu/graph_transformer_tests.cpp b/inference-engine/tests/unit/engines/vpu/graph_transformer_tests.cpp
new file mode 100644 (file)
index 0000000..621295f
--- /dev/null
@@ -0,0 +1,61 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "graph_transformer_tests.hpp"
+
+#include <atomic>
+
+#include <vpu/utils/io.hpp>
+
+void VPU_GraphTransformerTest::SetUp() {
+    ASSERT_NO_FATAL_FAILURE(TestsCommon::SetUp());
+
+    _log = std::make_shared<vpu::Logger>(
+        "Test",
+        vpu::LogLevel::Debug,
+        vpu::consoleOutput());
+
+    stageBuilder = std::make_shared<vpu::StageBuilder>();
+    frontEnd = std::make_shared<vpu::FrontEnd>(stageBuilder);
+    backEnd = std::make_shared<vpu::BackEnd>();
+    passManager = std::make_shared<vpu::PassManager>(stageBuilder, backEnd);
+}
+
+void VPU_GraphTransformerTest::TearDown() {
+    for (const auto& model : _models) {
+        backEnd->dumpModel(model);
+    }
+
+    vpu::CompileEnv::free();
+
+    TestsCommon::TearDown();
+}
+
+void VPU_GraphTransformerTest::InitCompileEnv() {
+    vpu::CompileEnv::init(platform, config, _log);
+}
+
+namespace {
+
+std::atomic<int> g_counter(0);
+
+}
+
+vpu::Model::Ptr VPU_GraphTransformerTest::CreateModel() {
+    const auto& env = vpu::CompileEnv::get();
+
+    auto unitTest = testing::UnitTest::GetInstance();
+    IE_ASSERT(unitTest != nullptr);
+    auto curTestInfo = unitTest->current_test_info();
+    IE_ASSERT(curTestInfo != nullptr);
+
+    auto model = std::make_shared<vpu::Model>(
+        vpu::formatString("%s/%s", curTestInfo->test_case_name(), curTestInfo->name()));
+    model->attrs().set<int>("index", g_counter.fetch_add(1));
+    model->attrs().set<vpu::Resources>("resources", env.resources);
+
+    _models.push_back(model);
+
+    return model;
+}
diff --git a/inference-engine/tests/unit/engines/vpu/graph_transformer_tests.hpp b/inference-engine/tests/unit/engines/vpu/graph_transformer_tests.hpp
new file mode 100644 (file)
index 0000000..9cbef83
--- /dev/null
@@ -0,0 +1,50 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <list>
+
+#include <gtest/gtest.h>
+#include <tests_common.hpp>
+
+#include <vpu/compile_env.hpp>
+#include <vpu/model/stage.hpp>
+#include <vpu/model/model.hpp>
+#include <vpu/frontend/frontend.hpp>
+#include <vpu/frontend/stage_builder.hpp>
+#include <vpu/pass_manager.hpp>
+#include <vpu/backend/backend.hpp>
+
+class VPU_GraphTransformerTest : public TestsCommon {
+public:
+    vpu::Platform platform = vpu::Platform::MYRIAD_X;
+    vpu::CompilationConfig config;
+
+    vpu::StageBuilder::Ptr stageBuilder;
+    vpu::FrontEnd::Ptr frontEnd;
+    vpu::PassManager::Ptr passManager;
+    vpu::BackEnd::Ptr backEnd;
+
+    void SetUp() override;
+    void TearDown() override;
+
+    void InitCompileEnv();
+
+    vpu::Model::Ptr CreateModel();
+
+private:
+    vpu::Logger::Ptr _log;
+    std::list<vpu::Model::Ptr> _models;
+};
+
+template <class Cont, class Cond>
+bool contains(const Cont& cont, const Cond& cond) {
+    for (const auto& val : cont) {
+        if (cond(val)) {
+            return true;
+        }
+    }
+    return false;
+}
diff --git a/inference-engine/tests/unit/engines/vpu/graph_transformer_tests_constructs.cpp b/inference-engine/tests/unit/engines/vpu/graph_transformer_tests_constructs.cpp
new file mode 100644 (file)
index 0000000..38c5180
--- /dev/null
@@ -0,0 +1,42 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "graph_transformer_tests.hpp"
+
+#include <atomic>
+
+#include <vpu/utils/io.hpp>
+
+
+TEST_F(VPU_GraphTransformerTest, CantConnectInputOutputDatas) {
+    InitCompileEnv();
+
+    auto model = CreateModel();
+
+    auto input = model->addInputData(
+            "Input",
+            vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {16, 16, 3, 1}));
+    model->attrs().set<int>("numInputs", 1);
+
+    auto output = model->addOutputData(
+            "Output",
+            vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {16, 16, 4, 1}));
+    model->attrs().set<int>("numOutputs", 1);
+
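+    // sanity checks that the assertion macros catch throws (scaffolding
+    // around the actual connectDatas check below)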
+    ASSERT_ANY_THROW(throw 1);
+    ASSERT_ANY_THROW(VPU_THROW_UNLESS(0 == 1));
+
+    ASSERT_ANY_THROW(
+    model->connectDatas()
+        .parent(input)
+        .child(output)
+        .mode(vpu::SharedDataMode::ROI)
+        .order(vpu::SharedDataOrder::ChildWritesToParent)
+        .offset(vpu::DimValues())
+        .done()
+    ) << "Can not short connect arbitrary input/output datas";
+
+    ASSERT_ANY_THROW(throw 1);
+}
+
diff --git a/inference-engine/tests/unit/engines/vpu/mvnc/watchdog_tests.cpp b/inference-engine/tests/unit/engines/vpu/mvnc/watchdog_tests.cpp
new file mode 100644 (file)
index 0000000..45008ac
--- /dev/null
@@ -0,0 +1,400 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <gtest/gtest.h>
+#include <tests_common.hpp>
+#define __PC__
+#include <watchdog/watchdog.h>
+#include <watchdog/watchdogPrivate.hpp>
+#include <mvnc/include/ncPrivateTypes.h>
+#include <thread>
+
+using namespace ::testing;
+using namespace InferenceEngine;
+
+class MockWatchdogDevice : public Watchdog::IDevice {
+ public:
+    using time_point = Watchdog::IDevice::time_point;
+    MOCK_QUALIFIED_METHOD1(setInterval, noexcept, void(const std::chrono::milliseconds));
+    MOCK_QUALIFIED_METHOD1(keepAlive, noexcept, void(const time_point &));
+    MOCK_QUALIFIED_METHOD1(dueIn, const noexcept, std::chrono::milliseconds (const time_point &current_time));
+    MOCK_QUALIFIED_METHOD0(isTimeout, const noexcept, bool ());
+    MOCK_QUALIFIED_METHOD0(getHandle, const noexcept, void* ());
+};
+
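+// Mirrors the watchdog's private opaque-context layout so a mock device can
+// be injected behind the wd_context handle; the magic value presumably
+// matches the marker the implementation checks.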
+struct wd_context_opaque_private {
+    void * magic = reinterpret_cast<void *> (0xdeadbeaf);
+    Watchdog::IDevice * actual = nullptr;
+    bool   destroyed = false;
+};
+
+
+class MVNCWatchdogTests: public TestsCommon {
+ protected:
+    devicePrivate_t d;
+    wd_context ctx, ctx1;
+    StrictMock<MockWatchdogDevice> mockWatchee, mockWatchee1;
+    wd_context_opaque_private opaque, opaque1;
+
+    void SetUp() override {
+        opaque.actual = &mockWatchee;
+        ctx.opaque = &opaque;
+
+        opaque1.actual = &mockWatchee1;
+        ctx1.opaque = &opaque1;
+
+        pthread_mutex_init(&d.dev_stream_m, nullptr);
+    }
+    void TearDown() override {
+        pthread_mutex_destroy(&d.dev_stream_m);
+    }
+};
+using ms = std::chrono::milliseconds;
+
+TEST_F(MVNCWatchdogTests, canRegisterExternalWatchee) {
+
+    int handle = 1;
+    EXPECT_CALL(mockWatchee, getHandle()).WillRepeatedly(Return(&handle));
+    // do not expect any ping to happen before we remove the thread;
+    // this could be changed, e.g. registration succeeds only if the first ping succeeds
+    EXPECT_CALL(mockWatchee, keepAlive(_)).Times(AtLeast(0));
+    EXPECT_CALL(mockWatchee, setInterval(ms(1))).Times(1);
+    EXPECT_CALL(mockWatchee, isTimeout()).WillRepeatedly(Return(false));
+    EXPECT_CALL(mockWatchee, dueIn(_)).WillRepeatedly(Return(ms(20000)));
+
+    d.wd_interval = 1;
+
+    ASSERT_EQ(WD_ERRNO, watchdog_register_device(&ctx, &d));
+    // allowing thread spin
+    std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+    ASSERT_EQ(WD_ERRNO, watchdog_unregister_device(&ctx));
+}
+
+// TODO: implement logic
+TEST_F(MVNCWatchdogTests, DISABLED_removeDeviceIfXLINKSessionNotIninitialized) {
+
+    d.wd_interval = 10;
+    ASSERT_EQ(WD_ERRNO, watchdog_init_context(&ctx));
+    ASSERT_NE(WD_ERRNO, watchdog_register_device(&ctx, &d));
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+}
+
+
+TEST_F(MVNCWatchdogTests, canNotBeRegisteredTwice) {
+
+    d.wd_interval = 10;
+
+    ASSERT_EQ(WD_ERRNO, watchdog_init_context(&ctx));
+    ASSERT_EQ(WD_ERRNO, watchdog_register_device(&ctx, &d));
+    ASSERT_NE(WD_ERRNO, watchdog_register_device(&ctx, &d));
+    // allowing thread spin
+    std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+
+    ASSERT_EQ(WD_ERRNO, watchdog_unregister_device(&ctx));
+}
+
+TEST_F(MVNCWatchdogTests, canUnRegisterNotInitialized) {
+
+    ASSERT_EQ(WD_ERRNO, watchdog_init_context(&ctx));
+    ASSERT_EQ(WD_ERRNO, watchdog_unregister_device(&ctx));
+}
+
+TEST_F(MVNCWatchdogTests, canUnRegisterIfInterval0) {
+
+    d.wd_interval = 0;
+
+    ASSERT_EQ(WD_ERRNO, watchdog_init_context(&ctx));
+    ASSERT_NE(WD_ERRNO, watchdog_register_device(&ctx, &d));
+    ASSERT_EQ(WD_ERRNO, watchdog_unregister_device(&ctx));
+}
+
+TEST_F(MVNCWatchdogTests, failUnRegisterTwice) {
+
+    d.wd_interval = 10;
+
+    ASSERT_EQ(WD_ERRNO, watchdog_init_context(&ctx));
+    ASSERT_EQ(WD_ERRNO, watchdog_register_device(&ctx, &d));
+    // allowing thread spin
+    std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+    ASSERT_EQ(WD_ERRNO, watchdog_unregister_device(&ctx));
+    ASSERT_NE(WD_ERRNO, watchdog_unregister_device(&ctx));
+}
+
+TEST_F(MVNCWatchdogTests, canRemoveOneDeviceFromQueueInCaseOfTimeout) {
+    int handle = 1;
+    int x = 0;
+    int y = 0;
+    int z = 0;
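+    // x and y drive the fake dueIn() delays; z makes isTimeout() fire only
+    // on the second poll, so the second device can register before the
+    // first one is evicted.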
+    EXPECT_CALL(mockWatchee, getHandle()).WillRepeatedly(Return(&handle));
+    EXPECT_CALL(mockWatchee, keepAlive(_)).Times(AtLeast(1));
+    EXPECT_CALL(mockWatchee, setInterval(ms(10))).Times(1);
+    EXPECT_CALL(mockWatchee, isTimeout()).WillRepeatedly(Invoke([&z, &y]() {
+        // will sleep at least 100 ms and avoid a second keepAlive call
+        y = 100;
+        if (!z) {
+            // sleep in the watchdog thread, allowing a second device to register before the first one is deleted
+            z = 1;
+            return false;
+        }
+        return true;
+    }));
+    EXPECT_CALL(mockWatchee, dueIn(_)).WillRepeatedly(Invoke([&y](const MockWatchdogDevice::time_point &current_time){
+        return std::chrono::milliseconds(y);
+    }));
+
+    EXPECT_CALL(mockWatchee1, getHandle()).WillRepeatedly(Return(&handle));
+    EXPECT_CALL(mockWatchee1, keepAlive(_)).Times(AtLeast(2));
+    EXPECT_CALL(mockWatchee1, setInterval(ms(10))).Times(1);
+    EXPECT_CALL(mockWatchee1, isTimeout()).WillRepeatedly(Invoke([&x]() {
+        // make every second poll wait
+        x = x == 0 ? 100 : 0;
+        return false;
+    }));
+    EXPECT_CALL(mockWatchee1, dueIn(_)).WillRepeatedly(Invoke([&x](const MockWatchdogDevice::time_point &current_time){
+        return std::chrono::milliseconds(x);
+    }));
+
+
+    d.wd_interval = 10;
+
+    ASSERT_EQ(WD_ERRNO, watchdog_register_device(&ctx, &d));
+    ASSERT_EQ(WD_ERRNO, watchdog_register_device(&ctx1, &d));
+
+    std::this_thread::sleep_for(ms(1000));
+
+    ASSERT_EQ(WD_ERRNO, watchdog_unregister_device(&ctx));
+    ASSERT_EQ(WD_ERRNO, watchdog_unregister_device(&ctx1));
+}
+
+TEST_F(MVNCWatchdogTests, canNotStartWatchdogIfIntervalInvalid) {
+
+    opaque.actual = &mockWatchee;
+
+    int handle = 1;
+
+    EXPECT_CALL(mockWatchee, getHandle()).WillRepeatedly(Return(&handle));
+
+    d.wd_interval = 0;
+    ASSERT_NE(WD_ERRNO, watchdog_register_device(&ctx, &d));
+
+    d.wd_interval = -1;
+    ASSERT_NE(WD_ERRNO, watchdog_register_device(&ctx, &d));
+
+    // if for some reason the thread started, we would get unexpected updatePongInterval calls
+    std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+}
+
+TEST_F(MVNCWatchdogTests, canGetPingsOnRegularBasis) {
+
+    int handle = 1;
+    int x = 0;
+    EXPECT_CALL(mockWatchee, getHandle()).WillRepeatedly(Return(&handle));
+    // since the interval is small, keepAlive can happen several times
+    EXPECT_CALL(mockWatchee, keepAlive(_)).Times(AtLeast(2));
+    EXPECT_CALL(mockWatchee, setInterval(ms(10))).Times(1);
+    EXPECT_CALL(mockWatchee, isTimeout()).WillRepeatedly(Return(false));
+    EXPECT_CALL(mockWatchee, dueIn(_)).WillRepeatedly(Invoke([&x](const MockWatchdogDevice::time_point &current_time){
+        x = x == 0 ? 100 : 0;
+        return std::chrono::milliseconds(x);
+    }));
+
+
+    d.wd_interval = 10;
+
+    ASSERT_EQ(WD_ERRNO, watchdog_register_device(&ctx, &d));
+
+    std::this_thread::sleep_for(ms(1000));
+
+    ASSERT_EQ(WD_ERRNO, watchdog_unregister_device(&ctx));
+}
+
+TEST_F(MVNCWatchdogTests, canWakeUpWatchdogWhenAddAndRemoveDevice) {
+
+    int handle = 1, handle1 = 2;
+
+    EXPECT_CALL(mockWatchee, getHandle()).WillRepeatedly(Return(&handle));
+    EXPECT_CALL(mockWatchee, keepAlive(_)).Times(1);
+    EXPECT_CALL(mockWatchee, setInterval(ms(10))).Times(1);
+    EXPECT_CALL(mockWatchee, isTimeout()).WillRepeatedly(Return(false));
+    // without a wake-up this would sleep forever
+    EXPECT_CALL(mockWatchee, dueIn(_)).WillRepeatedly(Return(ms(20000)));
+
+    EXPECT_CALL(mockWatchee1, getHandle()).WillRepeatedly(Return(&handle1));
+    EXPECT_CALL(mockWatchee1, keepAlive(_)).Times(1);
+    EXPECT_CALL(mockWatchee1, setInterval(ms(10))).Times(1);
+    EXPECT_CALL(mockWatchee1, isTimeout()).WillRepeatedly(Return(false));
+    EXPECT_CALL(mockWatchee1, dueIn(_)).WillRepeatedly(Return(ms(20000)));
+
+
+    d.wd_interval = 10;
+
+    ASSERT_EQ(WD_ERRNO, watchdog_register_device(&ctx, &d));
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+
+    ASSERT_EQ(WD_ERRNO, watchdog_register_device(&ctx1, &d));
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+
+    ASSERT_EQ(WD_ERRNO, watchdog_unregister_device(&ctx));
+    ASSERT_EQ(WD_ERRNO, watchdog_unregister_device(&ctx1));
+}
+
+TEST_F(MVNCWatchdogTests, stressWatchDog) {
+
+    const int num_watchdog_device = 10;
+
+    watchdog_init_context(nullptr);
+
+    StrictMock<MockWatchdogDevice> mockWatchee[num_watchdog_device];
+    int handle[num_watchdog_device];
+    wd_context ctx[num_watchdog_device];
+    wd_context_opaque_private opaque[num_watchdog_device];
+
+    for (int i = 0; i != num_watchdog_device; i++) {
+        handle[i] = i;
+
+        EXPECT_CALL(mockWatchee[i], getHandle()).WillRepeatedly(Return(handle + i));
+        // since the interval is big, keepAlive happens only once
+        EXPECT_CALL(mockWatchee[i], keepAlive(_)).Times(1);
+
+        EXPECT_CALL(mockWatchee[i], setInterval(ms(10))).Times(1);
+        EXPECT_CALL(mockWatchee[i], isTimeout()).WillRepeatedly(Return(false));
+        EXPECT_CALL(mockWatchee[i], dueIn(_)).WillRepeatedly(Return(ms(20000)));
+    }
+
+    d.wd_interval = 10;
+
+    for (int k = 0; k != num_watchdog_device; k++) {
+        opaque[k].actual = &mockWatchee[k];
+        ctx[k].opaque = &opaque[k];
+        ASSERT_EQ(WD_ERRNO, watchdog_register_device(&ctx[k], &d));
+    }
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+
+    for (int k = 0; k != num_watchdog_device; k++) {
+        ASSERT_EQ(WD_ERRNO, watchdog_unregister_device(&ctx[k]));
+    }
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+}
+
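+// Staggered stress: register the first half, then register the second half
+// while unregistering the first, exercising watchdog wake-ups under churn.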
+TEST_F(MVNCWatchdogTests, stressWatchDog1) {
+
+    const int num_watchdog_device = 10;
+    const int num_watchdog_device_half = num_watchdog_device / 2;
+
+    watchdog_init_context(nullptr);
+
+    StrictMock<MockWatchdogDevice> mockWatchee[num_watchdog_device];
+    int handle[num_watchdog_device];
+    wd_context ctx[num_watchdog_device];
+    wd_context_opaque_private opaque[num_watchdog_device];
+
+    for (int i = 0; i != num_watchdog_device; i++) {
+        handle[i] = i;
+
+        EXPECT_CALL(mockWatchee[i], getHandle()).WillRepeatedly(Return(handle + i));
+        // since the interval is big, keepAlive happens only once
+        EXPECT_CALL(mockWatchee[i], keepAlive(_)).Times(1);
+
+        EXPECT_CALL(mockWatchee[i], setInterval(ms(10))).Times(1);
+        EXPECT_CALL(mockWatchee[i], isTimeout()).WillRepeatedly(Return(false));
+        EXPECT_CALL(mockWatchee[i], dueIn(_)).WillRepeatedly(Return(ms(20000)));
+    }
+
+    d.wd_interval = 10;
+    for (int k = 0; k != num_watchdog_device; k++) {
+        opaque[k].actual = &mockWatchee[k];
+        ctx[k].opaque = &opaque[k];
+    }
+
+    for (int k = 0; k != num_watchdog_device_half; k++) {
+        ASSERT_EQ(WD_ERRNO, watchdog_register_device(&ctx[k], &d));
+    }
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+
+    for (int k = 0; k != num_watchdog_device_half; k++) {
+        ASSERT_EQ(WD_ERRNO, watchdog_register_device(&ctx[k + num_watchdog_device_half], &d));
+        std::this_thread::sleep_for(std::chrono::milliseconds(20));
+        ASSERT_EQ(WD_ERRNO, watchdog_unregister_device(&ctx[k]));
+        std::this_thread::sleep_for(std::chrono::milliseconds(20));
+    }
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+
+    for (int k = 0; k != num_watchdog_device_half; k++) {
+        ASSERT_EQ(WD_ERRNO, watchdog_unregister_device(&ctx[k + num_watchdog_device_half]));
+    }
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+}
+
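+// Three-phase stress: steady first third, register/unregister churn with
+// delays for the middle third, and immediate register/unregister for the
+// last third.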
+TEST_F(MVNCWatchdogTests, stressWatchDog2) {
+
+    const int num_watchdog_device = 30;
+    const int num_watchdog_device_half1 = num_watchdog_device / 3;
+    const int num_watchdog_device_half2 = 2 * num_watchdog_device / 3;
+
+    watchdog_init_context(nullptr);
+
+    StrictMock<MockWatchdogDevice> mockWatchee[num_watchdog_device];
+    int handle[num_watchdog_device];
+    wd_context ctx[num_watchdog_device];
+    wd_context_opaque_private opaque[num_watchdog_device];
+
+    for (int i = 0; i != num_watchdog_device; i++) {
+        handle[i] = i;
+
+        EXPECT_CALL(mockWatchee[i], getHandle()).WillRepeatedly(Return(handle + i));
+
+        // since the interval is big, keepAlive happens only once
+        if (i >= num_watchdog_device_half2) {
+            EXPECT_CALL(mockWatchee[i], keepAlive(_)).Times(AtLeast(0));
+        } else {
+            EXPECT_CALL(mockWatchee[i], keepAlive(_)).Times(1);
+        }
+
+        EXPECT_CALL(mockWatchee[i], setInterval(ms(10))).Times(1);
+        EXPECT_CALL(mockWatchee[i], isTimeout()).WillRepeatedly(Return(false));
+        EXPECT_CALL(mockWatchee[i], dueIn(_)).WillRepeatedly(Return(ms(20000)));
+    }
+
+    d.wd_interval = 10;
+    for (int k = 0; k != num_watchdog_device; k++) {
+        opaque[k].actual = &mockWatchee[k];
+        ctx[k].opaque = &opaque[k];
+    }
+
+    for (int k = 0; k != num_watchdog_device_half1; k++) {
+        ASSERT_EQ(WD_ERRNO, watchdog_register_device(&ctx[k], &d));
+    }
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+
+    for (int k = 0; k != num_watchdog_device_half1; k++) {
+        ASSERT_EQ(WD_ERRNO, watchdog_unregister_device(&ctx[k]));
+    }
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+
+    for (int k = num_watchdog_device_half1; k != num_watchdog_device_half2; k++) {
+        ASSERT_EQ(WD_ERRNO, watchdog_register_device(&ctx[k], &d));
+        // this might lead to UB, e.g. the thread might restart right after the device gets removed, so give it more time
+        std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+        ASSERT_EQ(WD_ERRNO, watchdog_unregister_device(&ctx[k]));
+    }
+
+    for (int k = num_watchdog_device_half2; k != num_watchdog_device; k++) {
+        ASSERT_EQ(WD_ERRNO, watchdog_register_device(&ctx[k], &d));
+        // this might lead to UB, e.g. the thread might restart right after the device gets removed;
+        // that is why the keepAlive call-count expectations are left unconstrained for the last third
+        ASSERT_EQ(WD_ERRNO, watchdog_unregister_device(&ctx[k]));
+    }
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(3000));
+}
diff --git a/inference-engine/tests/unit/engines/vpu/range_tests.cpp b/inference-engine/tests/unit/engines/vpu/range_tests.cpp
new file mode 100644 (file)
index 0000000..c1c8f5a
--- /dev/null
@@ -0,0 +1,394 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+#include <set>
+#include <list>
+#include <array>
+
+#include <gtest/gtest.h>
+
+#include <vpu/utils/range.hpp>
+#include <vpu/utils/containers.hpp>
+
+using namespace testing;
+
+//
+// VPU_IterRangeTest
+//
+
+class VPU_IterRangeTest: public ::testing::Test {
+protected:
+    const int count = 10;
+    std::list<int> list;
+
+    void SetUp() override {
+        for (int i = 0; i < count; ++i) {
+            list.push_back(i);
+        }
+    }
+};
+
+TEST_F(VPU_IterRangeTest, PreservesIterationOrder) {
+    auto contRange = vpu::contRange(list);
+
+    int gold = 0;
+    auto innerIt = list.cbegin();
+    for (auto cit = contRange.cbegin(); cit != contRange.end(); cit++) {
+        ASSERT_EQ(*cit, *innerIt++) << "Values given by owner and inner containers differ";
+        gold++;
+    }
+    ASSERT_EQ(gold, count) << "Owner and inner ranges differ in length";
+}
+
+TEST_F(VPU_IterRangeTest, RespectsInnerPushBacksWhileIteration) {
+    auto contRange = vpu::contRange(list);
+
+    int gold = 0;
+    auto innerIt = list.begin();
+
+    for (auto val : contRange) {
+        if (gold < 5) {
+            // duplicate first 5 elements of the head, inserting them after the tail
+            list.push_back(*innerIt);
+        }
+        ASSERT_EQ(val, *innerIt++) << "Values given by owner and inner containers differ";
+        gold++;
+    }
+
+    ASSERT_EQ(gold, count + 5) << "Elements appended during iteration were not visited";
+}
+
+TEST_F(VPU_IterRangeTest, RespectsInnerRemovalsWhileIteration) {
+    auto contRange = vpu::contRange(list);
+
+    int gold = 0;
+    auto innerIt = list.begin();
+    auto innerRevIt = list.end();
+
+    for (auto val : contRange) {
+        if (gold < 5) {
+            // removing elements from the end
+            innerRevIt = list.erase(--innerRevIt);
+        }
+        ASSERT_EQ(val, *innerIt++) << "Values given by owner and inner containers differ";
+        gold++;
+    }
+
+    ASSERT_EQ(gold, count - 5) << "Removals were ignored";
+}
+
+TEST_F(VPU_IterRangeTest, SurvivesInnerInsertionsWhileIteration) {
+    auto contRange = vpu::contRange(list);
+
+    int gold = 0;
+    auto innerIt = list.begin();
+
+    for (auto it = contRange.begin(); it != contRange.end(); ++it) {
+        ASSERT_EQ(*it, *innerIt) << "Values given by owner and inner containers differ";
+
+        if (gold < 10) {
+            // duplicate head elements of inner, inserting them just before the current iterator
+            list.insert(innerIt, *innerIt);
+        }
+        gold++;
+        innerIt++;
+    }
+
+    ASSERT_EQ(gold, count) << "Insertions at the head influenced iteration";
+}
+
+//
+// VPU_MapRangeTest
+//
+
+class VPU_MapRangeTest: public ::testing::Test {
+protected:
+    struct InnerStruct final : public vpu::EnableHandleFromThis<InnerStruct> {
+        int val = 0;
+        vpu::IntrusivePtrListNode<InnerStruct> node;
+        explicit InnerStruct(int val) : val(val), node(this) {}
+    };
+
+    const int count = 10;
+    std::list<int> list;
+    std::vector<int> vec;
+
+    const static std::function<int(int)> incFunc;
+    const static std::function<double(int)> incAndConvertFunc;
+
+    void SetUp() override {
+        for (int i = 0; i < count; ++i) {
+            list.push_back(i);
+            vec.push_back(i);
+        }
+    }
+};
+
+const std::function<int(int)> VPU_MapRangeTest::incFunc = [](int val) { return val + 1; };
+
+const std::function<double(int)> VPU_MapRangeTest::incAndConvertFunc = [](int val)
+        { return static_cast<double>(val + 1); };
+
+TEST_F(VPU_MapRangeTest, PreservesIterationOrder) {
+    auto mapRange = vpu::mapRange(vpu::contRange(vec), incFunc);
+
+    int gold = 0;
+    auto innerIt = vec.cbegin();
+    for (auto cit = mapRange.cbegin(); cit != mapRange.end(); ++cit) {
+        int mappedExpectation = incFunc(*innerIt);
+        ASSERT_EQ(*cit, mappedExpectation) << "Values given by map and inner containers differ";
+        gold++;
+        innerIt++;
+    }
+    ASSERT_EQ(gold, count) << "Owner and inner ranges differ in length";
+}
+
+TEST_F(VPU_MapRangeTest, MapToAnotherType) {
+    auto mapRange = vpu::mapRange(vpu::contRange(vec), incAndConvertFunc);
+
+    int gold = 0;
+    auto innerIt = vec.cbegin();
+    for (auto cit = mapRange.cbegin(); cit != mapRange.end(); ++cit) {
+        const int base = *innerIt;
+        const double mappedExpectation = incAndConvertFunc(base);
+        ASSERT_EQ(*cit, mappedExpectation) << "Values given by map and inner containers differ";
+        gold++;
+        innerIt++;
+    }
+    ASSERT_EQ(gold, count) << "Owner and inner ranges differ in length";
+}
+
+TEST_F(VPU_MapRangeTest, CountSharedPointers) {
+    std::vector<std::shared_ptr<InnerStruct>> nodesExternalVector;
+    vpu::IntrusivePtrList<InnerStruct> list(&InnerStruct::node);
+
+    for (int i = 0; i < count; ++i) {
+        auto innerStructSPtr = std::make_shared<InnerStruct>(i);
+        ASSERT_EQ(1, innerStructSPtr.use_count()) << "single instance of shared pointer";
+        nodesExternalVector.push_back(innerStructSPtr);
+        ASSERT_EQ(2, innerStructSPtr.use_count()) << "stack instance of shared pointer plus copy in vector";
+        list.push_back(innerStructSPtr);
+        ASSERT_EQ(2, innerStructSPtr.use_count()) << "intrusive list keeps weak pointer only";
+    }
+
+    auto mapRange = vpu::mapRange(
+            vpu::contRange(list),
+            [](const vpu::Handle<InnerStruct>& innerPtr) {
+                return incFunc(innerPtr->val);
+            });
+
+    for (int i = 0; i < count; ++i) {
+        ASSERT_EQ(1, nodesExternalVector[i].use_count()) << "intrusive list keeps weak pointer only";
+    }
+}
+
+TEST_F(VPU_MapRangeTest, IterationOverIntrusiveListSurvivesElementRemoval) {
+    std::vector<std::shared_ptr<InnerStruct>> nodesExternalVector;
+    vpu::IntrusivePtrList<InnerStruct> list(&InnerStruct::node);
+
+    for (int i = 0; i < count; ++i) {
+        auto innerStructSPtr = std::make_shared<InnerStruct>(i);
+        nodesExternalVector.push_back(innerStructSPtr);
+        list.push_back(innerStructSPtr);
+    }
+
+    auto mapRange = vpu::mapRange(
+        vpu::contRange(list),
+        [](const vpu::Handle<InnerStruct>& innerPtr) {
+            return incFunc(innerPtr->val);
+        });
+
+    int i = 0;
+    for (auto mprit = mapRange.cbegin(); mprit != mapRange.cend(); ++mprit, ++i) {
+        ASSERT_EQ(2, nodesExternalVector[i].use_count()) << "intrusive list's iterator keeps shared pointer too";
+        ASSERT_EQ(*mprit, incFunc(i)) << "mapped value must conform to increment function";
+
+        list.pop_front();
+
+        ASSERT_EQ(1, nodesExternalVector[i].use_count()) << "removing the element releases the iterator's shared pointer";
+    }
+}
+
+//
+// VPU_FilterRangeTest
+//
+
+class VPU_FilterRangeTest: public ::testing::Test {
+protected:
+    struct InnerStruct final : public vpu::EnableHandleFromThis<InnerStruct> {
+        int val = 0;
+        vpu::IntrusivePtrListNode<InnerStruct> node;
+        explicit InnerStruct(int val) : val(val), node(this) {}
+    };
+
+    const int count = 10;
+    std::vector<int> vec;
+
+    const static std::function<bool(int)> evenFunc;
+
+    virtual void SetUp() override {
+        for (int i = 0; i < count; ++i) {
+            vec.push_back(i);
+        }
+    }
+};
+
+const std::function<bool(int)> VPU_FilterRangeTest::evenFunc = [](int val) { return val % 2 == 0; };
+
+TEST_F(VPU_FilterRangeTest, FilteringOnlyEvenNumbers) {
+    auto filterRange = vpu::filterRange(vpu::contRange(vec), evenFunc);
+
+    int i = 0;
+    for (auto val : filterRange) {
+        ASSERT_EQ(val, i);
+        i += 2;
+    }
+    ASSERT_EQ(i, count);
+}
+
+TEST_F(VPU_FilterRangeTest, FilteringOutFirst) {
+    auto filterRange = vpu::filterRange(
+        vpu::contRange(vec),
+        [](int val) {
+            return val != 0;
+        });
+
+    int gold = 1;
+    for (auto val : filterRange) {
+        ASSERT_EQ(val, gold);
+        gold++;
+    }
+    ASSERT_EQ(gold, count);
+}
+
+TEST_F(VPU_FilterRangeTest, FilteringOutLast) {
+    auto filterRange = vpu::filterRange(
+        vpu::contRange(vec),
+        [&](int val) {
+            return val != count - 1;
+        });
+
+    int gold = 0;
+    for (auto val : filterRange) {
+        ASSERT_EQ(val, gold);
+        gold++;
+    }
+    ASSERT_EQ(gold, count - 1);
+}
+
+TEST_F(VPU_FilterRangeTest, CountSharedPointers) {
+    std::list<std::shared_ptr<InnerStruct>> nodesExternalList;
+    vpu::IntrusivePtrList<InnerStruct> list(&InnerStruct::node);
+
+    for (int i = 0; i < count; ++i) {
+        auto inner = std::make_shared<InnerStruct>(i);
+        ASSERT_EQ(1, inner.use_count()) << "single instance of shared pointer";
+        nodesExternalList.push_back(inner);
+        ASSERT_EQ(2, inner.use_count()) << "stack instance of shared pointer plus copy in list";
+        list.push_back(inner);
+        ASSERT_EQ(2, inner.use_count()) << "intrusive list keeps weak pointer only";
+    }
+
+    for (auto cit = nodesExternalList.cbegin(); cit != nodesExternalList.end(); ++cit) {
+        ASSERT_EQ(1, cit->use_count()) << "intrusive list keeps weak pointer only";
+    }
+
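+    // NonNull filtering presumably drops only expired/null handles; building
+    // the range itself must not take ownership of the elements.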
+    auto filterRange = vpu::filterRange<vpu::NonNull>(vpu::contRange(list));
+
+    for (auto cit = nodesExternalList.cbegin(); cit != nodesExternalList.end(); ++cit) {
+        ASSERT_EQ(1, cit->use_count()) << "intrusive list keeps weak pointer only";
+    }
+}
+
+TEST_F(VPU_FilterRangeTest, IterationOverIntrusiveListSurvivesElementRemoval) {
+    std::list<std::shared_ptr<InnerStruct>> nodesExternalList;
+    vpu::IntrusivePtrList<InnerStruct> list(&InnerStruct::node);
+
+    for (int i = 0; i < count; ++i) {
+        auto inner = std::make_shared<InnerStruct>(i);
+        nodesExternalList.push_back(inner);
+        list.push_back(inner);
+    }
+
+    auto filterRange = vpu::filterRange<vpu::NonNull>(vpu::contRange(list));
+
+    int gold = 0;
+    for (const auto& ptr : filterRange) {
+        ASSERT_EQ(ptr->val, gold);
+        list.pop_front();
+        gold++;
+    }
+    ASSERT_EQ(gold, count);
+
+    for (auto cit = nodesExternalList.cbegin(); cit != nodesExternalList.end(); ++cit) {
+        ASSERT_EQ(1, cit->use_count()) << "intrusive list keeps weak pointer only";
+    }
+}
+
+TEST_F(VPU_FilterRangeTest, IterationOverIntrusiveListWhileElementsBeingRemoved) {
+    std::list<std::shared_ptr<InnerStruct>> nodesExternalList;
+    vpu::IntrusivePtrList<InnerStruct> list(&InnerStruct::node);
+
+    for (int i = 0; i < count; ++i) {
+        auto inner = std::make_shared<InnerStruct>(i);
+        nodesExternalList.push_back(inner);
+        list.push_back(inner);
+    }
+
+    auto filterRange = vpu::filterRange(
+        vpu::contRange(list),
+        [](const vpu::Handle<InnerStruct>& innerPtr) {
+            return evenFunc(innerPtr->val);
+        });
+
+    int gold = 0;
+    for (const auto& ptr : filterRange) {
+        ASSERT_EQ(ptr->val, gold);
+        // remove even & odd front elems
+        list.pop_front();
+        list.pop_front();
+        gold += 2;
+    }
+    ASSERT_EQ(gold, count);
+
+    for (auto cit = nodesExternalList.cbegin(); cit != nodesExternalList.end(); ++cit) {
+        ASSERT_EQ(1, cit->use_count()) << "intrusive list keeps weak pointer only";
+    }
+}
+
+TEST_F(VPU_FilterRangeTest, IterationOverEmptyIntrusiveListWhereAllElementsFilteredOut) {
+    std::list<std::shared_ptr<InnerStruct>> nodesExternalList;
+    vpu::IntrusivePtrList<InnerStruct> list(&InnerStruct::node);
+
+    for (int i = 0; i < count; ++i) {
+        auto inner = std::make_shared<InnerStruct>(i);
+        nodesExternalList.push_back(inner);
+        list.push_back(inner);
+    }
+
+    auto filterRange = vpu::filterRange(
+        vpu::contRange(list),
+        [](const vpu::Handle<InnerStruct>& innerPtr) {
+            return (innerPtr->val < 0);
+        });
+
+    for (const auto& ptr : filterRange) {
+        ASSERT_TRUE(false) << "Must not see any item in filtered list";
+    }
+
+    for (auto cit = list.cbegin(); cit != list.cend(); ++cit) {
+        if (evenFunc((*cit)->val)) {
+            list.erase(cit);
+        }
+    }
+
+    for (const auto& ptr : filterRange) {
+        ASSERT_TRUE(false) << "Must not see any item in filtered list";
+    }
+
+    for (auto cit = nodesExternalList.cbegin(); cit != nodesExternalList.end(); ++cit) {
+        ASSERT_EQ(1, cit->use_count()) << "intrusive list keeps weak pointer only";
+    }
+}
diff --git a/inference-engine/tests/unit/engines/vpu/replace_deconv_by_conv_tests.cpp b/inference-engine/tests/unit/engines/vpu/replace_deconv_by_conv_tests.cpp
new file mode 100644 (file)
index 0000000..26b4394
--- /dev/null
@@ -0,0 +1,154 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/hw/mx_stage.hpp>
+#include <vpu/hw/utility.hpp>
+
+#include "graph_transformer_tests.hpp"
+using namespace InferenceEngine;
+class VPU_ReplaceDeconvByConvTest : public VPU_GraphTransformerTest {
+ protected:
+    vpu::PassSet pipeline;
+    vpu::Model::Ptr model;
+
+
+ public:
+    void InitDeconvStage(
+        int kernelx,
+        int kernely,
+        int inputX=16,
+        int inputY=16,
+        bool onlySwConvAdaptation = false,
+        bool isOutput4D = true) {
+
+        int kernelStrideX = 1;
+        int kernelStrideY = 1;
+        int dilationX = 1;
+        int dilationY = 1;
+        model = CreateModel();
+
+        auto input = model->addInputData(
+            "Input",
+            vpu::DataDesc(vpu::DataType::FP16, vpu::DimsOrder::NCHW, {inputX, inputY, 2, 1}));
+        model->attrs().set<int>("numInputs", 1);
+
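+        // Deconvolution output size with zero padding:
+        // out = kernel + (in - 1) * stride.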
+        vpu::Data output;
+        if (isOutput4D) {
+            output = model->addOutputData(
+                "Output",
+                vpu::DataDesc(vpu::DataType::FP16,
+                              vpu::DimsOrder::NCHW,
+                              {kernelx + (inputX - 1) * kernelStrideX, kernely + (inputY - 1) * kernelStrideY, 2, 1}));
+        } else {
+            output = model->addOutputData(
+                "Output",
+                vpu::DataDesc(vpu::DataType::FP16,
+                              vpu::DimsOrder::CHW,
+                              {kernelx + (inputX - 1) * kernelStrideX, kernely + (inputY - 1) * kernelStrideY, 2}));
+        }
+
+        auto deconv = std::make_shared<DeconvolutionLayer>(LayerParams{"deconv", "Deconvolution", Precision::FP16});
+        deconv->_kernel_x = kernelx;
+        deconv->_kernel_y = kernely;
+        deconv->_stride_x = kernelStrideX;
+        deconv->_stride_y = kernelStrideY;
+        deconv->_dilation_x = dilationX;
+        deconv->_dilation_y = dilationY;
+
+        deconv->_weights = make_shared_blob<short>(Precision::FP16, Layout::C, {static_cast<size_t>(kernelx * kernely * 2 * 2)});
+        deconv->_weights->allocate();
+
+        frontEnd->parseDeconvolution(model, deconv, {input}, {output});
+
+        pipeline.addPass(passManager->dumpModel("initial"));
+
+        // if the deconv was converted to a conv then swConvAdaptation will work - if not, we will get an exception
+        pipeline.addPass(passManager->replaceDeconvByConv());
+        pipeline.addPass(passManager->dumpModel("replaceDeconvByConv"));
+
+        pipeline.addPass(passManager->swConvAdaptation());
+        pipeline.addPass(passManager->dumpModel("swConvAdaptation"));
+
+        if (!onlySwConvAdaptation) {
+            pipeline.addPass(passManager->adjustDataLayout());
+            pipeline.addPass(passManager->dumpModel("adjustDataLayout"));
+
+            pipeline.addPass(passManager->processSpecialStages());
+            pipeline.addPass(passManager->dumpModel("processSpecialStages"));
+
+            pipeline.addPass(passManager->adjustDataLocation());
+            pipeline.addPass(passManager->dumpModel("adjustDataLocation"));
+
+            pipeline.addPass(passManager->finalCheck());
+        }
+    }
+};
+
+TEST_F(VPU_ReplaceDeconvByConvTest, deconvReplacedByConvIfKernelSizeFitsHWUnit) {
+    InitCompileEnv();
+    InitDeconvStage(15, 15);
+
+    ASSERT_NO_THROW(pipeline.run(model));
+}
+
+TEST_F(VPU_ReplaceDeconvByConvTest, deconvCannotBeReplacedByConvIfDisabledInConfig) {
+    config.hwBlackList = "deconv";
+    InitCompileEnv();
+    InitDeconvStage(16, 15);
+
+    ASSERT_ANY_THROW(pipeline.run(model));
+}
+
+TEST_F(VPU_ReplaceDeconvByConvTest, deconvCannotBeReplacedByConvIfKernelSizeXToBig) {
+    InitCompileEnv();
+    InitDeconvStage(16, 15);
+
+    ASSERT_ANY_THROW(pipeline.run(model));
+}
+
+TEST_F(VPU_ReplaceDeconvByConvTest, deconvCannotBeReplacedByConvIfKernelSizeYToBig) {
+    InitCompileEnv();
+    InitDeconvStage(15, 16);
+
+    ASSERT_ANY_THROW(pipeline.run(model));
+}
+
+TEST_F(VPU_ReplaceDeconvByConvTest, deconvCannotBeReplacedByConvIfOutputNot4D) {
+    InitCompileEnv();
+    InitDeconvStage(15, 15, 16, 16, false, false);
+
+    ASSERT_ANY_THROW(pipeline.run(model));
+}
+
+TEST_F(VPU_ReplaceDeconvByConvTest, canDetectIm2CollBufferFitToBSS) {
+    InitCompileEnv();
+    // the im2col buffer here will be about 200 MB, so an exception gets thrown, but not in the im2col verification, as the next test shows
+    InitDeconvStage(15, 15, 16, 20000);
+
+    ASSERT_ANY_THROW(pipeline.run(model));
+}
+
+TEST_F(VPU_ReplaceDeconvByConvTest, canNotDetectIm2CollBufferOverFlow) {
+    InitCompileEnv();
+    // run only sw conv adaptation - the big output might not fit SHAVE/CMX memory in later passes, and here the im2col check should not fire
+    InitDeconvStage(15, 15, 16, 20000, true);
+
+    ASSERT_NO_THROW(pipeline.run(model));
+}
+
+TEST_F(VPU_ReplaceDeconvByConvTest, canDetectIm2CollBufferOverFlow) {
+    InitCompileEnv();
+    // run only sw conv adaptation - the big output might not fit SHAVE/CMX memory in later passes; here the im2col overflow must be detected
+    InitDeconvStage(15, 15, 5016, 20000, true);
+
+    ASSERT_ANY_THROW(pipeline.run(model));
+}
+
+TEST_F(VPU_ReplaceDeconvByConvTest, canDetectCMXOrShavesMemoryLimit) {
+    InitCompileEnv();
+    // full pipeline this time - the big output should exceed SHAVE/CMX memory limits and trigger an exception
+    InitDeconvStage(15, 15, 16, 20000);
+
+    ASSERT_ANY_THROW(pipeline.run(model));
+}
index 368e8fd..5b48fb4 100644 (file)
@@ -61,8 +61,7 @@ TEST_F(DeviceTests, returnsProperDeviceName) {
     ASSERT_STREQ(getDeviceName(TargetDevice::eMYRIAD), "MYRIAD");
     ASSERT_STREQ(getDeviceName(TargetDevice::eGNA), "GNA");
     ASSERT_STREQ(getDeviceName(TargetDevice::eHETERO), "HETERO");
-    ASSERT_STREQ(getDeviceName(TargetDevice::eKMB), "KMB");
     ASSERT_STREQ(getDeviceName(static_cast<TargetDevice>(-1)), "Unknown device");
     //off by one test - might not be enough
-    ASSERT_STREQ(getDeviceName(static_cast<TargetDevice>((uint8_t)TargetDevice::eKMB + 1)), "Unknown device");
+    ASSERT_STREQ(getDeviceName(static_cast<TargetDevice>((uint8_t)TargetDevice::eHETERO + 1)), "Unknown device");
 }
index 673d5c7..16b64c2 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index b2d2671..fa338e2 100644 (file)
@@ -2,14 +2,26 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
+#if defined _WIN32
+// Prevent Windows.h from pulling in the winsock library.
+#define _WINSOCKAPI_
+// Prevent Windows.h from defining the min/max macros.
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#endif  // _WIN32
+
 #include "tests_common.hpp"
 #include "mock_plugin_dispatcher.hpp"
 #include <gtest/gtest.h>
 #include <gmock/gmock.h>
+#include <string>
 #include "ie_plugin_dispatcher.hpp"
 #include "ie_plugin_ptr.hpp"
 #include "ie_device.hpp"
 
+
 using namespace InferenceEngine;
 using namespace ::testing;
 
@@ -23,6 +35,61 @@ TEST_F(PluginDispatcherTests, canLoadMockPlugin) {
     ASSERT_NO_THROW(dispatcher.getPluginByName(nameExt("mock_engine")));
 }
 
+#if defined _WIN32
+
+class SetDllDirectoryCaller {
+public:
+    /// Call SetDllDirectory if not called before
+    SetDllDirectoryCaller(const char* path) {
+        // Check whether the user already called SetDllDirectory with an
+        // actual directory: GetDllDirectory(0, nullptr) returns the required
+        // buffer size including the terminator, so 1 or less means none is set.
+        call_setdlldirectory = (1 >= GetDllDirectory(0, nullptr));
+        if (call_setdlldirectory) {
+            SetDllDirectory(path);
+        }
+    }
+    /// Restore the DLL search order to the default
+    ~SetDllDirectoryCaller() {
+        if (call_setdlldirectory)
+            SetDllDirectory(nullptr);
+    }
+
+    bool call_setdlldirectory;
+
+    // Non copyable or movable
+    SetDllDirectoryCaller(const SetDllDirectoryCaller&) = delete;
+    SetDllDirectoryCaller& operator=(const SetDllDirectoryCaller&) = delete;
+};
+
+TEST_F(PluginDispatcherTests, canLoadMockPluginAndRetainSetDllDirectory) {
+    // a test pre-requisite that SetDllDirectory is not configured
+    ASSERT_EQ(1, GetDllDirectory(0, nullptr));
+
+    // try modify DLL search order with SetDllDirectory
+    const char *set_dir = "12345";
+    char get_dir[6] = {0};
+    SetDllDirectoryCaller set_dll_directory_caller(set_dir);
+
+    PluginDispatcher dispatcher({ "", "./", "./lib" });
+    ASSERT_NO_THROW(dispatcher.getPluginByName(nameExt("mock_engine")));
+
+    // verify DLL search order retained
+    ASSERT_EQ(sizeof(get_dir), GetDllDirectory(0, nullptr));
+    ASSERT_NE(0, GetDllDirectory(sizeof(get_dir), get_dir));
+    ASSERT_EQ(std::string(get_dir), std::string(set_dir));
+}
+
+TEST_F(PluginDispatcherTests, canLoadMockPluginAndKeepDefaultDLLSearch) {
+    // a test pre-requisite that SetDllDirectory is not configured
+    ASSERT_EQ(1, GetDllDirectory(0, nullptr));
+
+    PluginDispatcher dispatcher({ "", "./", "./lib" });
+    ASSERT_NO_THROW(dispatcher.getPluginByName(nameExt("mock_engine")));
+
+    // verify DLL search order is still default
+    ASSERT_EQ(1, GetDllDirectory(0, nullptr));
+}
+#endif
+
 TEST_F(PluginDispatcherTests, throwsOnUnknownPlugin) {
     PluginDispatcher dispatcher({ "./", "./lib" });
     ASSERT_THROW(dispatcher.getPluginByName(nameExt("unknown_plugin")), InferenceEngine::details::InferenceEngineException);
index 5a4248f..16b6c55 100644 (file)
@@ -1,20 +1,8 @@
-#
-# Copyright (C) 2018-2019 Intel Corporation.
-#
-# This software and the related documents are Intel copyrighted materials,
-# and your use of them is governed by the express license under which they
-# were provided to you (End User License Agreement for the Intel(R) Software
-# Development Products (Version May 2017)). Unless the License provides
-# otherwise, you may not use, modify, copy, publish, distribute, disclose or
-# transmit this software or the related documents without Intel's prior
-# written permission.
-#
-# This software and the related documents are provided as is, with no
-# express or implied warranties, other than those that are expressly
-# stated in the License.
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
 #
 
-if(NOT ENABLE_GAPI_TESTS)
+if(NOT ENABLE_GAPI_TESTS OR WIN32)
     message(WARNING "Skipping GAPI unit tests")
     return()
 endif()
@@ -25,6 +13,8 @@ if(NOT OpenCV_FOUND)
     return()
 endif()
 
+include_directories(${IE_MAIN_SOURCE_DIR}/src/inference_engine)
+
 add_subdirectory(fluid_test_computations)
 
 file(GLOB SOURCES *.cpp common/*.cpp cpu/*.cpp)
index 5ade83a..998171a 100644 (file)
@@ -1,17 +1,5 @@
-#
-# Copyright 2019 Intel Corporation.
-#
-# This software and the related documents are Intel copyrighted materials,
-# and your use of them is governed by the express license under which they
-# were provided to you (End User License Agreement for the Intel(R) Software
-# Development Products (Version May 2017)). Unless the License provides
-# otherwise, you may not use, modify, copy, publish, distribute, disclose or
-# transmit this software or the related documents without Intel's prior
-# written permission.
-#
-# This software and the related documents are provided as is, with no
-# express or implied warranties, other than those that are expressly
-# stated in the License.
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
 #
 
 file(GLOB SRC *.cpp)
index 9efd2ee..17845d8 100644 (file)
@@ -1,3 +1,7 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
 #include <fluid_test_computations.hpp>
 #include <opencv2/gapi.hpp>
 #include <ie_preprocess_gapi_kernels.hpp>
diff --git a/inference-engine/tests/unit/samples/config_register.cpp b/inference-engine/tests/unit/samples/config_register.cpp
new file mode 100644 (file)
index 0000000..44ccce4
--- /dev/null
@@ -0,0 +1,39 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <gtest/gtest.h>
+#include <gflags/gflags.h>
+#include <Configuration.h>
+#include <args_parser.h>
+
+class RegisterConfigTests : public ::testing::Test {
+protected:
+    virtual void TearDown() {
+    }
+
+    virtual void SetUp() {
+    }
+
+public:
+    RegisterConfigTests() : Test() {
+    }
+
+    virtual ~RegisterConfigTests() {
+    }
+};
+
+REGISTER_STRING_PARAM(model, m, "!", "someModelMessage", MODEL);
+REGISTER_STRING_PARAM(plugin, p, "", "somePluginMessage", PLUGIN);
+REGISTER_UINT32_PARAM(niter, ni, 3, "someIterNumMessage", ITER_NUM);
+REGISTER_BOOL_PARAM(perf_count, pc, false, "somePerfCountMessage", PERF_COUNT);
+
+TEST_F(RegisterConfigTests, canRegisterParams) {
+    Configuration config;
+    RegisterConfig::RegisterConfigBinding::deploy(config);
+    ASSERT_STREQ(config.value<MODEL>().c_str(), "!");
+    ASSERT_STREQ(config.value<PLUGIN>().c_str(), "");
+    ASSERT_EQ(config.value<ITER_NUM>(), 3);
+    ASSERT_FALSE(config.value<PERF_COUNT>());
+}
+
diff --git a/inference-engine/tests/unit/samples/samples_core.cpp b/inference-engine/tests/unit/samples/samples_core.cpp
new file mode 100644 (file)
index 0000000..b770438
--- /dev/null
@@ -0,0 +1,52 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <gtest/gtest.h>
+#include "../../tests/performance/core/Configuration.h"
+
+class SampleCoreTests : public ::testing::Test {
+protected:
+    virtual void TearDown() {
+    }
+
+    virtual void SetUp() {
+    }
+
+public:
+    SampleCoreTests() : Test() {
+    }
+
+    virtual ~SampleCoreTests() {
+    }
+};
+
+
+TEST_F(SampleCoreTests, canSetValue) {
+    Configuration config;
+    config.setValue<MODEL>("Model");
+    ASSERT_STREQ(config.value<MODEL>().c_str(), "Model");
+    ASSERT_FALSE(config.value<MODEL>().empty());
+}
+
+TEST_F(SampleCoreTests, canAddImages) {
+    Configuration config;
+    std::vector<std::string>& images = config.value<IMAGES>();
+    images.push_back("smth");
+
+    ASSERT_FALSE(config.value<IMAGES>().empty());
+}
+
+TEST_F(SampleCoreTests, canCreateConstConfig) {
+    Configuration config;
+    config.value<PLUGIN_PATHS>() = { "plugin_paths" };
+    config.value<MODEL>() = "Model";
+    config.value<PLUGIN>() = "Plugin";
+    std::string path = config.value<PLUGIN_PATHS>().at(0);
+
+    const Configuration config2 = config;
+
+    ASSERT_FALSE(config2.value<PLUGIN_PATHS>().empty());
+    ASSERT_FALSE(config.value<MODEL>().empty());
+    ASSERT_STREQ(config.value<PLUGIN>().c_str(), "Plugin");
+}
index 44478ad..8db725b 100644 (file)
@@ -1,3 +1,7 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
 #include <utility>
 
 #include <utility>
index 451799c..14785a6 100644 (file)
@@ -1,3 +1,7 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
 #include <utility>
 
 #include <utility>
index 83f48ef..23f27c0 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 9e2f935..85b9b4a 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 04be08c..8b5bc9a 100644 (file)
@@ -1,17 +1,5 @@
-#
-# Copyright (C) 2018-2019 Intel Corporation.
-#
-# This software and the related documents are Intel copyrighted materials,
-# and your use of them is governed by the express license under which they
-# were provided to you (End User License Agreement for the Intel(R) Software
-# Development Products (Version May 2017)). Unless the License provides
-# otherwise, you may not use, modify, copy, publish, distribute, disclose or
-# transmit this software or the related documents without Intel's prior
-# written permission.
-#
-# This software and the related documents are provided as is, with no
-# express or implied warranties, other than those that are expressly
-# stated in the License.
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
 #
 
 set (TARGET_NAME "test_validation_app")
index 9695a69..00ae661 100644 (file)
Binary files a/inference-engine/thirdparty/clDNN/docs/img/bf8_xy16.jpg and b/inference-engine/thirdparty/clDNN/docs/img/bf8_xy16.jpg differ
index 45412f6..28c8d72 100644 (file)
Binary files a/inference-engine/thirdparty/clDNN/docs/img/bfyx.jpg and b/inference-engine/thirdparty/clDNN/docs/img/bfyx.jpg differ
index 2551b7b..d40dad9 100644 (file)
Binary files a/inference-engine/thirdparty/clDNN/docs/img/bs_x_bsv16.jpg and b/inference-engine/thirdparty/clDNN/docs/img/bs_x_bsv16.jpg differ
index 7312670..95cb07c 100644 (file)
Binary files a/inference-engine/thirdparty/clDNN/docs/img/bs_xs_xsv8_bsv16.jpg and b/inference-engine/thirdparty/clDNN/docs/img/bs_xs_xsv8_bsv16.jpg differ
index 93f0ec1..72c2d17 100644 (file)
Binary files a/inference-engine/thirdparty/clDNN/docs/img/bs_xs_xsv8_bsv8.jpg and b/inference-engine/thirdparty/clDNN/docs/img/bs_xs_xsv8_bsv8.jpg differ
index 5398b2f..b6882ed 100644 (file)
Binary files a/inference-engine/thirdparty/clDNN/docs/img/byxf.jpg and b/inference-engine/thirdparty/clDNN/docs/img/byxf.jpg differ
index 27a87c1..7a97c6e 100644 (file)
Binary files a/inference-engine/thirdparty/clDNN/docs/img/crop_no_offset.jpg and b/inference-engine/thirdparty/clDNN/docs/img/crop_no_offset.jpg differ
index de87b8a..a064a66 100644 (file)
Binary files a/inference-engine/thirdparty/clDNN/docs/img/crop_w_offset.jpg and b/inference-engine/thirdparty/clDNN/docs/img/crop_w_offset.jpg differ
index ab70cec..71a2862 100644 (file)
Binary files a/inference-engine/thirdparty/clDNN/docs/img/image_2d_weights_c1_b_fyx.jpg and b/inference-engine/thirdparty/clDNN/docs/img/image_2d_weights_c1_b_fyx.jpg differ
index 537508c..c9def19 100644 (file)
Binary files a/inference-engine/thirdparty/clDNN/docs/img/image_2d_weights_c4_fyx_b.jpg and b/inference-engine/thirdparty/clDNN/docs/img/image_2d_weights_c4_fyx_b.jpg differ
index 3e3ab8b..633b1f8 100644 (file)
Binary files a/inference-engine/thirdparty/clDNN/docs/img/layout_memory_representation.jpg and b/inference-engine/thirdparty/clDNN/docs/img/layout_memory_representation.jpg differ
index 052872c..cbbea2a 100644 (file)
Binary files a/inference-engine/thirdparty/clDNN/docs/img/os_iyx_osv16.jpg and b/inference-engine/thirdparty/clDNN/docs/img/os_iyx_osv16.jpg differ
index ba67641..ae1d393 100644 (file)
Binary files a/inference-engine/thirdparty/clDNN/docs/img/workflow.jpg and b/inference-engine/thirdparty/clDNN/docs/img/workflow.jpg differ
index 601bf08..80559e5 100644 (file)
Binary files a/inference-engine/thirdparty/clDNN/docs/img/yxfb.jpg and b/inference-engine/thirdparty/clDNN/docs/img/yxfb.jpg differ
index 0b9a480..a5b601f 100644 (file)
@@ -1,24 +1,5 @@
 #!/usr/bin/env python2
 
-# INTEL CONFIDENTIAL
-# Copyright 2016 Intel Corporation
-#
-# The source code contained or described herein and all documents related to the source code ("Material") are owned by
-# Intel Corporation or its suppliers or licensors. Title to the Material remains with Intel Corporation or its
-# suppliers and licensors. The Material contains trade secrets and proprietary and confidential information of Intel
-# or its suppliers and licensors. The Material is protected by worldwide copyright and trade secret laws and treaty
-# provisions. No part of the Material may be used, copied, reproduced, modified, published, uploaded, posted,
-# transmitted, distributed, or disclosed in any way without Intel's prior express written permission.
-#
-# No license under any patent, copyright, trade secret or other intellectual property right is granted to
-# or conferred upon you by disclosure or delivery of the Materials, either expressly, by implication, inducement,
-# estoppel or otherwise. Any license under such intellectual property rights must be express and approved by Intel
-# in writing.
-#
-#
-# For details about script please contact following people:
-#  * [Version: 1.0] Walkowiak, Marcin <marcin.walkowiak@intel.com>
-
 import argparse
 import os
 
index a0a2d1a..d1baf95 100644 (file)
@@ -505,14 +505,14 @@ mkldnn_status_t MKLDNN_API mkldnn_post_ops_get_params_dw_conv(
  * where binarization_op is configured with given parameters.
  */
 mkldnn_status_t MKLDNN_API mkldnn_post_ops_append_binarization(
-        mkldnn_post_ops_t post_ops, mkldnn_alg_kind_t alg, const float* weights_data);
+        mkldnn_post_ops_t post_ops, mkldnn_alg_kind_t alg, const float* weights_data, const float* output_mask);
 
 /** Gets the binarization parameters of the post operation with index @p index in
  * the sequence of @p post_ops.
  */
 mkldnn_status_t MKLDNN_API mkldnn_post_ops_get_params_binarization(
         const_mkldnn_post_ops_t post_ops, int index,
-        mkldnn_alg_kind_t *alg, const float** weights_data);
+        mkldnn_alg_kind_t *alg, const float** weights_data, const float** output_mask);
 
 /** @} */
 
@@ -1821,7 +1821,8 @@ mkldnn_status_t MKLDNN_API mkldnn_dilated_binary_convolution_forward_desc_init(
 mkldnn_status_t MKLDNN_API mkldnn_binarization_forward_desc_init(
         mkldnn_binarization_desc_t *binarization_desc, mkldnn_prop_kind_t prop_kind,
         mkldnn_alg_kind_t alg_kind, const mkldnn_memory_desc_t *src_desc,
-        const mkldnn_memory_desc_t *dst_desc, const mkldnn_memory_desc_t *weights_desc);
+        const mkldnn_memory_desc_t *dst_desc, const mkldnn_memory_desc_t *weights_desc,
+        const mkldnn_memory_desc_t *output_mask_desc);
 
 /** @} */
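
Taken together, the two C-API hunks above extend the binarization post-op with a second per-channel array: an output mask is now passed next to the thresholds on append and returned on query. A usage sketch against the updated signatures (OC, the channel count, and the zero-filled values are placeholders; error checking is elided):

    #include <mkldnn.h>
    #include <vector>

    void setup_binarization_post_op() {
        const int OC = 16;                          // assumed channel count
        std::vector<float> thresholds(OC, 0.0f);    // per-channel thresholds
        std::vector<float> output_mask(OC, 0.0f);   // per-channel output mask

        mkldnn_post_ops_t ops;
        mkldnn_post_ops_create(&ops);
        mkldnn_post_ops_append_binarization(ops, mkldnn_binarization_depthwise,
                                            thresholds.data(), output_mask.data());

        // Querying the post-op now also yields the mask pointer.
        mkldnn_alg_kind_t alg;
        const float *weights_data, *mask_data;
        mkldnn_post_ops_get_params_binarization(ops, 0, &alg,
                                                &weights_data, &mask_data);
        mkldnn_post_ops_destroy(ops);
    }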
 
index 2ce46c9..683c62e 100644 (file)
@@ -290,7 +290,7 @@ enum algorithm {
     roi_pooling_max = mkldnn_roi_pooling_max,
     roi_pooling_bilinear = mkldnn_roi_pooling_bilinear,
     binary_convolution_direct = mkldnn_binary_convolution_direct,
-    binarization_depthwise = mkldnn_binarization_depthwise
+    binarization_depthwise = mkldnn_binarization_depthwise,
 };
 
 inline mkldnn_alg_kind_t convert_to_c(algorithm aalgorithm) {
@@ -453,14 +453,14 @@ struct post_ops: public handle<mkldnn_post_ops_t> {
                           "could not get dw conv params");
     }
 
-    void append_binarization(algorithm alg, const float* weights_data) {
-        error::wrap_c_api(mkldnn_post_ops_append_binarization(get(), convert_to_c(alg), weights_data),
+    void append_binarization(algorithm alg, const float* weights_data, const float* output_mask) {
+        error::wrap_c_api(mkldnn_post_ops_append_binarization(get(), convert_to_c(alg), weights_data, output_mask),
                 "could not append binarization");
     }
 
-    void get_params_binarization(int index, algorithm &alg, const float** weights_data) const {
+    void get_params_binarization(int index, algorithm &alg, const float** weights_data, const float** output_mask) const {
         mkldnn_alg_kind_t c_alg;
-        error::wrap_c_api(mkldnn_post_ops_get_params_binarization(get(), index, &c_alg, weights_data),
+        error::wrap_c_api(mkldnn_post_ops_get_params_binarization(get(), index, &c_alg, weights_data, output_mask),
                 "could not get binarization params");
         alg = static_cast<algorithm>(c_alg);
     }
@@ -3523,12 +3523,13 @@ struct binarization_forward : public primitive {
         mkldnn_binarization_desc_t data;
 
         desc(prop_kind aprop_kind, algorithm alg_kind,
-             const memory::desc &src_desc, const memory::desc &weights_desc, const memory::desc &dst_desc) {
+             const memory::desc &src_desc, const memory::desc &weights_desc, const memory::desc &output_mask_desc,
+             const memory::desc &dst_desc) {
             error::wrap_c_api(mkldnn_binarization_forward_desc_init(&data,
                                                                  mkldnn::convert_to_c(aprop_kind),
                                                                  mkldnn::convert_to_c(alg_kind),
                                                                  &src_desc.data, &dst_desc.data,
-                                                                 &weights_desc.data),
+                                                                 &weights_desc.data, &output_mask_desc.data),
                               "could not create a binarization forward descriptor");
         }
     };
@@ -3546,9 +3547,10 @@ struct binarization_forward : public primitive {
     };
 
     binarization_forward(const primitive_desc &aprimitive_desc,
-                      const primitive::at &src, const primitive::at &weights, const memory &dst) {
+                      const primitive::at &src, const primitive::at &weights, const primitive::at &output_mask,
+                      const memory &dst) {
         mkldnn_primitive_t result;
-        mkldnn_primitive_at_t inputs[] = { src.data, weights.data };
+        mkldnn_primitive_at_t inputs[] = { src.data, weights.data, output_mask.data};
         const_mkldnn_primitive_t outputs[] = { dst.get() };
         error::wrap_c_api(mkldnn_primitive_create(&result, aprimitive_desc.get(), inputs, outputs),
                           "could not create a binarization forward primitive");
index a86eb66..ca6c37d 100644 (file)
@@ -1159,6 +1159,8 @@ typedef struct {
     mkldnn_memory_desc_t dst_desc;
     /** Weights memory descriptor. */
     mkldnn_memory_desc_t weights_desc;
+    /** Output mask memory descriptor. */
+    mkldnn_memory_desc_t output_mask_desc;
 } mkldnn_binarization_desc_t;
 
 /** @} */
index f6ab0c0..e03babd 100644 (file)
@@ -32,9 +32,9 @@ using namespace mkldnn::impl::types;
 namespace {
 status_t binarization_desc_init(binarization_desc_t *binarization_desc, prop_kind_t prop_kind,
         alg_kind_t alg_kind, const memory_desc_t *src_desc, const memory_desc_t *dst_desc,
-        const memory_desc_t *weights_desc) {
+        const memory_desc_t *weights_desc, const memory_desc_t *output_mask_desc) {
     bool args_ok = true
-        && !any_null(binarization_desc, src_desc, dst_desc, weights_desc)
+        && !any_null(binarization_desc, src_desc, dst_desc, weights_desc, output_mask_desc)
         && one_of(prop_kind, forward_training, forward_inference)
         && one_of(alg_kind, binarization_depthwise);
     if (!args_ok) return invalid_arguments;
@@ -46,6 +46,7 @@ status_t binarization_desc_init(binarization_desc_t *binarization_desc, prop_kin
     bd.src_desc = *src_desc;
     bd.dst_desc = *dst_desc;
     bd.weights_desc = *weights_desc;
+    bd.output_mask_desc = *output_mask_desc;
 
     bool consistency = true
         && memory_desc_wrapper(bd.src_desc).nelems()
@@ -59,8 +60,9 @@ status_t binarization_desc_init(binarization_desc_t *binarization_desc, prop_kin
 
 status_t mkldnn_binarization_forward_desc_init(binarization_desc_t *binarization_desc,
         prop_kind_t prop_kind, alg_kind_t alg_kind,
-        const memory_desc_t *src_desc, const memory_desc_t *dst_desc, const memory_desc_t *weights_desc) {
+        const memory_desc_t *src_desc, const memory_desc_t *dst_desc, const memory_desc_t *weights_desc,
+        const memory_desc_t *output_mask_desc) {
     if (!one_of(prop_kind, forward_training, forward_inference))
         return invalid_arguments;
-    return binarization_desc_init(binarization_desc, prop_kind, alg_kind, src_desc, dst_desc, weights_desc);
+    return binarization_desc_init(binarization_desc, prop_kind, alg_kind, src_desc, dst_desc, weights_desc, output_mask_desc);
 }
index 1450230..95016b9 100644 (file)
@@ -47,14 +47,14 @@ struct binarization_fwd_pd_t: public primitive_desc_t {
     virtual const memory_pd_t *input_pd(int index = 0) const override {
         switch (index) {
         case 0: return src_pd();
-        case 1: return weights_pd(index - 1);
+        case 1: case 2: return weights_pd(index - 1);
         default: return nullptr;
         }
     }
     virtual const memory_pd_t *output_pd(int index = 0) const override
     { return index == 0 ? dst_pd() : nullptr; }
 
-    virtual int n_inputs() const override { return 2; }
+    virtual int n_inputs() const override { return 3; }
     virtual int n_outputs() const override { return 1; }
 
     virtual status_t query(query_t what, int idx, void *result) const override
index d48ab95..18eab57 100644 (file)
@@ -129,7 +129,7 @@ status_t post_ops_t::append_dw_conv(int in_h, int in_w, int ker_h, int ker_w, in
     return success;
 }
 
-status_t post_ops_t::append_binarization(alg_kind_t alg, const float* weights_data) {
+status_t post_ops_t::append_binarization(alg_kind_t alg, const float* weights_data, const float* output_mask_data) {
     using namespace mkldnn::impl::alg_kind;
     bool known_alg = one_of(alg, binarization_depthwise);
     if (!known_alg)
@@ -141,6 +141,7 @@ status_t post_ops_t::append_binarization(alg_kind_t alg, const float* weights_da
     entry_[len_].kind = primitive_kind::binarization;
     entry_[len_].binarization.alg = alg;
     entry_[len_].binarization.weights_data = weights_data;
+    entry_[len_].binarization.output_mask_data = output_mask_data;
 
     len_++;
 
@@ -405,24 +406,26 @@ status_t mkldnn_post_ops_get_params_dw_conv(const post_ops_t *post_ops,
     return success;
 }
 
-status_t mkldnn_post_ops_append_binarization(post_ops_t *post_ops, alg_kind_t kind, const float* weights_data) {
+status_t mkldnn_post_ops_append_binarization(post_ops_t *post_ops, alg_kind_t kind, const float* weights_data,
+        const float* output_mask_data) {
     if (post_ops == nullptr)
         return invalid_arguments;
 
-    return post_ops->append_binarization(kind, weights_data);
+    return post_ops->append_binarization(kind, weights_data, output_mask_data);
 }
 
 status_t mkldnn_post_ops_get_params_binarization(const post_ops_t *post_ops, int index, alg_kind_t *alg,
-        const float** weights_data) {
+        const float** weights_data, const float** output_mask_data) {
     bool ok = true
         && simple_get_params_check(post_ops, index, primitive_kind::binarization)
-        && !any_null(alg, weights_data);
+        && !any_null(alg, weights_data, output_mask_data);
     if (!ok)
         return invalid_arguments;
 
     const auto &e = post_ops->entry_[index].binarization;
     *alg = e.alg;
     *weights_data = e.weights_data;
+    *output_mask_data = e.output_mask_data;
 
     return success;
 }
index 949449f..887952f 100644 (file)
@@ -119,6 +119,7 @@ struct mkldnn_post_ops: public mkldnn::impl::c_compatible {
             struct {
                 mkldnn::impl::alg_kind_t alg;
                 const float* weights_data;
+                const float* output_mask_data;
             } binarization;
         };
 
@@ -167,7 +168,8 @@ struct mkldnn_post_ops: public mkldnn::impl::c_compatible {
     mkldnn::impl::status_t append_dw_conv(int in_h, int in_w, int ker_h, int ker_w, int str_h, int str_w,
                                           const float* weights_data,
                                           const float* biases_data);
-    mkldnn::impl::status_t append_binarization(mkldnn::impl::alg_kind_t alg, const float* weights_data);
+    mkldnn::impl::status_t append_binarization(mkldnn::impl::alg_kind_t alg, const float* weights_data,
+                                               const float* output_mask_data);
 
     int find(mkldnn::impl::primitive_kind_t kind, int start = 0,
             int stop = -1) const {
index 05d1059..b10a4e5 100644 (file)
@@ -39,7 +39,8 @@ struct cpu_binarization_fwd_pd_t: public binarization_fwd_pd_t {
         : binarization_fwd_pd_t(engine, adesc, attr, hint_fwd_pd)
         , src_pd_(engine_, &desc_.src_desc)
         , dst_pd_(engine_, &desc_.dst_desc)
-        , weights_pd_(engine_, &desc_.weights_desc) {}
+        , weights_pd_(engine_, &desc_.weights_desc)
+        , output_mask_pd_(engine_, &desc_.output_mask_desc) {}
     virtual ~cpu_binarization_fwd_pd_t() {}
 
     virtual const cpu_memory_pd_t *src_pd(int index = 0) const override
@@ -48,11 +49,12 @@ struct cpu_binarization_fwd_pd_t: public binarization_fwd_pd_t {
     { return index == 0 ? &dst_pd_ : nullptr; }
     virtual const cpu_memory_pd_t *weights_pd(int index = 0) const override {
         if (index == 0) return &weights_pd_;
+        if (index == 1) return &output_mask_pd_;
         return nullptr;
     }
 
 protected:
-    cpu_memory_pd_t src_pd_, dst_pd_, weights_pd_;
+    cpu_memory_pd_t src_pd_, dst_pd_, weights_pd_, output_mask_pd_;
 
     inline memory_format_t src_format()
     {
@@ -73,6 +75,8 @@ protected:
             CHECK(dst_pd_.set_format(src_pd_.desc()->format));
         if (weights_pd_.desc()->format == any)
             CHECK(weights_pd_.set_format(wei_format()));
+        if (output_mask_pd_.desc()->format == any)
+            CHECK(output_mask_pd_.set_format(wei_format()));
         return status::success;
     }
 
index 738725d..1d64a6f 100644 (file)
@@ -347,7 +347,7 @@ static const pd_create_f cpu_impl_list[] = {
     INSTANCE(jit_uni_roi_pooling_fwd_t<sse42>),
     INSTANCE(ref_roi_pooling_fwd_t<data_type::f32>),
     /* binary convolution */
-//    INSTANCE(jit_uni_binary_convolution_fwd_t<avx512_common>),
+    INSTANCE(jit_uni_binary_convolution_fwd_t<avx512_common>),
     INSTANCE(jit_uni_binary_convolution_fwd_t<avx2>),
     INSTANCE(jit_uni_binary_convolution_fwd_t<sse42>),
     INSTANCE(ref_binary_convolution_fwd_t),
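
cpu_impl_list is scanned front to back and the first primitive descriptor that initializes successfully is used, so re-enabling the avx512_common entry gives the AVX-512 binary-convolution JIT priority over the avx2/sse42 JITs and the reference fallback on CPUs that pass the mayiuse() check. The dispatch rule itself is plain first-match selection, as in this generic sketch:

    #include <algorithm>
    #include <vector>

    // First usable implementation wins; everything after it is a fallback.
    template <class Impl, class Pred>
    const Impl* first_usable(const std::vector<Impl>& impls, Pred usable) {
        auto it = std::find_if(impls.begin(), impls.end(), usable);
        return it == impls.end() ? nullptr : &*it;
    }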
index b247724..e27f63d 100644 (file)
@@ -835,6 +835,17 @@ public:
         vpshufb(x1, x2, op);
     }
 
+    void uni_vpcmpeqd(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
+                      const Xbyak::Operand &op) {
+        assert(x1.getIdx() == x2.getIdx());
+        pcmpeqd(x1, op);
+    }
+
+    void uni_vpcmpeqd(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
+                      const Xbyak::Operand &op) {
+        vpcmpeqd(x1, x2, op);
+    }
+
     void mul_by_const(const Xbyak::Reg &out,
             const Xbyak::Reg64 &tmp, int value) {
         // Generates a shift + add sequence for multiplicating contents of the
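
uni_vpcmpeqd joins the other uni_* helpers that hide the difference between the destructive two-operand SSE encoding and the three-operand AVX encoding; SSE pcmpeqd overwrites its first source, which is why the wrapper asserts x1.getIdx() == x2.getIdx(). Per 32-bit lane the instruction yields all-ones on equality and zero otherwise, which the binarization changes below use to match comparison results against an output mask. A scalar model of one lane:

    #include <cstdint>

    // One 32-bit lane of (v)pcmpeqd: all-ones when equal, all-zeros otherwise.
    inline uint32_t pcmpeqd_lane(uint32_t a, uint32_t b) {
        return a == b ? 0xffffffffu : 0x00000000u;
    }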
index 447a017..716770a 100644 (file)
@@ -157,7 +157,7 @@ void jit_uni_bin_conv_fwd_kernel<isa>::apply_filter(int ur_w, int pad_l, int pad
                 int inp_off = ((ki*dilate_w + jj*stride_w - pad_l)*div_up(jcp.ic, nbits) + ifm2 * div_up(ic_blk, nbits)) * jcp.typesize_in;
 
                 if (h_padded || jj < jj_start || jj >= jj_end) {
-                    uni_vmovups(vmm_src, ptr[reg_table + 256]);
+                    uni_vmovups(vmm_src, ptr[reg_table + 8 * vlen]);
                 } else {
                     uni_vpbroadcastd(vmm_src, ptr[aux1_reg_input + inp_off]);
                 }
@@ -172,7 +172,7 @@ void jit_uni_bin_conv_fwd_kernel<isa>::apply_filter(int ur_w, int pad_l, int pad
 
                         uni_vpxor(vmm_tmp, vmm_tmp, vmm_src);
                         if (jcp.ic_padded != jcp.ic && last_icb && ifm2 == (ic_blocks - 1))
-                            uni_vandps(vmm_tmp, vmm_tmp, ptr[reg_table + 224]);
+                            uni_vandps(vmm_tmp, vmm_tmp, ptr[reg_table + 7 * vlen]);
 
                         if (isa == sse42) {
                             movups(vmm_tmp1, vmm_tmp);
@@ -196,10 +196,14 @@ void jit_uni_bin_conv_fwd_kernel<isa>::apply_filter(int ur_w, int pad_l, int pad
                             uni_vpaddb(vmm_tmp, vmm_tmp, vmm_tmp1);
                         }
 
-                        uni_vpmaddubsw(vmm_tmp, vmm_tmp, vmm_one_u8);
-                        uni_vpmaddwd(vmm_tmp, vmm_tmp, vmm_one_s16);
-                        uni_vpaddd(Vmm(1 + r*jcp.ur_w*jcp.nb_oc_blocking + ur_w * ii + jj),
-                                   Vmm(1 + r*jcp.ur_w*jcp.nb_oc_blocking + ur_w * ii + jj), vmm_tmp);
+                        if (mayiuse(avx512_core_vnni)) {
+                            vpdpbusd(Vmm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj), vmm_tmp, vmm_one_u8);
+                        } else {
+                            uni_vpmaddubsw(vmm_tmp, vmm_tmp, vmm_one_u8);
+                            uni_vpmaddwd(vmm_tmp, vmm_tmp, vmm_one_s16);
+                            uni_vpaddd(Vmm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj),
+                                       Vmm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj), vmm_tmp);
+                        }
                     }
                 }
             }
@@ -255,10 +259,10 @@ void jit_uni_bin_conv_fwd_kernel<isa>::kh_loop(int ur_w, int pad_l, int pad_r, i
     mov(aux_reg_input, reg_input);
     mov(aux_reg_kernel, reg_kernel_base);
 
-    uni_vmovups(vmm_lookup,  ptr[reg_table]);
-    uni_vmovups(vmm_mask,    ptr[reg_table + 32]);
-    uni_vmovups(vmm_one_u8,  ptr[reg_table + 160]);
-    uni_vmovups(vmm_one_s16, ptr[reg_table + 192]);
+    uni_vmovups(vmm_lookup,  ptr[reg_table + 0 * vlen]);
+    uni_vmovups(vmm_mask,    ptr[reg_table + 1 * vlen]);
+    uni_vmovups(vmm_one_u8,  ptr[reg_table + 5 * vlen]);
+    uni_vmovups(vmm_one_s16, ptr[reg_table + 6 * vlen]);
 
     if (!jcp.exclude_pad) {
         mov(reg_overflow,  ptr[param1 + GET_OFF(t_overflow)]);
@@ -329,6 +333,12 @@ void jit_uni_bin_conv_fwd_kernel<isa>::width_blk_step(int ur_w, int pad_l, int p
 
     kh_loop(ur_w, pad_l, pad_r, oc_blocks, oc_step);
 
+    if (isa == avx512_common && oc_step != jcp.oc_block) {
+        int mask = (1 << oc_step) - 1;
+        mov(reg_tmp_32, mask);
+        kmovw(ktail_mask, reg_tmp_32);
+    }
+
     const auto &p = attr_.post_ops_;
     for (int r = 0; r < repeats; r++) {
         int tail_size = isa == sse42 ? nstl::min(jcp.oc_block / 2, oc_step - r * jcp.oc_block / 2) : oc_step;
@@ -351,9 +361,9 @@ void jit_uni_bin_conv_fwd_kernel<isa>::width_blk_step(int ur_w, int pad_l, int p
                 }
             }
         } else {
-            uni_vmovups(vmm_shift, ptr[reg_table + 128]);
+            uni_vmovups(vmm_shift, ptr[reg_table + 4 * vlen]);
         }
-        uni_vmovups(vmm_scale, ptr[reg_table + 96]);
+        uni_vmovups(vmm_scale, ptr[reg_table + 3 * vlen]);
 
         for (int jj = 0; jj < ur_w; jj++) {
             if (jcp.exclude_pad) {
@@ -411,21 +421,30 @@ void jit_uni_bin_conv_fwd_kernel<isa>::width_blk_step(int ur_w, int pad_l, int p
                         Vmm vmm_dst = Vmm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj);
 
                         if (is_scalar_store) {
-                            for (int oc = 0; oc < tail_size; oc++) {
-                                int o_off =  jj * jcp.oc * jcp.ngroups + r * (jcp.oc_block / 2) + oc;
-
-                                uni_vpxor(vmm_sum, vmm_sum, vmm_sum);
-                                cvt2ps(jcp.dst_dt, vmm_sum, ptr[reg_output + o_off * jcp.typesize_out], true);
-
-                                if (oc < jcp.oc_block / 2) {
-                                    uni_vpslldq(vmm_sum, vmm_sum, oc * sizeof(float));
-                                } else {
-                                    Ymm ymm_prev_dst = Ymm(vmm_sum.getIdx());
-                                    vperm2i128(ymm_prev_dst, ymm_prev_dst, ymm_prev_dst, 0x01);
-                                    vpslldq(vmm_sum, vmm_sum, (oc - jcp.oc_block / 2) * sizeof(float));
-                                }
+                            if (isa == avx512_common) {
+                                int o_off =  jj * jcp.oc * jcp.ngroups;
+
+                                Vmm vmm_in = vmm_sum | ktail_mask | T_z;
 
+                                vmovups(vmm_in, ptr[reg_output + o_off * jcp.typesize_out]);
                                 uni_vaddps(vmm_dst, vmm_dst, vmm_sum);
+                            } else {
+                                for (int oc = 0; oc < tail_size; oc++) {
+                                    int o_off =  jj * jcp.oc * jcp.ngroups + r * (jcp.oc_block / 2) + oc;
+
+                                    uni_vpxor(vmm_sum, vmm_sum, vmm_sum);
+                                    cvt2ps(jcp.dst_dt, vmm_sum, ptr[reg_output + o_off * jcp.typesize_out], true);
+
+                                    if (oc < jcp.oc_block / 2) {
+                                        uni_vpslldq(vmm_sum, vmm_sum, oc * sizeof(float));
+                                    } else {
+                                        Ymm ymm_prev_dst = Ymm(vmm_sum.getIdx());
+                                        vperm2i128(ymm_prev_dst, ymm_prev_dst, ymm_prev_dst, 0x01);
+                                        vpslldq(vmm_sum, vmm_sum, (oc - jcp.oc_block / 2) * sizeof(float));
+                                    }
+
+                                    uni_vaddps(vmm_dst, vmm_dst, vmm_sum);
+                                }
                             }
                         } else {
                             size_t o_off = ii * jcp.oc_block + jj * jcp.oc * jcp.ngroups + r * (jcp.oc_block / 2);
@@ -445,7 +464,9 @@ void jit_uni_bin_conv_fwd_kernel<isa>::width_blk_step(int ur_w, int pad_l, int p
         pop(reg_oc_off);
 
         mov(reg_b_weights, reinterpret_cast<size_t>(p.entry_[binarization_idx].binarization.weights_data));
+        mov(reg_b_out_mask, reinterpret_cast<size_t>(p.entry_[binarization_idx].binarization.output_mask_data));
         add(reg_b_weights, reg_oc_off);
+        add(reg_b_out_mask, reg_oc_off);
 
         push(reg_oc_off);
 
@@ -455,13 +476,25 @@ void jit_uni_bin_conv_fwd_kernel<isa>::width_blk_step(int ur_w, int pad_l, int p
                     int tail_size = isa == sse42 ? nstl::min(jcp.oc_block / 2, oc_step - r * jcp.oc_block / 2) : oc_step;
                     mov(reg_b_mask, (1 << tail_size) - 1);
                     uni_vmovups(vmm_thr, ptr[reg_b_weights + (ii * jcp.oc_block + r * (jcp.oc_block / 2)) * sizeof(float)]);
+                    uni_vmovups(vmm_out_mask, ptr[reg_b_out_mask + (ii * jcp.oc_block + r * (jcp.oc_block / 2)) * sizeof(float)]);
 
                     Vmm vmm_dst = Vmm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj);
 
-                    uni_vcmpgtps(vmm_dst, vmm_dst, vmm_thr);
+                    if (isa == avx512_common) {
+                        vcmpps(bin_mask0, vmm_dst, vmm_thr, _cmp_gt_os);
+                        vptestmd(bin_mask1, vmm_out_mask, vmm_out_mask);
+                        kxnorw(bin_mask0, bin_mask0, bin_mask1);
+                    } else {
+                        uni_vcmpgtps(vmm_dst, vmm_dst, vmm_thr);
+                        uni_vpcmpeqd(vmm_dst, vmm_dst, vmm_out_mask);
+                    }
 
                     if (r == 0) {
-                        uni_vmovmskps(reg_tmp_32, vmm_dst);
+                        if (isa == avx512_common) {
+                            kmovw(reg_tmp_32, bin_mask0);
+                        } else {
+                            uni_vmovmskps(reg_tmp_32, vmm_dst);
+                        }
                         and_(reg_tmp_64, reg_b_mask);
                     } else {
                         uni_vmovmskps(reg_tmp2_32, vmm_dst);
@@ -471,8 +504,13 @@ void jit_uni_bin_conv_fwd_kernel<isa>::width_blk_step(int ur_w, int pad_l, int p
                     }
 
                     if (r == repeats - 1) {
-                        const size_t o_off = (ii + jj * div_up(jcp.oc, nbits));
-                        mov(ptr[reg_output + o_off * jcp.typesize_out], reg_tmp_8);
+                        if (isa == avx512_common && oc_step > nbits) {
+                            const size_t o_off = (2 * ii + jj * div_up(jcp.oc, nbits));
+                            mov(ptr[reg_output + o_off * jcp.typesize_out], reg_tmp_16);
+                        } else {
+                            const size_t o_off = (ii + jj * div_up(jcp.oc, nbits));
+                            mov(ptr[reg_output + o_off * jcp.typesize_out], reg_tmp_8);
+                        }
                     }
                 }
             }
@@ -484,22 +522,33 @@ void jit_uni_bin_conv_fwd_kernel<isa>::width_blk_step(int ur_w, int pad_l, int p
             if (is_scalar_store) {
                 for (int jj = 0; jj < ur_w; jj++) {
                     Vmm vmm_dst = Vmm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + jj);
-                    Ymm ymm_dst = Ymm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + jj);
 
-                    for (int oc = 0; oc < tail_size; oc++) {
+                    if (isa == avx512_common) {
                         size_t o_off;
                         if (jcp.with_dw_conv)
-                            o_off = jj * jcp.oc_block + oc + r * (jcp.oc_block / 2);
+                            o_off = jj * jcp.oc_block;
                         else
-                            o_off = jj * jcp.oc * jcp.ngroups + r * (jcp.oc_block / 2) + oc;
+                            o_off = jj * jcp.oc * jcp.ngroups;
 
-                        store_dst(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, true);
-
-                        if (isa == sse42) {
-                            psrldq(vmm_dst, jcp.typesize_out);
-                        } else {
-                            vperm2i128(ymm_tmp, ymm_dst, ymm_dst, 0x01);
-                            vpalignr(ymm_dst, vmm_tmp, ymm_dst, jcp.typesize_out);
+                        uni_vmovups(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst | ktail_mask);
+                    } else {
+                        for (int oc = 0; oc < tail_size; oc++) {
+                            size_t o_off;
+                            if (jcp.with_dw_conv)
+                                o_off = jj * jcp.oc_block + oc + r * (jcp.oc_block / 2);
+                            else
+                                o_off = jj * jcp.oc * jcp.ngroups + r * (jcp.oc_block / 2) + oc;
+
+                            store_dst(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, true);
+
+                            if (isa == sse42) {
+                                psrldq(vmm_dst, jcp.typesize_out);
+                            } else {
+                                Ymm ymm_dst = Ymm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + jj);
+
+                                vperm2i128(ymm_tmp, ymm_dst, ymm_dst, 0x01);
+                                vpalignr(ymm_dst, vmm_tmp, ymm_dst, jcp.typesize_out);
+                            }
                         }
                     }
                 }
@@ -655,7 +704,7 @@ void jit_uni_bin_conv_fwd_kernel<isa>::generate()
             add(reg_output_base, jcp.oc_block * jcp_dw_conv.kh * jcp.ow * jcp.typesize_out);
         } else {
             if (jcp.with_binarization)
-                add(reg_output_base, jcp.typesize_out);
+                add(reg_output_base, div_up(jcp.oc_block, nbits) * jcp.typesize_out);
             else
                 add(reg_output_base, jcp.oc_block * jcp.typesize_out);
         }
@@ -687,10 +736,6 @@ void jit_uni_bin_conv_fwd_kernel<isa>::prepare_table() {
             0x03020201, // 1 2 2 3
             0x03020201, // 1 2 2 3
             0x04030302,  // 2 3 3 4
-            0x02010100, // 0 1 1 2
-            0x03020201, // 1 2 2 3
-            0x03020201, // 1 2 2 3
-            0x04030302,  // 2 3 3 4
             0x0f0f0f0f,
             0x000000ff,
             0xc0000000, // -2.0f
@@ -698,45 +743,47 @@ void jit_uni_bin_conv_fwd_kernel<isa>::prepare_table() {
             0x00010001
     };
 
+    size_t simd_w = vlen / sizeof(int32_t);
+
     align(64);
     L(l_table);
     // offset = 0
-    for (size_t d = 0; d < 8; ++d) {
-        dd(cvals[d % 8]);
+    for (size_t d = 0; d < simd_w; ++d) {
+        dd(cvals[d % 4]);
     }
-    // offset = 32
-    for (size_t d = 0; d < 8; ++d) {
-        dd(cvals[8]);
+    // offset = 1
+    for (size_t d = 0; d < simd_w; ++d) {
+        dd(cvals[4]);
     }
-    // offset = 64
-    for (size_t d = 0; d < 8; ++d) {
-        dd(cvals[9]);
+    // offset = 2
+    for (size_t d = 0; d < simd_w; ++d) {
+        dd(cvals[5]);
     }
-    // offset = 96
-    for (size_t d = 0; d < 8; ++d) {
-        dd(cvals[10]);
+    // offset = 3
+    for (size_t d = 0; d < simd_w; ++d) {
+        dd(cvals[6]);
     }
 
-    // offset = 128
-    for (size_t d = 0; d < 8; ++d) {
+    // offset = 4
+    for (size_t d = 0; d < simd_w; ++d) {
         dd(float2int(jcp.ic * jcp.kw * jcp.kh));
     }
 
-    // offset = 160
-    for (size_t d = 0; d < 8; ++d) {
-        dd(cvals[11]);
+    // offset = 5
+    for (size_t d = 0; d < simd_w; ++d) {
+        dd(cvals[7]);
     }
-    // offset = 192
-    for (size_t d = 0; d < 8; ++d) {
-        dd(cvals[12]);
+    // offset = 6
+    for (size_t d = 0; d < simd_w; ++d) {
+        dd(cvals[8]);
     }
-    // offset = 224
-    for (size_t d = 0; d < 8; ++d) {
+    // offset = 7
+    for (size_t d = 0; d < simd_w; ++d) {
         uint32_t mask = 0xffffffff >> (jcp.ic_padded - jcp.ic);
         dd(mask);
     }
-    // offset = 256
-    for (size_t d = 0; d < 8; ++d) {
+    // offset = 8
+    for (size_t d = 0; d < simd_w; ++d) {
         uint32_t val = jcp.pad_value == 1.0f ? 0xffffffff : 0x00000000;
         dd(val);
     }
@@ -854,35 +901,11 @@ status_t jit_uni_bin_conv_fwd_kernel<isa>::init_conf(jit_bin_conv_conf_t &jcp,
         && dst_d.format() == nhwc;
     if (!args_ok) return status::unimplemented;
 
-    jcp.ur_h = 1; /* no code-unrolling by h so far */
-    jcp.ur_w = 2;
+    jcp.ur_h = 1;
+    jcp.ur_w = isa == avx512_common ? 4 : 2;
     if (jcp.ow < jcp.ur_w) jcp.ur_w = jcp.ow;
     jcp.ur_w_tail = jcp.ow % jcp.ur_w;
 
-    jcp.nb_oc_blocking = isa == sse42 ? 2 : 4; /* the optimal value for the kernel */
-
-    args_ok = true
-        && jcp.l_pad <= jcp.ur_w
-        && IMPLICATION(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0)
-                || (jcp.stride_w == 1 && jcp.stride_h == 1));
-    if (!args_ok) return status::unimplemented;
-
-    int r_pad_no_tail = nstl::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w
-        + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1));
-
-    if (r_pad_no_tail > jcp.ur_w) {
-        /* recalculate ur_w, nb_oc_blocking and ur_w_tail */
-        jcp.ur_w = r_pad_no_tail + 1;
-        jcp.nb_oc_blocking = ((16 - 1)-jcp.ur_w)/jcp.ur_w;
-        jcp.ur_w_tail = jcp.ow % jcp.ur_w;
-        /* check again ... */
-        r_pad_no_tail = nstl::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w
-            + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1));
-        if ((r_pad_no_tail > jcp.ur_w) || (jcp.ow < jcp.ur_w))
-            return status::unimplemented;
-    }
-    if (jcp.l_pad > jcp.ur_w) return status::unimplemented;
-
     jcp.ic_block = 32;
     jcp.nb_ic = div_up(jcp.ic, jcp.ic_block);
     jcp.ic_padded = rnd_up(jcp.ic, jcp.ic_block);
@@ -891,6 +914,7 @@ status_t jit_uni_bin_conv_fwd_kernel<isa>::init_conf(jit_bin_conv_conf_t &jcp,
     jcp.nb_oc = div_up(jcp.oc, jcp.oc_block);
 
     jcp.nb_ic_blocking = 1;
+    jcp.nb_oc_blocking = nstl::min(isa == sse42 ? 2 : isa == avx2 ? 4 : 6, jcp.nb_oc);
 
     jcp.src_dt = cd.src_desc.data_type;
     jcp.bia_dt = mkldnn_f32;
@@ -900,6 +924,19 @@ status_t jit_uni_bin_conv_fwd_kernel<isa>::init_conf(jit_bin_conv_conf_t &jcp,
     jcp.typesize_out = types::data_type_size(jcp.dst_dt);
     jcp.typesize_acc = sizeof(int32_t);
 
+    args_ok = true
+        && jcp.l_pad <= jcp.ur_w
+        && IMPLICATION(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0)
+                || (jcp.stride_w == 1 && jcp.stride_h == 1));
+    if (!args_ok) return status::unimplemented;
+
+    int r_pad_no_tail = nstl::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w
+        + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1));
+    if (r_pad_no_tail > jcp.ur_w)
+        return status::unimplemented;
+
+    if (jcp.l_pad > jcp.ur_w) return status::unimplemented;
+
     return status::success;
 }
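
Two things happen in this kernel: the hard-coded lookup-table byte offsets (32, 96, 128, 224, 256, ...) become multiples of vlen so one table layout serves all vector widths, and on CPUs with AVX512-VNNI the three-instruction accumulation pmaddubsw + pmaddwd + paddd collapses into a single vpdpbusd. Per 32-bit lane, vpdpbusd multiplies four unsigned bytes by four signed bytes and adds the four products into the dword accumulator, without saturation; a scalar model:

    #include <cstdint>

    // One 32-bit lane of vpdpbusd: acc += sum of a[i] * b[i] for i = 0..3.
    inline int32_t vpdpbusd_lane(int32_t acc, const uint8_t a[4], const int8_t b[4]) {
        for (int i = 0; i < 4; ++i)
            acc += int32_t(a[i]) * int32_t(b[i]);
        return acc;
    }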
 
index 83f6f6a..c300bae 100644 (file)
@@ -65,9 +65,10 @@ private:
     using Vmm = typename utils::conditional3<isa == sse42, Xbyak::Xmm,
             isa == avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
     using Ymm = const Xbyak::Ymm;
-    using reg64_t = const Xbyak::Reg64;
-    using reg32_t = const Xbyak::Reg32;
     using reg8_t = const Xbyak::Reg8;
+    using reg16_t = const Xbyak::Reg16;
+    using reg32_t = const Xbyak::Reg32;
+    using reg64_t = const Xbyak::Reg64;
 
     reg64_t reg_input = r13;
     reg64_t reg_output = rbp;
@@ -87,9 +88,10 @@ private:
     reg64_t reg_table = r15;
     reg64_t reg_icb_iter = reg_oc_work;
 
+    reg8_t reg_tmp_8 = r12b;
+    reg16_t reg_tmp_16 = r12w;
     reg32_t reg_tmp_32 = r12d;
     reg64_t reg_tmp_64 = r12;
-    reg8_t reg_tmp_8 = r12b;
 
     reg64_t reg_d_weights = aux_reg_input;
     reg64_t reg_d_bias = aux_reg_kernel;
@@ -99,22 +101,32 @@ private:
 
     reg64_t reg_b_weights = aux_reg_input;
     reg64_t reg_b_mask = aux_reg_kernel;
+    reg64_t reg_b_out_mask = reg_icb_iter;
 
     reg64_t reg_shift = aux_reg_input;
 
-    Vmm vmm_scale = Vmm(14);
-    Vmm vmm_shift = Vmm(15);
-    Vmm vmm_sum = Vmm(10);
-    Vmm vmm_lookup = Vmm(12);
-    Vmm vmm_mask = Vmm(13);
-    Vmm vmm_one_u8 = Vmm(14);
-    Vmm vmm_one_s16 = Vmm(15);
-    Ymm ymm_tmp = Ymm(10);
-    Vmm vmm_tmp = Vmm(10);
-    Vmm vmm_tmp1 = Vmm(11);
+    Vmm vmm_scale = Vmm(isa == avx512_common ? 30 : 14);
+    Vmm vmm_shift = Vmm(0);
+    Vmm vmm_sum = Vmm(isa == avx512_common ? 26 : 10);
+    Vmm vmm_lookup = Vmm(isa == avx512_common ? 28 : 12);
+    Vmm vmm_mask = Vmm(isa == avx512_common ? 29 : 13);
+    Vmm vmm_one_u8 = Vmm(isa == avx512_common ? 30 : 14);
+    Vmm vmm_one_s16 = Vmm(isa == avx512_common ? 31 : 15);
+    Ymm ymm_tmp = Ymm(isa == avx512_common ? 26 : 10);
+    Vmm vmm_tmp = Vmm(isa == avx512_common ? 26 : 10);
+    Vmm vmm_tmp1 = Vmm(isa == avx512_common ? 27 : 11);
     Vmm vmm_src = Vmm(0);
-    Vmm vmm_tmp2 = Vmm(9);
-    Vmm vmm_thr = Vmm(10);
+    Vmm vmm_tmp2 = Vmm(isa == avx512_common ? 25 : 9);
+    Vmm vmm_thr = Vmm(isa == avx512_common ? 26 : 10);
+    Vmm vmm_out_mask = Vmm(isa == avx512_common ? 30 : 14);
+
+    const unsigned char _cmp_gt_os = 6;
+
+    Xbyak::Opmask ktail_mask = Xbyak::Opmask(2);
+    Xbyak::Opmask bin_mask0 = Xbyak::Opmask(5);
+    Xbyak::Opmask bin_mask1 = Xbyak::Opmask(6);
+
+    size_t vlen = cpu_isa_traits<isa>::vlen;
 
     Xbyak::Label l_table;
 
index be3b284..f0f08cc 100644 (file)
@@ -34,6 +34,7 @@ struct jit_args {
     const float* from;
     const uint8_t* to;
     const float* weights;
+    const float* output_mask;
     size_t work_amount;
 };
 
@@ -58,7 +59,7 @@ struct jit_uni_bin_depthwise_kernel_f32 : public jit_uni_binarization_kernel_f32
     DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_bin_depthwise_kernel_f32)
     jit_uni_bin_depthwise_kernel_f32(const binarization_desc_t &desc)
         : jit_uni_binarization_kernel_f32(desc), jit_generator() {
-        assert(desc.alg_kind == alg_kind::binarization_depthwise);
+        assert(one_of(desc.alg_kind, alg_kind::binarization_depthwise));
         assert(isa == sse42 || isa == avx2 || isa == avx512_common);
 
         this->preamble();
@@ -66,10 +67,11 @@ struct jit_uni_bin_depthwise_kernel_f32 : public jit_uni_binarization_kernel_f32
         mov(reg_from, ptr[param + GET_OFF(from)]);
         mov(reg_to, ptr[param + GET_OFF(to)]);
         mov(reg_weights, ptr[param + GET_OFF(weights)]);
+        mov(reg_output_mask, ptr[param + GET_OFF(output_mask)]);
         mov(reg_work_amount, ptr[param + GET_OFF(work_amount)]);
 
         const int nbits = 8;
-       int simd_w = isa == avx512_common ? 16 : 8;
+        int simd_w = isa == avx512_common ? 16 : 8;
         const int C = desc.src_desc.dims[1];
         const int tail_size = C % simd_w;
 
@@ -90,13 +92,17 @@ struct jit_uni_bin_depthwise_kernel_f32 : public jit_uni_binarization_kernel_f32
             for (int ch = 0; ch < ur_ch; ch++) {
                 uni_vmovups(vmm_src(0), ptr[reg_from + ch*step*sizeof(float)]);
                 uni_vmovups(vmm_wei(0), ptr[reg_weights + ch*step*sizeof(float)]);
-               if (isa == avx512_common) {
-                   vcmpps(k_mask, vmm_src(0), vmm_wei(0), _cmp_gt_os);
-                   kmovw(reg_src_32, k_mask);
-               } else {
+                uni_vmovups(vmm_mask(0), ptr[reg_output_mask + ch*step*sizeof(float)]);
+                if (isa == avx512_common) {
+                    vcmpps(k_mask0, vmm_src(0), vmm_wei(0), _cmp_gt_os);
+                    vptestmd(k_mask1, vmm_mask(0), vmm_mask(0));
+                    kxnorw(k_mask0, k_mask0, k_mask1);
+                    kmovw(reg_src_32, k_mask0);
+                } else {
                     uni_vcmpgtps(vmm_src(0), vmm_src(0), vmm_wei(0));
-                   uni_vmovmskps(reg_src_32, vmm_src(0));
-               }
+                    uni_vpcmpeqd(vmm_src(0), vmm_src(0), vmm_mask(0));
+                    uni_vmovmskps(reg_src_32, vmm_src(0));
+                }
                 shl(reg_src_32, ch * step);
                 or_(reg_bin_32, reg_src_32);
             }
@@ -104,6 +110,7 @@ struct jit_uni_bin_depthwise_kernel_f32 : public jit_uni_binarization_kernel_f32
 
             add(reg_from, unrolled_loop_step*sizeof(float));
             add(reg_weights, unrolled_loop_step*sizeof(float));
+            add(reg_output_mask, unrolled_loop_step*sizeof(float));
             add(reg_to, sizeof(uint32_t));
             sub(reg_work_amount, unrolled_loop_step);
 
@@ -122,23 +129,28 @@ struct jit_uni_bin_depthwise_kernel_f32 : public jit_uni_binarization_kernel_f32
             for (int i = 0; i < repeats; i++) {
                 uni_vmovups(vmm_src(0), ptr[reg_from + i*step*sizeof(float)]);
                 uni_vmovups(vmm_wei(0), ptr[reg_weights + i*step*sizeof(float)]);
+                uni_vmovups(vmm_mask(0), ptr[reg_output_mask + i*step*sizeof(float)]);
                 if (isa == avx512_common) {
-                    vcmpps(k_mask, vmm_src(0), vmm_wei(0), _cmp_gt_os);
-                   kmovw(reg_src_32, k_mask);
+                    vcmpps(k_mask0, vmm_src(0), vmm_wei(0), _cmp_gt_os);
+                    vptestmd(k_mask1, vmm_mask(0), vmm_mask(0));
+                    kxnorw(k_mask0, k_mask0, k_mask1);
+                    kmovw(reg_src_32, k_mask0);
                 } else {
                     uni_vcmpgtps(vmm_src(0), vmm_src(0), vmm_wei(0));
-                   uni_vmovmskps(reg_src_32, vmm_src(0));
+                    uni_vpcmpeqd(vmm_src(0), vmm_src(0), vmm_mask(0));
+                    uni_vmovmskps(reg_src_32, vmm_src(0));
                 }
                 shl(reg_src_32, i * step);
                 or_(reg_bin_32, reg_src_32);
             }
-           if (isa == avx512_common)
+            if (isa == avx512_common)
                 mov(ptr[reg_to], reg_bin_16);
-           else        
+            else
                 mov(ptr[reg_to], reg_bin_8);
 
             add(reg_from, main_loop_step*sizeof(float));
             add(reg_weights, main_loop_step*sizeof(float));
+            add(reg_output_mask, main_loop_step*sizeof(float));
             add(reg_to, isa == avx512_common ? sizeof(uint16_t) : sizeof(uint8_t));
             sub(reg_work_amount, main_loop_step);
 
@@ -148,22 +160,28 @@ struct jit_uni_bin_depthwise_kernel_f32 : public jit_uni_binarization_kernel_f32
         L(tail_label); {
             if (tail_size != 0) {
                 xor_(reg_bin_32, reg_bin_32);
+                mov(reg_mask, 1);
                 for (int c = 0; c < tail_size; c++) {
                     uni_vpxor(xmm_src(0), xmm_src(0), xmm_src(0));
                     uni_vpxor(xmm_wei(0), xmm_wei(0), xmm_wei(0));
+                    uni_vpxor(xmm_mask(0), xmm_mask(0), xmm_mask(0));
 
                     movss(xmm_src(0), ptr[reg_from + c * sizeof(float)]);
                     movss(xmm_wei(0), ptr[reg_weights + c * sizeof(float)]);
+                    movss(xmm_mask(0), ptr[reg_output_mask + c * sizeof(float)]);
                     uni_vcmpgtps(xmm_src(0), xmm_src(0), xmm_wei(0));
+                    uni_vpcmpeqd(xmm_src(0), xmm_src(0), xmm_mask(0));
                     uni_vmovmskps(reg_src_32, xmm_src(0));
 
                     shl(reg_src_32, c);
+                    and_(reg_src_32, reg_mask);
                     or_(reg_bin_32, reg_src_32);
+                    shl(reg_mask, 1);
                 }
-               if (isa == avx512_common && tail_size > nbits)
-                    mov(ptr[reg_to], reg_bin_16);
-               else
-                   mov(ptr[reg_to], reg_bin_8);
+                if (isa == avx512_common && tail_size > nbits)
+                    mov(ptr[reg_to], reg_bin_16);
+                else
+                    mov(ptr[reg_to], reg_bin_8);
             }
         }
 
@@ -181,21 +199,25 @@ private:
     inline Vmm vmm_src(int idx) { return Vmm(idx); }
     inline Xmm xmm_src(int idx) { return Xmm(idx); }
     inline Vmm vmm_wei(int idx) { return Vmm(idx + 4); }
+    inline Vmm vmm_mask(int idx) { return Vmm(idx + 5); }
     inline Xmm xmm_wei(int idx) { return Xmm(idx + 4); }
+    inline Xmm xmm_mask(int idx) { return Xmm(idx + 5); }
 
     Reg64 param = abi_param1;
     Reg64 reg_from = r8;
     Reg64 reg_to = r9;
     Reg64 reg_work_amount = r10;
     Reg64 reg_weights = r11;
+    Reg64 reg_output_mask = r14;
     Reg16 reg_bin_16 = r12w;
     Reg32 reg_bin_32 = r12d;
     Reg8 reg_bin_8 = r12b;
     Reg32 reg_src_32 = r13d;
-    Reg64 reg_src_64 = r13;
+    Reg32 reg_mask = r15d;
 
     const unsigned char _cmp_gt_os = 6;
-    Xbyak::Opmask k_mask = Xbyak::Opmask(1);
+    Xbyak::Opmask k_mask0 = Xbyak::Opmask(1);
+    Xbyak::Opmask k_mask1 = Xbyak::Opmask(2);
 };
 
 } /* namespace */
@@ -209,12 +231,14 @@ status_t jit_uni_binarization_fwd_t<isa>::pd_t::init() {
     assert(engine()->kind() == engine_kind::cpu);
     bool ok = true && mayiuse(isa)
         && utils::one_of(desc()->prop_kind, prop_kind::forward_training, prop_kind::forward_inference)
-        && utils::everyone_is(data_type::f32, desc()->src_desc.data_type, desc()->weights_desc.data_type)
+        && utils::everyone_is(data_type::f32, desc()->src_desc.data_type, desc()->weights_desc.data_type,
+                                              desc()->output_mask_desc.data_type)
         && utils::everyone_is(data_type::bin, desc()->dst_desc.data_type)
         && desc()->src_desc.format == desc()->dst_desc.format
         && utils::one_of(desc()->src_desc.format, desired_fmt)
         && utils::one_of(desc()->dst_desc.format, desired_fmt)
         && utils::one_of(desc()->weights_desc.format, x)
+        && utils::one_of(desc()->output_mask_desc.format, x)
         && attr()->has_default_values();
 
     return ok ? status::success : status::unimplemented;
@@ -241,11 +265,13 @@ template <cpu_isa_t isa>
 void jit_uni_binarization_fwd_t<isa>::execute_forward() const {
     auto src = reinterpret_cast<const src_data_t*>(this->input_memory(0));
     auto weights = reinterpret_cast<const src_data_t*>(this->input_memory(1));
+    auto output_mask = reinterpret_cast<const src_data_t*>(this->input_memory(2));
     auto dst = reinterpret_cast<uint8_t*>(this->memory());
 
     const memory_desc_wrapper src_d(pd()->src_pd());
     const memory_desc_wrapper dst_d(pd()->dst_pd());
     const memory_desc_wrapper weights_d(pd()->weights_pd(0));
+    const memory_desc_wrapper output_mask_d(pd()->weights_pd(1));
 
     const int N = src_d.dims()[0];
     const int C = src_d.dims()[1];
@@ -261,6 +287,7 @@ void jit_uni_binarization_fwd_t<isa>::execute_forward() const {
         arg.from    = &src[src_d.blk_off(n, 0, h, w)];
         arg.to      = &dst[dst_d.blk_off(n, 0, h, w) / nbits];
         arg.weights = &weights[weights_d.blk_off(0)];
+        arg.output_mask = &output_mask[output_mask_d.blk_off(0)];
         arg.work_amount = (size_t)C;
 
         (*kernel_)(&arg);
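
Functionally, the new per-channel output mask lets binarization emit either the comparison result or its negation: the kernel computes src > thr, then matches the lane against the mask word (kxnorw of the two k-masks on AVX-512, uni_vpcmpeqd elsewhere). The reference implementation further down states the same rule element-wise; as a standalone scalar:

    #include <cstdint>

    // One output bit: the all-ones/all-zeros comparison result is matched
    // against the channel's mask word (0xffffffff keeps the comparison,
    // 0x00000000 inverts it).
    inline uint8_t binarize(float val, float thr, uint32_t out_mask) {
        const uint32_t res = (val > thr) ? 0xffffffffu : 0x00000000u;
        return uint8_t(res == out_mask);
    }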
index 9aad4f1..ac49aad 100644 (file)
@@ -740,18 +740,27 @@ void jit_uni_dw_conv_row_f32<isa>::apply_postprocessing(int ur_w, int oc_step) {
 
             for (int ow = 0; ow < ur_w; ow++) {
                 if (is_scalar_store) {
-                    for (int oc = 0; oc < tail_size; oc++) {
-                        int o_off = ow * ow_stride_ + r * (jcp.ch_block / 2) + oc;
+                    if (isa == avx512_common) {
+                        int o_off = ow * ow_stride_;
 
-                        uni_vpxor(vmm_sum, vmm_sum, vmm_sum);
-                        cvt2ps(jcp.dst_dt, vmm_sum, ptr[reg_output + o_off * jcp.typesize_out], true);
-
-                        if (oc >= jcp.ch_block / 2) {
-                            vperm2i128(Ymm(vmm_sum.getIdx()), Ymm(vmm_sum.getIdx()), Ymm(vmm_sum.getIdx()), 0x01);
-                        }
-                        uni_vpslldq(vmm_sum, vmm_sum, jcp.typesize_out * (oc % (jcp.ch_block / 2)));
+                        Vmm vmm_in = vmm_sum | ktail_mask | T_z;
 
+                        cvt2ps(jcp.dst_dt, vmm_in, ptr[reg_output + o_off * jcp.typesize_out], false);
                         uni_vaddps(get_acc_reg(r * ur_w + ow), get_acc_reg(r * ur_w + ow), vmm_sum);
+                    } else {
+                        for (int oc = 0; oc < tail_size; oc++) {
+                            int o_off = ow * ow_stride_ + r * (jcp.ch_block / 2) + oc;
+
+                            uni_vpxor(vmm_sum, vmm_sum, vmm_sum);
+                            cvt2ps(jcp.dst_dt, vmm_sum, ptr[reg_output + o_off * jcp.typesize_out], true);
+
+                            if (oc >= jcp.ch_block / 2) {
+                                vperm2i128(Ymm(vmm_sum.getIdx()), Ymm(vmm_sum.getIdx()), Ymm(vmm_sum.getIdx()), 0x01);
+                            }
+                            uni_vpslldq(vmm_sum, vmm_sum, jcp.typesize_out * (oc % (jcp.ch_block / 2)));
+
+                            uni_vaddps(get_acc_reg(r * ur_w + ow), get_acc_reg(r * ur_w + ow), vmm_sum);
+                        }
                     }
                 } else {
                     int o_off = ow * ow_stride_ + r * (jcp.ch_block / 2);
@@ -854,8 +863,15 @@ void jit_uni_dw_conv_row_f32<isa>::store_dst_typed(const Xbyak::Address &op, Vmm
 
 template <cpu_isa_t isa>
 void jit_uni_dw_conv_row_f32<isa>::store_dst(int ur_w, int oc_step) {
+    int nbits = 8;
     int repeats = isa == sse42 && oc_step > (jcp.ch_block / 2) ? 2 : 1;
 
+    if (isa == avx512_common && oc_step != jcp.ch_block) {
+        int mask = (1 << oc_step) - 1;
+        mov(reg_tmp_32, mask);
+        kmovw(ktail_mask, reg_tmp_32);
+    }
+
     for (int i = 0; i < repeats; i++) {
         for (int ow = 0; ow < ur_w; ow++) {
             Vmm vmm_dst = get_acc_reg(i * ur_w + ow);
@@ -872,26 +888,42 @@ void jit_uni_dw_conv_row_f32<isa>::store_dst(int ur_w, int oc_step) {
     }
 
     if (jcp.with_binarization) {
-        int output_step = div_up(ow_stride_, 8);
+        int output_step = div_up(ow_stride_, nbits);
 
         const auto &p = attr_.post_ops_;
         int binarization_idx = p.find(primitive_kind::binarization);
 
+        push(reg_bias);
+
         mov(reg_b_weights, reinterpret_cast<size_t>(p.entry_[binarization_idx].binarization.weights_data));
+        mov(reg_b_out_mask, reinterpret_cast<size_t>(p.entry_[binarization_idx].binarization.output_mask_data));
         add(reg_b_weights, reg_oc_off);
+        add(reg_b_out_mask, reg_oc_off);
 
         for (int ow = 0; ow < ur_w; ow++) {
             for (int i = 0; i < repeats; i++) {
                 int tail_size = isa == sse42 ? nstl::min(jcp.ch_block / 2, oc_step - i * jcp.ch_block / 2) : oc_step;
                 mov(reg_b_mask, (1 << tail_size) - 1);
                 uni_vmovups(vmm_thr, ptr[reg_b_weights + i * (jcp.ch_block / 2) * sizeof(float)]);
+                uni_vmovups(vmm_out_mask, ptr[reg_b_out_mask + i * (jcp.ch_block / 2) * sizeof(float)]);
 
                 Vmm vmm_dst = get_acc_reg(i * ur_w + ow);
 
-                uni_vcmpgtps(vmm_dst, vmm_dst, vmm_thr);
+                if (isa == avx512_common) {
+                    vcmpps(bin_mask0, vmm_dst, vmm_thr, _cmp_gt_os);
+                    vptestmd(bin_mask1, vmm_out_mask, vmm_out_mask);
+                    kxnorw(bin_mask0, bin_mask0, bin_mask1);
+                } else {
+                    uni_vcmpgtps(vmm_dst, vmm_dst, vmm_thr);
+                    uni_vpcmpeqd(vmm_dst, vmm_dst, vmm_out_mask);
+                }
 
                 if (i == 0) {
-                    uni_vmovmskps(reg_tmp_32, vmm_dst);
+                    if (isa == avx512_common) {
+                        kmovw(reg_tmp_32, bin_mask0);
+                    } else {
+                        uni_vmovmskps(reg_tmp_32, vmm_dst);
+                    }
                     and_(reg_tmp_64, reg_b_mask);
                 } else {
                     uni_vmovmskps(reg_tmp2_32, vmm_dst);
@@ -902,10 +934,16 @@ void jit_uni_dw_conv_row_f32<isa>::store_dst(int ur_w, int oc_step) {
 
                 if (i == repeats - 1) {
                     const size_t o_off = ow * output_step;
-                    mov(ptr[reg_output + o_off * jcp.typesize_out], reg_tmp_8);
+                    if (isa == avx512_common && oc_step > nbits) {
+                        mov(ptr[reg_output + o_off * jcp.typesize_out], reg_tmp_16);
+                    } else {
+                        mov(ptr[reg_output + o_off * jcp.typesize_out], reg_tmp_8);
+                    }
                 }
             }
         }
+
+        pop(reg_bias);
     } else {
         for (int i = 0; i < repeats; i++) {
             int tail_size = isa == sse42 ? nstl::min(jcp.ch_block / 2, oc_step - i * jcp.ch_block / 2) : oc_step;
@@ -913,17 +951,24 @@ void jit_uni_dw_conv_row_f32<isa>::store_dst(int ur_w, int oc_step) {
             if (is_scalar_store) {
                 for (int ow = 0; ow < ur_w; ow++) {
                     Vmm vmm_dst = get_acc_reg(i * ur_w + ow);
-                    Ymm ymm_dst = Ymm(vmm_dst.getIdx());
 
-                    for (int oc = 0; oc < tail_size; oc++) {
-                        int o_off = ow * ow_stride_ + i * (jcp.ch_block / 2) + oc;
-                        store_dst_typed(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, true);
+                    if (isa == avx512_common) {
+                        int o_off = ow * ow_stride_;
+
+                        store_dst_typed(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst | ktail_mask, false);
+                    } else {
+                        for (int oc = 0; oc < tail_size; oc++) {
+                            int o_off = ow * ow_stride_ + i * (jcp.ch_block / 2) + oc;
+                            store_dst_typed(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, true);
+
+                            if (isa == sse42) {
+                                psrldq(vmm_dst, jcp.typesize_out);
+                            } else {
+                                Ymm ymm_dst = Ymm(vmm_dst.getIdx());
 
-                        if (isa == sse42) {
-                            psrldq(vmm_dst, jcp.typesize_out);
-                        } else {
-                            vperm2i128(ymm_tmp, ymm_dst, ymm_dst, 0x01);
-                            vpalignr(ymm_dst, vmm_tmp, ymm_dst, jcp.typesize_out);
+                                vperm2i128(ymm_tmp, ymm_dst, ymm_dst, 0x01);
+                                vpalignr(ymm_dst, vmm_tmp, ymm_dst, jcp.typesize_out);
+                            }
                         }
                     }
                 }
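
The depthwise-conv row kernel receives the same AVX-512 tail treatment as the binary-conv kernel above: a partial block of oc_step channels becomes one masked load/store through an opmask register instead of a scalar per-channel loop, with the unused lanes zeroed (T_z). The mask value loaded via kmovw is simply the low oc_step bits:

    #include <cstdint>

    // kmovw ktail_mask, (1 << oc_step) - 1: one bit per active lane.
    inline uint32_t tail_mask(int oc_step) { return (1u << oc_step) - 1u; }
    // e.g. oc_step == 5 -> 0b11111: lanes 0..4 participate, the rest stay zero.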
index 47d93c8..d981166 100644 (file)
@@ -150,6 +150,7 @@ private:
         isa == avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
     using reg64_t = const Xbyak::Reg64;
     using reg32_t = const Xbyak::Reg32;
+    using reg16_t = const Xbyak::Reg16;
     using reg8_t = const Xbyak::Reg8;
     const Xbyak::AddressFrame &vmmword = (isa == sse42)
         ? xword : (isa == avx2) ? yword : zword;
@@ -177,10 +178,12 @@ private:
 
     reg64_t reg_b_weights = r15;
     reg64_t reg_b_mask = reg_d_bias;
+    reg64_t reg_b_out_mask = rbx;
 
     reg32_t reg_tmp_32 = r11d;
     reg64_t reg_tmp_64 = r11;
     reg8_t reg_tmp_8 = r11b;
+    reg16_t reg_tmp_16 = r11w;
 
     reg32_t reg_tmp2_32 = r13d;
     reg64_t reg_tmp2_64 = r13;
@@ -194,6 +197,13 @@ private:
     Vmm vmm_sum = Vmm(0);
     Vmm vmm_bias = Vmm(0);
     Vmm vmm_thr = Vmm(0);
+    Vmm vmm_out_mask = Vmm(1);
+
+    const unsigned char _cmp_gt_os = 6;
+
+    Xbyak::Opmask ktail_mask = Xbyak::Opmask(2);
+    Xbyak::Opmask bin_mask0 = Xbyak::Opmask(5);
+    Xbyak::Opmask bin_mask1 = Xbyak::Opmask(6);
 
     inline void load_src(int ur_w);
     inline void apply_filter(int ur_w, int kw_size);
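
The new opmask registers and the _cmp_gt_os = 6 predicate suggest the vector path mirrors the reference change further down: a lane's output bit is set when the greater-than result agrees with the per-channel output mask. A hedged intrinsics sketch of that predicate, assuming AVX-512F (the real kernel builds this in Xbyak):

    #include <immintrin.h>
    #include <cstdint>

    // One bit per lane: 1 where (src > thr) matches the lane's output mask
    // (all-ones vs all-zeros). Predicate 6 is the vcmpps NLE_US encoding,
    // which behaves as greater-than for ordered (non-NaN) inputs.
    static uint16_t binarize16(__m512 src, __m512 thr, __m512i out_mask) {
        __mmask16 gt   = _mm512_cmp_ps_mask(src, thr, 6);
        __mmask16 high = _mm512_test_epi32_mask(out_mask, out_mask); // lanes with nonzero mask
        return (uint16_t)_mm512_kxnor(gt, high); // 1 where gt == high
    }
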
index 4fa9372..dc0b8dd 100644 (file)
@@ -34,11 +34,13 @@ template <impl::data_type_t src_type>
 void ref_binarization_fwd_t<src_type>::execute_forward() const {
     auto src = reinterpret_cast<const src_data_t*>(this->input_memory(0));
     auto weights = reinterpret_cast<const src_data_t*>(this->input_memory(1));
+    auto output_mask = reinterpret_cast<const uint32_t*>(this->input_memory(2));
     auto dst = reinterpret_cast<uint8_t*>(this->memory());
 
     const memory_desc_wrapper src_d(pd()->src_pd());
     const memory_desc_wrapper dst_d(pd()->dst_pd());
     const memory_desc_wrapper weights_d(pd()->weights_pd(0));
+    const memory_desc_wrapper output_mask_d(pd()->weights_pd(1));
 
     int nbits = 8;
 
@@ -61,11 +63,15 @@ void ref_binarization_fwd_t<src_type>::execute_forward() const {
                                 : src_d.off(n, c);
 
             size_t wei_off = weights_d.off(c);
+            size_t out_mask_off = output_mask_d.off(c);
 
             float val = src[src_off];
             float thr = weights[wei_off];
+            uint32_t out_mask = output_mask[out_mask_off];
 
-            auto bit = uint8_t((val > thr) ? 0x01 : 0x00);
+            uint32_t res = (val > thr) ? 0xffffffff : 0x00000000;
+
+            auto bit = uint8_t(res == out_mask);
             bin_val |= (bit << shift);
         }
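
The reference change swaps the fixed (val > thr) bit for an agreement test against a per-channel 32-bit mask, so each channel can emit either polarity. A self-contained restatement of the updated inner loop, with illustrative names:

    #include <cstdint>

    // Pack C channels eight to a byte; a channel contributes a 1-bit iff
    // the comparison result (all-ones or all-zeros) equals its output mask.
    static void binarize_channels(const float *src, const float *thr,
                                  const uint32_t *out_mask, uint8_t *dst, int C) {
        const int nbits = 8;
        for (int cb = 0; cb < (C + nbits - 1) / nbits; ++cb) {
            uint8_t bin_val = 0x00;
            for (int c = cb * nbits, shift = 0; c < C && shift < nbits; ++c, ++shift) {
                uint32_t res = (src[c] > thr[c]) ? 0xffffffff : 0x00000000;
                uint8_t bit = uint8_t(res == out_mask[c]);
                bin_val |= uint8_t(bit << shift);
            }
            dst[cb] = bin_val;
        }
    }
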
 
index 726d700..891e705 100644 (file)
@@ -45,7 +45,8 @@ struct ref_binarization_fwd_t: public cpu_primitive_t {
 
             bool ok = true
                 && utils::one_of(desc()->prop_kind, forward_training, forward_inference)
-                && utils::everyone_is(src_type, desc()->src_desc.data_type, desc()->weights_desc.data_type)
+                && utils::everyone_is(src_type, desc()->src_desc.data_type, desc()->weights_desc.data_type,
+                        desc()->output_mask_desc.data_type)
                 && utils::everyone_is(data_type::bin, desc()->dst_desc.data_type)
                 && utils::one_of(desc()->alg_kind, mkldnn_binarization_depthwise)
                 && attr()->has_default_values();
index 2c9cbde..3ccdf95 100644 (file)
@@ -131,6 +131,7 @@ void _ref_binary_convolution_fwd_t::execute_forward() const {
 
         int binarization_idx = p.find(primitive_kind::binarization);
         const float* binarization_weights = p.entry_[binarization_idx].binarization.weights_data;
+        const uint32_t* binarization_output_mask = (uint32_t*)p.entry_[binarization_idx].binarization.output_mask_data;
 
         parallel_nd(G, MB, utils::div_up(OC, nbits), OD, OH, OW,
             [&](int g, int mb, int ocb, int od, int oh, int ow) {
@@ -194,7 +195,10 @@ void _ref_binary_convolution_fwd_t::execute_forward() const {
                 }
 
                 float thr = binarization_weights[g * OC + oc];
-                auto bit = uint8_t((a_fp > thr) ? 0x01 : 0x00);
+                uint32_t out_mask = binarization_output_mask[g * OC + oc];
+                uint32_t res = (a_fp > thr) ? 0xffffffff : 0x00000000;
+
+                auto bit = uint8_t((res == out_mask) ? 0x01 : 0x00);
                 bin_val |= (bit << shift);
             }
 
index 4e4a7da..792f719 100644 (file)
@@ -1150,7 +1150,7 @@ typename utils::enable_if<fmt_i == any && (fmt_o == OhIw8o32i || fmt_o == OhIw16
                         uint8_t bin_val = 0x00;
                         for (int ic = icb*nbits, shift = 0; ic < std::min(IC, (icb + 1)*nbits); ic++, shift++) {
                             size_t iidx = (i_mult_o * nb_oc + oc) * input_d.blocking_desc().strides[0][0] +
-                                          (i_mult_i * nb_ic + ic) *input_d.blocking_desc().strides[0][1] +
+                                          (i_mult_i * nb_ic + ic) * input_d.blocking_desc().strides[0][1] +
                                                                 h * input_d.blocking_desc().strides[0][2] +
                                                                 w;
 
@@ -1158,7 +1158,7 @@ typename utils::enable_if<fmt_i == any && (fmt_o == OhIw8o32i || fmt_o == OhIw16
                             bin_val |= (bit << shift);
                         }
 
-                        size_t oidx = wei_blk_off_like_gwei3D<fmt_o>(output_d, g, nb_oc, nb_ic, 0, h, w) + oc * blksize_i + icb * blksize_o;
+                        size_t oidx = wei_blk_off_like_gwei3D<fmt_o>(output_d, g, nb_oc, nb_ic, 0, h, w) + oc * blksize_i + icb * nbits;
                         output[oidx / nbits] = bin_val;
 
                     }
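
The icb * blksize_o to icb * nbits fix keeps oidx a bit offset consistent with the oidx / nbits byte conversion on the next line: each group of nbits (8) input channels packs into one output byte. A small sketch of the convention (illustrative helper, not from the patch):

    #include <cstddef>
    #include <cstdint>

    // bit_base is the bit offset of the enclosing weights block plus
    // oc * blksize_i; stepping icb in units of nbits keeps oidx / nbits
    // a consecutive byte index within the block.
    static void store_packed(uint8_t *output, size_t bit_base,
                             int icb, int nbits, uint8_t bin_val) {
        size_t oidx = bit_base + (size_t)icb * nbits; // offset counted in bits
        output[oidx / nbits] = bin_val;               // byte index = bits / 8
    }
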
index e720faf..8b748b6 100644 (file)
@@ -31,13 +31,18 @@ struct binarization_test_params {
 
 template <typename src_data_t>
 void check_binarization_fwd(const binarization_test_params<src_data_t> &p,
-        const memory::desc &src_md, const memory &src, const memory &weights, const memory &dst) {
+        const memory::desc &src_md, const memory &src, const memory &weights,
+        const memory &output_low, const memory &output_high, const memory &dst) {
     auto src_data = (src_data_t*)src.get_data_handle();
     auto weights_data = (src_data_t*)weights.get_data_handle();
+    auto output_low_data = (float*)output_low.get_data_handle();
+    auto output_high_data = (float*)output_high.get_data_handle();
     auto dst_data = (uint8_t*)dst.get_data_handle();
 
     const memory::desc src_d = src.get_primitive_desc().desc();
     const memory::desc weights_d = weights.get_primitive_desc().desc();
+    const memory::desc output_low_d = output_low.get_primitive_desc().desc();
+    const memory::desc output_high_d = output_high.get_primitive_desc().desc();
     const memory::desc dst_d = dst.get_primitive_desc().desc();
 
     int N = src_md.data.ndims > 0 ? src_md.data.dims[0] : 1;
@@ -63,8 +68,10 @@ void check_binarization_fwd(const binarization_test_params<src_data_t> &p,
 
                         src_data_t s_val = src_data[map_index(src_d, src_idx)];
                         src_data_t w_val = weights_data[map_index(weights_d, wei_idx)];
+                        src_data_t out_low = output_low_data[map_index(output_low_d, wei_idx)];
+                        src_data_t out_high = output_high_data[map_index(output_high_d, wei_idx)];
 
-                        auto bit = uint8_t((s_val > w_val) ? 0x01 : 0x00);
+                        auto bit = uint8_t((s_val > w_val) ? out_high : out_low);
                         bin_val |= (bit << shift);
                     }
 
@@ -95,28 +102,45 @@ protected:
 
         auto src_desc = create_md(src_dims, src_data_type, p.data_format);
         auto weights_desc = create_md(wei_dims, src_data_type, memory::format::x);
+        auto output_low_desc = create_md(wei_dims, src_data_type, memory::format::x);
+        auto output_high_desc = create_md(wei_dims, src_data_type, memory::format::x);
+        auto output_mask_desc = create_md(wei_dims, src_data_type, memory::format::x);
         auto dst_desc = create_md(dst_dims, memory::data_type::bin, p.data_format);
 
         auto src = test_memory(src_desc, eng);
         auto weights = test_memory(weights_desc, eng);
+        auto output_low = test_memory(output_low_desc, eng);
+        auto output_high = test_memory(output_high_desc, eng);
+        auto output_mask = test_memory(output_mask_desc, eng);
         auto dst = test_memory(dst_desc, eng);
 
         fill_data<src_data_t>(src.get_size() / sizeof(src_data_t), (src_data_t *)src.get().get_data_handle(),
                               src_data_t(0), src_data_t(1));
         fill_data<src_data_t>(weights.get_size() / sizeof(src_data_t), (src_data_t *)weights.get().get_data_handle(),
                               src_data_t(0), src_data_t(1));
+        fill_data<src_data_t>(output_low.get_size() / sizeof(src_data_t), (src_data_t *)output_low.get().get_data_handle(),
+                              src_data_t(0), src_data_t(1));
         fill_data<uint8_t>(dst.get_size() / sizeof(uint8_t), (uint8_t*)dst.get().get_data_handle());
 
+        src_data_t* p_output_low = (src_data_t *)output_low.get().get_data_handle();
+        src_data_t* p_output_high = (src_data_t *)output_high.get().get_data_handle();
+        uint32_t* p_output_mask = (uint32_t *)output_mask.get().get_data_handle();
+        for (int i = 0; i < src_dims[1]; i++) {
+            p_output_low[i] = p_output_low[i] >= 0 ? 1 : 0;
+            p_output_high[i] = p_output_low[i] == 1 ? 0 : 1;
+            p_output_mask[i] = p_output_high[i] == 1 ? 0xffffffff : 0x00000000;
+        }
+
         std::vector<primitive> pipeline;
-        auto binarization_desc = binarization_forward::desc(prop_kind::forward_training, p.alg_kind, src_desc, weights_desc, dst_desc);
+        auto binarization_desc = binarization_forward::desc(prop_kind::forward_training, p.alg_kind, src_desc, weights_desc, output_high_desc, dst_desc);
         auto binarization_prim_desc = binarization_forward::primitive_desc(binarization_desc, eng);
-        auto binarization = binarization_forward(binarization_prim_desc, src.get(), weights.get(), dst.get());
+        auto binarization = binarization_forward(binarization_prim_desc, src.get(), weights.get(), output_mask.get(), dst.get());
 
         pipeline.push_back(binarization);
         auto s = stream(stream::kind::lazy);
         s.submit(pipeline).wait();
 
-        check_binarization_fwd(p, src_desc, src.get(), weights.get(), dst.get());
+        check_binarization_fwd(p, src_desc, src.get(), weights.get(), output_low.get(), output_high.get(), dst.get());
     }
 };
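
The fill loop above pins the three new buffers to a single degree of freedom: output_low and output_high are forced to opposite 0/1 values per channel, and output_mask is the 32-bit image of output_high. That makes the primitive's (res == out_mask) bit equal to the reference's ((s_val > w_val) ? out_high : out_low). A compact restatement of the invariant:

    #include <cassert>
    #include <cstdint>

    // Illustrative restatement of the test's fill loop and its invariant.
    static void fill_masks(float *low, float *high, uint32_t *mask, int C) {
        for (int i = 0; i < C; ++i) {
            low[i]  = (low[i] >= 0.f) ? 1.f : 0.f;
            high[i] = (low[i] == 1.f) ? 0.f : 1.f;
            mask[i] = (high[i] == 1.f) ? 0xffffffff : 0x00000000;
            assert(low[i] != high[i]); // exactly one polarity per channel
        }
    }
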
 
index acdd555..52abd42 100644 (file)
@@ -62,6 +62,33 @@ INST_TEST_CASE(SimpleSmall_Blocked_1x1_Padded_Channels,
         2, 1, 256, 3, 3, 256, 3, 3, 1, 1, 0, 0, 1, 1)
 );
 
+INST_TEST_CASE(SimpleSmall_Blocked16_Padded_Channels,
+    PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 10, 10, 3, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 10, 10, 41, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 47, 10, 10, 137, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 256, 10, 10, 256, 10, 10, 3, 3, 1, 1, 1, 1)
+);
+
+INST_TEST_CASE(SimpleSmall_Blocked16_1x1_Padded_Channels,
+    PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 10, 10, 3, 10, 10, 1, 1, 0, 0, 1, 1),
+    PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 13, 3, 3, 32, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 13, 3, 3, 41, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 47, 3, 3, 137, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 256, 3, 3, 256, 3, 3, 1, 1, 0, 0, 1, 1)
+);
+
+
 //INST_TEST_CASE(SimpleSmall_Depthwise_Blocked_Padded_Channels,
 //    PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED,
 //        2, 32, 32, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1),
index 3293371..98478d3 100644 (file)
@@ -63,6 +63,33 @@ INST_TEST_CASE(SimpleSmall_Blocked_1x1_Padded_Channels,
         2, 1, 256, 3, 3, 256, 3, 3, 1, 1, 0, 0, 1, 1)
 );
 
+INST_TEST_CASE(SimpleSmall_Blocked16_Padded_Channels,
+    PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 10, 10, 3, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 10, 10, 41, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 47, 10, 10, 137, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 256, 10, 10, 256, 10, 10, 3, 3, 1, 1, 1, 1)
+);
+
+INST_TEST_CASE(SimpleSmall_Blocked16_1x1_Padded_Channels,
+    PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 10, 10, 3, 10, 10, 1, 1, 0, 0, 1, 1),
+    PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 13, 3, 3, 32, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 13, 3, 3, 41, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 47, 3, 3, 137, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 256, 3, 3, 256, 3, 3, 1, 1, 0, 0, 1, 1)
+);
+
+
 //INST_TEST_CASE(SimpleSmall_Depthwise_Blocked_Padded_Channels,
 //    PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED,
 //        2, 32, 32, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1),
index 8d0019a..1416312 100644 (file)
@@ -53,4 +53,20 @@ INST_TEST_CASE(Mobilenet_Blocked,
            2, 288, 2, 4,  240, 1, 1, 0, 0, 1, 1,  240, 3, 3, 1, 1, 1, 1)  // 5_3
 );
 
+INST_TEST_CASE(SimpleSmall_Blocked16,
+    PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           1, 19, 5, 5,  77, 1, 1, 0, 0, 1, 1,  77, 3, 3, 1, 1, 1, 1)
+);
+
+INST_TEST_CASE(Mobilenet_Blocked16,
+    PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 8, 19, 33,  56, 3, 3, 1, 1, 2, 2,  56, 3, 3, 1, 1, 1, 1), // 1_1
+    PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 56, 9, 16,  112, 1, 1, 0, 0, 1, 1,  112, 3, 3, 1, 1, 1, 1), // 2_2
+    PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 112, 9, 16,  112, 1, 1, 0, 0, 1, 1,  112, 3, 3, 1, 1, 2, 2), // 3_1
+    PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 288, 2, 4,  240, 1, 1, 0, 0, 1, 1,  240, 3, 3, 1, 1, 1, 1)  // 5_3
+);
+
 }
index 23c7ab1..070dce0 100644 (file)
@@ -43,4 +43,9 @@ INST_TEST_CASE(SimpleSmall_Blocked,
            1, 7, 10, 10,  37, 1, 1, 0, 0, 1, 1,  37, 3, 3, 1, 1, 1, 1)
 );
 
+INST_TEST_CASE(SimpleSmall_Blocked16,
+    PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           1, 7, 10, 10,  37, 1, 1, 0, 0, 1, 1,  37, 3, 3, 1, 1, 1, 1)
+);
+
 }
index acbdb23..9be9229 100644 (file)
@@ -52,4 +52,9 @@ INST_TEST_CASE(Mobilenet_Blocked,
            1, 7, 10, 10,  37, 1, 1, 0, 0, 1, 1,  37, 3, 3, 1, 1, 2, 2)
 );
 
+INST_TEST_CASE(Mobilenet_Blocked16,
+    PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           1, 7, 10, 10,  37, 1, 1, 0, 0, 1, 1,  37, 3, 3, 1, 1, 2, 2)
+);
+
 }
index c813834..e048fbf 100644 (file)
@@ -58,4 +58,29 @@ INST_TEST_CASE(Mobilenet_Blocked,
            2, 240, 2, 4,  264, 1, 1, 0, 0, 1, 1,  264, 3, 3, 1, 1, 1, 1)   // 5_4
 );
 
+INST_TEST_CASE(Mobilenet_Blocked16,
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 8, 19, 33,  56, 3, 3, 1, 1, 2, 2,  56, 3, 3, 1, 1, 1, 1), // 1_1
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 32, 19, 33,  56, 1, 1, 0, 0, 1, 1,  56, 3, 3, 1, 1, 2, 2), // 2_1
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 56, 9, 16,  112, 1, 1, 0, 0, 1, 1,  112, 3, 3, 1, 1, 1, 1), // 2_2
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 112, 9, 16,  112, 1, 1, 0, 0, 1, 1,  112, 3, 3, 1, 1, 2, 2), // 3_1
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 112, 4, 8,  208, 1, 1, 0, 0, 1, 1,  208, 3, 3, 1, 1, 1, 1),  // 3_2
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 208, 4, 8,  216, 1, 1, 0, 0, 1, 1,  216, 3, 3, 1, 1, 2, 2),  // 4_1
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 216, 2, 4,  328, 1, 1, 0, 0, 1, 1,  328, 3, 3, 1, 1, 1, 1),  // 4_2
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 328, 2, 4,  288, 1, 1, 0, 0, 1, 1,  288, 3, 3, 1, 1, 1, 1),  // 5_1
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 288, 2, 4,  288, 1, 1, 0, 0, 1, 1,  288, 3, 3, 1, 1, 1, 1),  // 5_2
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 288, 2, 4,  240, 1, 1, 0, 0, 1, 1,  240, 3, 3, 1, 1, 1, 1),  // 5_3
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 240, 2, 4,  264, 1, 1, 0, 0, 1, 1,  264, 3, 3, 1, 1, 1, 1)   // 5_4
+);
+
 }
index b84f715..2f8f9ae 100644 (file)
@@ -306,13 +306,18 @@ void compute_ref_dw_conv_fwd(const test_binary_convolution_dw_conv_params_t &p,
 }
 
 void compute_ref_binarization_fwd(const test_binary_convolution_dw_conv_params_t &p,
-    const memory::desc &src_md, const memory &src, const memory &weights, const memory &dst) {
+    const memory::desc &src_md, const memory &src,
+    const memory &weights, const memory &output_low, const memory &output_high, const memory &dst) {
     auto src_data = (float*)src.get_data_handle();
     auto weights_data = (float*)weights.get_data_handle();
+    auto output_low_data = (float*)output_low.get_data_handle();
+    auto output_high_data = (float*)output_high.get_data_handle();
     auto dst_data = (uint8_t*)dst.get_data_handle();
 
     const memory::desc src_d = src.get_primitive_desc().desc();
     const memory::desc weights_d = weights.get_primitive_desc().desc();
+    const memory::desc output_low_d = output_low.get_primitive_desc().desc();
+    const memory::desc output_high_d = output_high.get_primitive_desc().desc();
     const memory::desc dst_d = dst.get_primitive_desc().desc();
 
     int N = src_md.data.ndims > 0 ? src_md.data.dims[0] : 1;
@@ -338,8 +343,10 @@ void compute_ref_binarization_fwd(const test_binary_convolution_dw_conv_params_t
 
                         float s_val = src_data[map_index(src_d, src_idx)];
                         float w_val = weights_data[map_index(weights_d, wei_idx)];
+                        float out_low = output_low_data[map_index(output_low_d, wei_idx)];
+                        float out_high = output_high_data[map_index(output_high_d, wei_idx)];
 
-                        auto bit = uint8_t((s_val > w_val) ? 0x01 : 0x00);
+                        auto bit = uint8_t((s_val > w_val) ? out_high : out_low);
                         bin_val |= (bit << shift);
                     }
 
@@ -467,11 +474,33 @@ protected:
         auto dw_conv_binarization_weights_desc = create_md({ cd.conv2_oc }, memory::data_type::f32, memory::x);
         auto dw_conv_binarization_weights = memory({dw_conv_binarization_weights_desc, eng});
 
+        auto dw_conv_binarization_output_low_desc = create_md({ cd.conv2_oc }, memory::data_type::f32, memory::x);
+        auto dw_conv_binarization_output_low = memory({dw_conv_binarization_output_low_desc, eng});
+
+        auto dw_conv_binarization_output_high_desc = create_md({ cd.conv2_oc }, memory::data_type::f32, memory::x);
+        auto dw_conv_binarization_output_high = memory({dw_conv_binarization_output_high_desc, eng});
+
+        auto dw_conv_binarization_output_mask_desc = create_md({ cd.conv2_oc }, memory::data_type::f32, memory::x);
+        auto dw_conv_binarization_output_mask = memory({dw_conv_binarization_output_mask_desc, eng});
+
         if (p.binarization_algorithm != algorithm_undef) {
             fill_data<float>(dw_conv_binarization_weights.get_primitive_desc().get_size() / sizeof(float),
                              (float *)dw_conv_binarization_weights.get_data_handle(), 0.f, p.sizes.conv2_oc * p.sizes.conv2_kh * p.sizes.conv2_kw);
 
-            bin_conv_post_ops.append_binarization(p.binarization_algorithm, static_cast<const float*>(dw_conv_binarization_weights.get_data_handle()));
+            fill_data<float>(dw_conv_binarization_output_low.get_primitive_desc().get_size() / sizeof(float),
+                             (float *)dw_conv_binarization_output_low.get_data_handle(), 0.f, 1.f);
+
+            float* p_output_low = (float *)dw_conv_binarization_output_low.get_data_handle();
+            float* p_output_high = (float *)dw_conv_binarization_output_high.get_data_handle();
+            uint32_t* p_output_mask = (uint32_t *)dw_conv_binarization_output_mask.get_data_handle();
+            for (int i = 0; i < cd.conv2_oc; i++) {
+                p_output_low[i] = p_output_low[i] >= 0 ? 1 : 0;
+                p_output_high[i] = p_output_low[i] == 1 ? 0 : 1;
+                p_output_mask[i] = p_output_high[i] == 1 ? 0xffffffff : 0x00000000;
+            }
+
+            bin_conv_post_ops.append_binarization(p.binarization_algorithm, static_cast<const float*>(dw_conv_binarization_weights.get_data_handle()),
+                                                                            static_cast<const float*>(dw_conv_binarization_output_mask.get_data_handle()));
         }
 
         mkldnn::primitive_attr bin_conv_attr;
@@ -497,7 +526,8 @@ protected:
 
             auto ref_binarization_dst = test_memory(dw_conv_dst_desc, eng);
 
-            compute_ref_binarization_fwd(p, ref_dw_conv_dst_desc, ref_dw_conv_dst.get(), dw_conv_binarization_weights, ref_binarization_dst.get());
+            compute_ref_binarization_fwd(p, ref_dw_conv_dst_desc, ref_dw_conv_dst.get(), dw_conv_binarization_weights,
+                    dw_conv_binarization_output_low, dw_conv_binarization_output_high, ref_binarization_dst.get());
 
             std::vector<primitive> pipeline;
             pipeline.push_back(bin_conv);
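
For readers tracking the API change: in this fork's C++ test helpers, append_binarization now receives the output-mask data alongside the thresholds. A hedged sketch of wiring it into a primitive_attr, assuming the fork's mkldnn.hpp and the signature as used above:

    #include "mkldnn.hpp"

    // `append_binarization` is specific to this mkldnn fork; the two
    // pointers are per-OC float thresholds and per-OC 32-bit output masks,
    // filled exactly as in the test loop above.
    static mkldnn::primitive_attr make_binarization_attr(
            mkldnn::algorithm alg,
            const mkldnn::memory &thresholds,
            const mkldnn::memory &output_mask) {
        mkldnn::post_ops ops;
        ops.append_binarization(alg,
            static_cast<const float*>(thresholds.get_data_handle()),
            static_cast<const float*>(output_mask.get_data_handle()));
        mkldnn::primitive_attr attr;
        attr.set_post_ops(ops);
        return attr;
    }
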
index 7e0bcae..914b9e7 100644 (file)
@@ -64,4 +64,35 @@ INST_TEST_CASE(Mobilenet_Blocked,
            2, 240, 2, 4,  264, 1, 1, 0, 0, 1, 1,  264, 3, 3, 1, 1, 1, 1)   // 5_4
 );
 
+
+INST_TEST_CASE(SimpleSmall_Blocked16,
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           1, 7, 10, 10,  37, 1, 1, 0, 0, 1, 1,  37, 3, 3, 1, 1, 1, 1)
+);
+
+INST_TEST_CASE(Mobilenet_Blocked16,
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 8, 19, 33,  56, 3, 3, 1, 1, 2, 2,  56, 3, 3, 1, 1, 1, 1), // 1_1
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 32, 19, 33,  56, 1, 1, 0, 0, 1, 1,  56, 3, 3, 1, 1, 2, 2), // 2_1
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 56, 9, 16,  112, 1, 1, 0, 0, 1, 1,  112, 3, 3, 1, 1, 1, 1), // 2_2
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 112, 9, 16,  112, 1, 1, 0, 0, 1, 1,  112, 3, 3, 1, 1, 2, 2), // 3_1
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 112, 4, 8,  208, 1, 1, 0, 0, 1, 1,  208, 3, 3, 1, 1, 1, 1), // 3_2
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 208, 4, 8,  216, 1, 1, 0, 0, 1, 1,  216, 3, 3, 1, 1, 2, 2),  // 4_1
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 216, 2, 4,  328, 1, 1, 0, 0, 1, 1,  328, 3, 3, 1, 1, 1, 1),  // 4_2
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 328, 2, 4,  288, 1, 1, 0, 0, 1, 1,  288, 3, 3, 1, 1, 1, 1),  // 5_1
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 288, 2, 4,  288, 1, 1, 0, 0, 1, 1,  288, 3, 3, 1, 1, 1, 1),  // 5_2
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 288, 2, 4,  240, 1, 1, 0, 0, 1, 1,  240, 3, 3, 1, 1, 1, 1),  // 5_3
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+           2, 240, 2, 4,  264, 1, 1, 0, 0, 1, 1,  264, 3, 3, 1, 1, 1, 1)   // 5_4
+);
+
 }
index 74dcc03..c8e96d0 100644 (file)
@@ -68,6 +68,28 @@ INST_TEST_CASE(SimpleSmall_Blocked_1x1_Padded_Channels,
         2, 1, 14, 3, 3, 256, 3, 3, 1, 1, 0, 0, 1, 1)
 );
 
+INST_TEST_CASE(SimpleSmall_Blocked16_Padded_Channels,
+    PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 10, 10, 3, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 5, 10, 10, 137, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 4, 10, 10, 256, 10, 10, 3, 3, 1, 1, 1, 1)
+);
+
+INST_TEST_CASE(SimpleSmall_Blocked16_1x1_Padded_Channels,
+    PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 10, 10, 3, 10, 10, 1, 1, 0, 0, 1, 1),
+    PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 13, 3, 3, 41, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 15, 3, 3, 137, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 14, 3, 3, 256, 3, 3, 1, 1, 0, 0, 1, 1)
+);
+
 //INST_TEST_CASE(SimpleSmall_Depthwise_Blocked_Padded_Channels,
 //    PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED,
 //        2, 32, 32, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1),
index 0dcc326..c5a6984 100644 (file)
@@ -66,6 +66,40 @@ INST_TEST_CASE(SimpleSmall_Blocked_1x1_Padded_Channels,
         2, 1, 111, 13, 13, 71, 13, 13, 1, 1, 0, 0, 1, 1)
 );
 
+INST_TEST_CASE(SimpleSmall_Blocked16_Padded_Channels,
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 7, 3, 3, 5, 3, 3, 1, 1, 0, 0, 1, 1, 0),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 15, 3, 3, 37, 4, 4, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 14, 4, 4, 1, 4, 4, 3, 3, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 7, 3, 3, 33, 3, 3, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 19, 2, 2, 22, 2, 2, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 126, 13, 13, 126, 13, 13, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 77, 13, 13, 99, 11, 11, 3, 3, 0, 0, 1, 1)
+);
+
+INST_TEST_CASE(SimpleSmall_Blocked16_1x1_Padded_Channels,
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 13, 13, 35, 13, 13, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 7, 3, 3, 11, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 1, 4, 4, 58, 4, 4, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 27, 3, 3, 33, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 81, 2, 2, 81, 2, 2, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 126, 13, 13, 13, 13, 13, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 111, 13, 13, 71, 13, 13, 1, 1, 0, 0, 1, 1)
+);
+
 //INST_TEST_CASE(SimpleSmall_Depthwise_Blocked_Padded_Channels,
 //    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
 //        2, 126, 126, 10, 10, 126, 10, 10, 3, 3, 1, 1, 1, 1),
index bef6e15..f9127f5 100644 (file)
@@ -169,13 +169,18 @@ void compute_ref_bin_conv_fwd(const test_binary_convolution_params_t &p,
 }
 
 void compute_ref_binarization_fwd(const test_binary_convolution_params_t &p,
-    const memory::desc &src_md, const memory &src, const memory &weights, const memory &dst) {
+    const memory::desc &src_md, const memory &src, const memory &weights,
+    const memory &output_low, const memory &output_high, const memory &dst) {
     auto src_data = (float*)src.get_data_handle();
     auto weights_data = (float*)weights.get_data_handle();
+    auto output_low_data = (float*)output_low.get_data_handle();
+    auto output_high_data = (float*)output_high.get_data_handle();
     auto dst_data = (uint8_t*)dst.get_data_handle();
 
     const memory::desc src_d = src.get_primitive_desc().desc();
     const memory::desc weights_d = weights.get_primitive_desc().desc();
+    const memory::desc output_low_d = output_low.get_primitive_desc().desc();
+    const memory::desc output_high_d = output_high.get_primitive_desc().desc();
     const memory::desc dst_d = dst.get_primitive_desc().desc();
 
     int N = src_md.data.ndims > 0 ? src_md.data.dims[0] : 1;
@@ -201,8 +206,10 @@ void compute_ref_binarization_fwd(const test_binary_convolution_params_t &p,
 
                         float s_val = src_data[map_index(src_d, src_idx)];
                         float w_val = weights_data[map_index(weights_d, wei_idx)];
+                        float out_low = output_low_data[map_index(output_low_d, wei_idx)];
+                        float out_high = output_high_data[map_index(output_high_d, wei_idx)];
 
-                        auto bit = uint8_t((s_val > w_val) ? 0x01 : 0x00);
+                        auto bit = uint8_t((s_val > w_val) ? out_high : out_low);
                         bin_val |= (bit << shift);
                     }
 
@@ -292,11 +299,33 @@ protected:
         auto c_binarization_weights_desc = create_md({ cd.oc }, memory::data_type::f32, memory::x);
         auto c_binarization_weights = memory({c_binarization_weights_desc, eng});
 
+        auto c_binarization_output_low_desc = create_md({ cd.oc }, memory::data_type::f32, memory::x);
+        auto c_binarization_output_low = memory({c_binarization_output_low_desc, eng});
+
+        auto c_binarization_output_high_desc = create_md({ cd.oc }, memory::data_type::f32, memory::x);
+        auto c_binarization_output_high = memory({c_binarization_output_high_desc, eng});
+
+        auto c_binarization_output_mask_desc = create_md({ cd.oc }, memory::data_type::f32, memory::x);
+        auto c_binarization_output_mask = memory({c_binarization_output_mask_desc, eng});
+
         if (p.binarization_algorithm != algorithm_undef) {
             fill_data<float>(c_binarization_weights.get_primitive_desc().get_size() / sizeof(float),
                              (float *)c_binarization_weights.get_data_handle(), 1., true);
 
-            ops.append_binarization(p.binarization_algorithm, static_cast<const float*>(c_binarization_weights.get_data_handle()));
+            fill_data<float>(c_binarization_output_low.get_primitive_desc().get_size() / sizeof(float),
+                             (float *)c_binarization_output_low.get_data_handle(), 1., true);
+
+            float* p_output_low = (float *)c_binarization_output_low.get_data_handle();
+            float* p_output_high = (float *)c_binarization_output_high.get_data_handle();
+            uint32_t* p_output_mask = (uint32_t *)c_binarization_output_mask.get_data_handle();
+            for (int i = 0; i < cd.oc; i++) {
+                p_output_low[i] = p_output_low[i] >= 0 ? 1 : 0;
+                p_output_high[i] = p_output_low[i] == 1 ? 0 : 1;
+                p_output_mask[i] = p_output_high[i] == 1 ? 0xffffffff : 0x00000000;
+            }
+
+            ops.append_binarization(p.binarization_algorithm, static_cast<const float*>(c_binarization_weights.get_data_handle()),
+                                                              static_cast<const float*>(c_binarization_output_mask.get_data_handle()));
         }
 
         mkldnn::primitive_attr attr;
@@ -320,7 +349,8 @@ protected:
                                      c_src.get(), c_weights.get(), ref_conv_memory,
                                      c_depthwise_weights, c_depthwise_bias);
 
-            compute_ref_binarization_fwd(p, c_dst_desc_ref, ref_conv_memory, c_binarization_weights, ref_memory);
+            compute_ref_binarization_fwd(p, c_dst_desc_ref, ref_conv_memory, c_binarization_weights,
+                    c_binarization_output_low, c_binarization_output_high, ref_memory);
 
             std::vector<primitive> pipeline;
             pipeline.push_back(bin_conv);
index 1a9a548..d7e003e 100644 (file)
@@ -59,6 +59,32 @@ INST_TEST_CASE(SimpleSmall_Blocked_1x1_Padded_Channels,
         2, 1, 256, 3, 3, 256, 3, 3, 1, 1, 0, 0, 1, 1)
 );
 
+INST_TEST_CASE(SimpleSmall_Blocked16_Padded_Channels,
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 10, 10, 3, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 10, 10, 41, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 47, 10, 10, 137, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 256, 10, 10, 256, 10, 10, 3, 3, 1, 1, 1, 1)
+);
+
+INST_TEST_CASE(SimpleSmall_Blocked16_1x1_Padded_Channels,
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 10, 10, 3, 10, 10, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 13, 3, 3, 32, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 13, 3, 3, 41, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 47, 3, 3, 137, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 256, 3, 3, 256, 3, 3, 1, 1, 0, 0, 1, 1)
+);
+
 //INST_TEST_CASE(SimpleSmall_Depthwise_Blocked_Padded_Channels,
 //    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED,
 //        2, 32, 32, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1),
index d90717c..2b27602 100644 (file)
@@ -157,4 +157,4 @@ endif()
 endif()
 ## enable jit_gemm from mlk-dnn
 
-target_link_libraries(${TARGET} PRIVATE ${${TARGET}_LINKER_LIBS})
+target_link_libraries(${TARGET} PRIVATE ${${TARGET}_LINKER_LIBS})
\ No newline at end of file
diff --git a/inference-engine/thirdparty/movidius/CMakeLists.txt b/inference-engine/thirdparty/movidius/CMakeLists.txt
new file mode 100644 (file)
index 0000000..caba774
--- /dev/null
@@ -0,0 +1,11 @@
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+add_subdirectory(
+    "${IE_MAIN_SOURCE_DIR}/thirdparty/movidius/XLink"
+    "${CMAKE_BINARY_DIR}/thirdparty/movidius/XLink")
+
+add_subdirectory(
+    "${IE_MAIN_SOURCE_DIR}/thirdparty/movidius/mvnc"
+    "${CMAKE_BINARY_DIR}/thirdparty/movidius/mvnc")
diff --git a/inference-engine/thirdparty/movidius/MovidiusDriver/Movidius_VSC_Device.inf b/inference-engine/thirdparty/movidius/MovidiusDriver/Movidius_VSC_Device.inf
new file mode 100644 (file)
index 0000000..d8b5176
Binary files /dev/null and b/inference-engine/thirdparty/movidius/MovidiusDriver/Movidius_VSC_Device.inf differ
diff --git a/inference-engine/thirdparty/movidius/MovidiusDriver/amd64/WdfCoInstaller01011.dll b/inference-engine/thirdparty/movidius/MovidiusDriver/amd64/WdfCoInstaller01011.dll
new file mode 100644 (file)
index 0000000..f2f7f4a
Binary files /dev/null and b/inference-engine/thirdparty/movidius/MovidiusDriver/amd64/WdfCoInstaller01011.dll differ
diff --git a/inference-engine/thirdparty/movidius/MovidiusDriver/amd64/winusbcoinstaller2.dll b/inference-engine/thirdparty/movidius/MovidiusDriver/amd64/winusbcoinstaller2.dll
new file mode 100644 (file)
index 0000000..d6da866
Binary files /dev/null and b/inference-engine/thirdparty/movidius/MovidiusDriver/amd64/winusbcoinstaller2.dll differ
diff --git a/inference-engine/thirdparty/movidius/MovidiusDriver/movidius_vsc_device.cat b/inference-engine/thirdparty/movidius/MovidiusDriver/movidius_vsc_device.cat
new file mode 100644 (file)
index 0000000..8029508
Binary files /dev/null and b/inference-engine/thirdparty/movidius/MovidiusDriver/movidius_vsc_device.cat differ
diff --git a/inference-engine/thirdparty/movidius/USB_WIN/gettime.c b/inference-engine/thirdparty/movidius/USB_WIN/gettime.c
new file mode 100644 (file)
index 0000000..03d3d1b
--- /dev/null
@@ -0,0 +1,31 @@
+/*
+* Copyright 2017-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+//#include "windows.h"
+#include <time.h>
+#include "gettime.h"
+
+int clock_gettime(int dummy, struct timespec *spec)
+{
+       (void)dummy;  // silence the unreferenced formal parameter warning
+       __int64 wintime; GetSystemTimeAsFileTime((FILETIME*)&wintime);
+       wintime -= 116444736000000000i64;  // 100 ns ticks from 1 Jan 1601 (FILETIME epoch) to 1 Jan 1970 (Unix epoch)
+       spec->tv_sec = wintime / 10000000i64;             // whole seconds
+       spec->tv_nsec = wintime % 10000000i64 * 100;      // remainder in nanoseconds
+       return 0;
+}
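
The magic constant above is the number of 100 ns ticks between the FILETIME epoch (1 Jan 1601) and the Unix epoch (1 Jan 1970): 11,644,473,600 seconds times 10^7 ticks per second = 116,444,736,000,000,000. A portable restatement of the conversion:

    #include <cstdint>

    // Convert a FILETIME value (100 ns ticks since 1601) into Unix-epoch
    // seconds and nanoseconds, as the shim above does.
    static void filetime_to_timespec(uint64_t wintime_100ns,
                                     long long *sec, long *nsec) {
        const uint64_t EPOCH_DIFF_100NS = 116444736000000000ULL;
        wintime_100ns -= EPOCH_DIFF_100NS;
        *sec  = (long long)(wintime_100ns / 10000000ULL);
        *nsec = (long)(wintime_100ns % 10000000ULL) * 100;
    }
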
diff --git a/inference-engine/thirdparty/movidius/USB_WIN/gettime.h b/inference-engine/thirdparty/movidius/USB_WIN/gettime.h
new file mode 100644 (file)
index 0000000..a8bd958
--- /dev/null
@@ -0,0 +1,33 @@
+/*
+* Copyright 2017-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#pragma once
+#include "windows.h"
+#define CLOCK_REALTIME         0
+#define CLOCK_MONOTONIC                0
+#define sleep(x)                       Sleep((DWORD)((x)*1000))   /* POSIX sleep takes seconds; Sleep takes ms */
+#define usleep(x)                      Sleep((DWORD)((x)/1000))   /* microseconds to milliseconds */
+
+
+int clock_gettime(int, struct timespec *);
+#ifdef __cplusplus
+}
+#endif
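
A hedged usage sketch of the shim: POSIX-style timing code compiles unchanged on Windows, with both clock IDs mapping to the same FILETIME-based wall clock (so CLOCK_MONOTONIC here is not actually monotonic under system clock adjustments):

    #include <time.h>
    #include "gettime.h"

    // Time a callback with the shim's clock_gettime.
    static double elapsed_ms(void (*work)(void)) {
        struct timespec t0, t1;
        clock_gettime(CLOCK_MONOTONIC, &t0);
        work();
        clock_gettime(CLOCK_MONOTONIC, &t1);
        return (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) / 1e6;
    }
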
diff --git a/inference-engine/thirdparty/movidius/USB_WIN/usb_winusb.c b/inference-engine/thirdparty/movidius/USB_WIN/usb_winusb.c
new file mode 100644 (file)
index 0000000..e11a4a6
--- /dev/null
@@ -0,0 +1,532 @@
+/*
+* Copyright 2017-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+#pragma comment(lib, "winusb.lib")
+#pragma comment(lib, "setupapi.lib")
+//#define _CRT_SECURE_NO_WARNINGS
+
+#define INITGUID
+#include <Windows.h>
+#include <winusb.h>
+#include <Usbiodef.h>
+#include <SetupAPI.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include "usb_winusb.h"
+
+#define USB_DIR_OUT            0
+#define USB_DIR_IN             1
+
+#define USB_DEV_NONE   NULL
+#define USB_HAN_NONE   NULL
+
+#define USB_ERR_NONE           0
+#define USB_ERR_TIMEOUT                -1
+#define USB_ERR_FAILED         -2
+#define USB_ERR_INVALID                -3
+
+
+
+struct ep_info {
+       uint8_t ep;
+       size_t sz;
+       ULONG last_timeout;
+};
+struct _usb_han {
+       HANDLE devHan;
+       WINUSB_INTERFACE_HANDLE winUsbHan;
+       struct ep_info eps[2];
+};
+
+extern const char * usb_get_pid_name(int);
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+#define snprintf _snprintf
+#endif
+
+// Myriad 2: {19E08104-0543-40A5-B107-87EF463DCEF1}
+DEFINE_GUID(GUID_DEVINTERFACE_Myriad2, 0x19e08104, 0x0543, 0x40a5,
+       0xb1, 0x07, 0x87, 0xef, 0x46, 0x3d, 0xce, 0xf1);
+
+// Myriad X: {504E1220-E189-413A-BDEC-ECFFAF3A3731}
+DEFINE_GUID(GUID_DEVINTERFACE_MyriadX, 0x504e1220, 0xe189, 0x413a,
+       0xbd, 0xec, 0xec, 0xff, 0xaf, 0x3a, 0x37, 0x31);
+
+static FILE *msgfile = NULL;
+static int verbose = 0, ignore_errors = 0;
+static DWORD last_bulk_errcode = 0;
+static char *errmsg_buff = NULL;
+static size_t errmsg_buff_len = 0;
+
+static const char *format_win32_msg(DWORD errId) {
+       while(!FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+               NULL, errId, 0, errmsg_buff, (DWORD)errmsg_buff_len, NULL)) {
+               if(GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+err_fail:
+                       snprintf(errmsg_buff, errmsg_buff_len, "Win32 Error 0x%08lx (Unable to retrieve error message)", errId);
+                       return errmsg_buff;
+               }
+               size_t nlen = errmsg_buff_len + (errmsg_buff_len / 2);
+               if(nlen > 1024)
+                       goto err_fail;
+               char *nbuff = realloc(errmsg_buff, nlen);
+               if(nbuff == NULL)
+                       goto err_fail;
+               errmsg_buff = nbuff;
+               errmsg_buff_len = nlen;
+       }
+       return errmsg_buff;
+}
+
+static void wperror(const char *errmsg) {
+       DWORD errId = GetLastError();
+       fprintf(stderr, "%s: System err %lu\n", errmsg, errId);
+}
+
+static void wstrerror(char *buff, const char *errmsg) {
+       DWORD errId = GetLastError();
+       snprintf(buff,strlen(buff), "%s: %s\n", errmsg, format_win32_msg(errId));
+}
+const char* libusb_strerror(int x)
+{
+       return format_win32_msg(x);
+}
+int usb_init(void) {
+       msgfile = stdout;
+       if(errmsg_buff == NULL) {
+               errmsg_buff_len = 64;
+               errmsg_buff = malloc(errmsg_buff_len);
+               if(errmsg_buff == NULL) {
+                       perror("malloc");
+                       return -1;
+               }
+       }
+       return 0;
+}
+
+void usb_shutdown(void) {
+       if(errmsg_buff != NULL) {
+               free(errmsg_buff);
+               errmsg_buff = NULL;
+       }
+}
+
+int usb_can_find_by_guid(void) {
+       return 1;
+}
+
+static usb_dev retreive_dev_path(HDEVINFO devInfo, SP_DEVICE_INTERFACE_DATA *ifaceData) {
+       usb_dev res;
+       PSP_DEVICE_INTERFACE_DETAIL_DATA detData;
+       ULONG len, reqLen;
+
+       if(!SetupDiGetDeviceInterfaceDetail(devInfo, ifaceData, NULL, 0, &reqLen, NULL)) {
+               if(GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+                       wperror("SetupDiEnumDeviceInterfaces");
+                       SetupDiDestroyDeviceInfoList(devInfo);
+                       return USB_DEV_NONE;
+               }
+       }
+       detData = (PSP_DEVICE_INTERFACE_DETAIL_DATA)_alloca(reqLen);
+       detData->cbSize = sizeof(*detData);
+       len = reqLen;
+       if(!SetupDiGetDeviceInterfaceDetail(devInfo, ifaceData, detData, len, &reqLen, NULL)) {
+               wperror("SetupDiGetDeviceInterfaceDetail");
+               SetupDiDestroyDeviceInfoList(devInfo);
+               return USB_DEV_NONE;
+       }
+       res = _strdup(detData->DevicePath);
+       if(res == NULL) {
+               perror("strdup");
+       }
+       SetupDiDestroyDeviceInfoList(devInfo);
+       return res;
+}
+
+static const char *gen_addr(HDEVINFO devInfo, SP_DEVINFO_DATA *devInfoData, uint16_t pid) {
+    static char buff[16];
+    char li_buff[128];
+    unsigned int port, hub;
+    if (!SetupDiGetDeviceRegistryProperty(devInfo, devInfoData, SPDRP_LOCATION_INFORMATION, NULL, li_buff, sizeof(li_buff), NULL))
+    {
+        goto ret_err;
+    }
+       if(sscanf(li_buff, "Port_#%u.Hub_#%u", &port, &hub) != 2)
+        goto ret_err;
+
+       // match the address format produced by the libusb-based output
+       const char* dev_name = usb_get_pid_name(pid);
+       if(dev_name == NULL)
+               goto ret_err;
+
+       snprintf(buff, sizeof(buff), "%u.%u-%s", hub, port, dev_name);
+    buff[sizeof(buff) - 1] = '\0';
+       return buff;
+ret_err:
+    return "<error>";
+}
+extern DEFAULT_OPENPID;
+
+static int compareDeviceByHubAndPort(const void *l, const void *r) {
+    int lHub = 0, lPort = 0;
+    int rHub = 0, rPort = 0;
+
+    if (sscanf(((const char *)l + 4), "%d.%d", &lHub, &lPort) == EOF) {
+        perror("Cannot parse hub and port of the devices");
+    }
+    if (sscanf(((const char *)r + 4), "%d.%d", &rHub, &rPort) == EOF) {
+        perror("Cannot parse hub and port of the devices");
+    }
+
+    if (lHub != rHub) {
+        return rHub - lHub;
+    }
+
+    return rPort - lPort;
+}
+
+int usb_list_devices(uint16_t vid, uint16_t pid, uint8_t dev_des[][2 + 2 + 4 * 7 + 7]) {
+       HDEVINFO devInfo;
+       static int i;
+       SP_DEVINFO_DATA devInfoData;
+       char hwid_buff[128];
+
+       devInfoData.cbSize = sizeof(devInfoData);
+
+       devInfo = SetupDiGetClassDevs(&GUID_DEVINTERFACE_USB_DEVICE, NULL, NULL, DIGCF_PRESENT | DIGCF_DEVICEINTERFACE);
+       if(devInfo == INVALID_HANDLE_VALUE) {
+               wperror("SetupDiGetClassDevs");
+               return -1;
+       }
+
+    for (i=0; SetupDiEnumDeviceInfo(devInfo, i, &devInfoData); i++) {
+        if (!SetupDiGetDeviceRegistryProperty(devInfo, &devInfoData, SPDRP_HARDWAREID, NULL, hwid_buff, sizeof(hwid_buff), NULL)) {
+            continue;
+        }
+        uint16_t fvid, fpid;
+        if(sscanf(hwid_buff, "USB\\VID_%hx&PID_%hx", (int16_t *)&fvid, (int16_t *)&fpid) != 2) {
+            continue;
+        }
+
+        dev_des[i][0] = ((fvid & 0xFF00)>>8);
+        dev_des[i][1] = ((fvid & 0x00FF) >> 0);
+        dev_des[i][2] = ((fpid & 0xFF00) >> 8);
+        dev_des[i][3] = ((fpid & 0x00FF) >> 0);
+        sprintf((char *)&dev_des[i][4], "%s", gen_addr(devInfo, &devInfoData, fpid));
+    }
+    SetupDiDestroyDeviceInfoList(devInfo);
+
+    qsort(dev_des, i, sizeof(dev_des[0]), compareDeviceByHubAndPort);
+
+    return i;
+}
+
+void * enumerate_usb_device(uint16_t vid, uint16_t pid, const char *addr, int loud) {
+       HDEVINFO devInfo;
+       SP_DEVICE_INTERFACE_DATA ifaceData;
+       int i;
+       SP_DEVINFO_DATA devInfoData;
+       char hwid_buff[128];
+       int found, found_ind = -1;
+       const char *caddr;
+
+       devInfoData.cbSize = sizeof(devInfoData);
+
+       devInfo = SetupDiGetClassDevs(&GUID_DEVINTERFACE_USB_DEVICE, NULL, NULL, DIGCF_PRESENT | DIGCF_DEVICEINTERFACE);
+       if(devInfo == INVALID_HANDLE_VALUE) {
+               wperror("SetupDiGetClassDevs");
+               return USB_DEV_NONE;
+       }
+       found = 0;
+       for(i=0; SetupDiEnumDeviceInfo(devInfo, i, &devInfoData); i++) {
+               if(!SetupDiGetDeviceRegistryProperty(devInfo, &devInfoData, SPDRP_HARDWAREID, NULL, hwid_buff, sizeof(hwid_buff), NULL))
+                       continue;
+               uint16_t fvid, fpid;
+        if(sscanf(hwid_buff, "USB\\VID_%hx&PID_%hx", (int16_t*)&fvid, (int16_t*)&fpid) != 2)
+                       continue;
+               if(verbose && loud)
+                       fprintf(msgfile, "Vendor/Product ID: %04x:%04x\n", fvid, fpid);
+               if((fvid == vid) && (fpid == pid)) {
+                       caddr = gen_addr(devInfo, &devInfoData, fpid);
+                       if((addr == NULL) || !strcmp(caddr, addr)) {
+                               if(verbose)
+                                       fprintf(msgfile, "Found device with VID/PID %04x:%04x , address %s\n", vid, pid, caddr);
+                               if(!found) {
+                                       found_ind = i;
+                                       found = 1;
+                               }
+                               if(!(verbose && loud))
+                                       break;
+                       }
+               }
+       }
+       if(!found) {
+               SetupDiDestroyDeviceInfoList(devInfo);
+               return USB_DEV_NONE;
+       }
+       if(verbose && loud) {
+               if(!SetupDiEnumDeviceInfo(devInfo, found_ind, &devInfoData)) {
+                       wperror("SetupDiEnumDeviceInfo");
+                       SetupDiDestroyDeviceInfoList(devInfo);
+                       return USB_DEV_NONE;
+               }
+       }
+       ifaceData.cbSize = sizeof(ifaceData);
+       if(!SetupDiEnumDeviceInterfaces(devInfo, &devInfoData, &GUID_DEVINTERFACE_USB_DEVICE, 0, &ifaceData)) {
+               if(GetLastError() != ERROR_NO_MORE_ITEMS) {
+                       wperror("SetupDiEnumDeviceInterfaces");
+               }
+               SetupDiDestroyDeviceInfoList(devInfo);
+               return USB_DEV_NONE;
+       }
+       return retreive_dev_path(devInfo, &ifaceData);
+}
+
+usb_dev findDeviceByGUID(GUID guid, int loud)
+{
+       HDEVINFO devInfo;
+       SP_DEVICE_INTERFACE_DATA ifaceData;
+
+       devInfo = SetupDiGetClassDevs(&guid, NULL, NULL, DIGCF_PRESENT | DIGCF_DEVICEINTERFACE);
+       if (devInfo == INVALID_HANDLE_VALUE) {
+               wperror("SetupDiGetClassDevs");
+               return USB_DEV_NONE;
+       }
+       ifaceData.cbSize = sizeof(ifaceData);
+       if (!SetupDiEnumDeviceInterfaces(devInfo, NULL, &guid, 0, &ifaceData)) {
+               if (GetLastError() != ERROR_NO_MORE_ITEMS) {
+                       wperror("SetupDiEnumDeviceInterfaces");
+               }
+               SetupDiDestroyDeviceInfoList(devInfo);
+               return USB_DEV_NONE;
+       }
+       return retreive_dev_path(devInfo, &ifaceData);
+}
+
+void * usb_find_device_by_guid(int loud) {
+       void *dev = USB_DEV_NONE;
+       //Try Myriad 2
+       dev = findDeviceByGUID(GUID_DEVINTERFACE_Myriad2, loud);
+       if (dev == USB_DEV_NONE)
+       {
+               //Try Myriad X
+               dev = findDeviceByGUID(GUID_DEVINTERFACE_MyriadX, loud);
+       }
+       return dev;
+}
+
+int usb_check_connected(usb_dev dev) {
+       HANDLE han;
+       if(dev == USB_DEV_NONE)
+               return 0;
+       han = CreateFile(dev, 0, FILE_SHARE_WRITE | FILE_SHARE_READ,
+               NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL | FILE_FLAG_OVERLAPPED, NULL);
+       if(han == INVALID_HANDLE_VALUE)
+               return 0;
+       CloseHandle(han);
+       return 1;
+}
+
+void * usb_open_device(usb_dev dev, uint8_t *ep, uint8_t intfaceno, char *err_string_buff) {
+       HANDLE devHan = INVALID_HANDLE_VALUE;
+       WINUSB_INTERFACE_HANDLE winUsbHan = INVALID_HANDLE_VALUE;
+       USB_INTERFACE_DESCRIPTOR ifaceDesc;
+       WINUSB_PIPE_INFORMATION pipeInfo;
+       usb_hwnd han = NULL;
+       int i;
+
+       if(dev == USB_DEV_NONE)
+               return USB_HAN_NONE;
+
+       devHan = CreateFile(dev, GENERIC_WRITE | GENERIC_READ, FILE_SHARE_WRITE | FILE_SHARE_READ,
+               NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL | FILE_FLAG_OVERLAPPED, NULL);
+       if(devHan == INVALID_HANDLE_VALUE) {
+               if(err_string_buff != NULL)
+                       wstrerror(err_string_buff, "CreateFile");
+               goto exit_err;
+       }
+
+       if(!WinUsb_Initialize(devHan, &winUsbHan)) {
+               if (err_string_buff != NULL)
+                       wstrerror(err_string_buff, "WinUsb_Initialize");
+               goto exit_err;
+       }
+
+       if(!WinUsb_QueryInterfaceSettings(winUsbHan, 0, &ifaceDesc)) {
+               if (err_string_buff != NULL)
+                       wstrerror(err_string_buff, "WinUsb_QueryInterfaceSettings");
+               goto exit_err;
+       }
+
+       han = calloc(1, sizeof(*han));
+       if(han == NULL) {
+               if(err_string_buff != NULL)
+                       strcpy(err_string_buff, _strerror("malloc"));
+               goto exit_err;
+       }
+       han->devHan = devHan;
+       han->winUsbHan = winUsbHan;
+
+       for(i=0; i<ifaceDesc.bNumEndpoints; i++) {
+               if(!WinUsb_QueryPipe(winUsbHan, 0, i, &pipeInfo)) {
+                       if (err_string_buff != NULL)
+                               wstrerror(err_string_buff, "WinUsb_QueryPipe");
+                       if(!ignore_errors)
+                               goto exit_err;
+               }
+               if(verbose) {
+                       fprintf(msgfile, "Found EP 0x%02x : max packet size is %u bytes\n",
+                               pipeInfo.PipeId, pipeInfo.MaximumPacketSize);
+               }
+               if(pipeInfo.PipeType != UsbdPipeTypeBulk)
+                       continue;
+               int ind = USB_ENDPOINT_DIRECTION_IN(pipeInfo.PipeId) ? USB_DIR_IN : USB_DIR_OUT;
+               han->eps[ind].ep = pipeInfo.PipeId;
+               han->eps[ind].sz = pipeInfo.MaximumPacketSize;
+               han->eps[ind].last_timeout = 0;
+       }
+       if(ep)
+               *ep = han->eps[USB_DIR_OUT].ep;
+
+       if(han->eps[USB_DIR_IN].ep == 0) {
+               if(err_string_buff != NULL)
+                       sprintf(err_string_buff, "Unable to find BULK IN endpoint\n");
+               goto exit_err;
+       }
+       if(han->eps[USB_DIR_OUT].ep == 0) {
+               if(err_string_buff != NULL)
+                       sprintf(err_string_buff, "Unable to find BULK OUT endpoint\n");
+               goto exit_err;
+       }
+       if(han->eps[USB_DIR_IN].sz == 0) {
+               if(err_string_buff != NULL)
+                       sprintf(err_string_buff, "Unable to find BULK IN endpoint size\n");
+               goto exit_err;
+       }
+       if(han->eps[USB_DIR_OUT].sz == 0) {
+               if(err_string_buff != NULL)
+                       sprintf(err_string_buff, "Unable to find BULK OUT endpoint size\n");
+               goto exit_err;
+       }
+       return han;
+exit_err:
+       usb_close_device(han);
+       return USB_HAN_NONE;
+}
+
+uint8_t usb_get_bulk_endpoint(usb_hwnd han, int dir) {
+       if((han == NULL) || ((dir != USB_DIR_OUT) && (dir != USB_DIR_IN)))
+               return 0;
+       return han->eps[dir].ep;
+}
+
+size_t usb_get_endpoint_size(usb_hwnd han, uint8_t ep) {
+       if(han == NULL)
+               return 0;
+       if(han->eps[USB_DIR_OUT].ep == ep)
+               return han->eps[USB_DIR_OUT].sz;
+       if(han->eps[USB_DIR_IN].ep == ep)
+               return han->eps[USB_DIR_IN].sz;
+       return 0;
+}
+
+int usb_bulk_write(usb_hwnd han, uint8_t ep, const void *buffer, size_t sz, uint32_t *wrote_bytes, int timeout_ms) {
+       ULONG wb = 0;
+       if(wrote_bytes != NULL)
+               *wrote_bytes = 0;
+       if(han == NULL)
+               return USB_ERR_INVALID;
+
+       if(timeout_ms != han->eps[USB_DIR_OUT].last_timeout) {
+               han->eps[USB_DIR_OUT].last_timeout = timeout_ms;
+               if(!WinUsb_SetPipePolicy(han->winUsbHan, ep, PIPE_TRANSFER_TIMEOUT,
+                       sizeof(ULONG), &han->eps[USB_DIR_OUT].last_timeout)) {
+                       last_bulk_errcode = GetLastError();
+                       wperror("WinUsb_SetPipePolicy");
+                       return USB_ERR_FAILED;
+               }
+       }
+       if(!WinUsb_WritePipe(han->winUsbHan, ep, (PUCHAR)buffer, (ULONG)sz, &wb, NULL)) {
+               last_bulk_errcode = GetLastError();
+               if(last_bulk_errcode == ERROR_SEM_TIMEOUT)
+                       return USB_ERR_TIMEOUT;
+               wperror("WinUsb_WritePipe");
+               printf("\nWinUsb_WritePipe failed with error:=%d\n", GetLastError());
+               return USB_ERR_FAILED;
+       }
+       last_bulk_errcode = 0;
+       if(wrote_bytes != NULL)
+               *wrote_bytes = wb;
+       return USB_ERR_NONE;
+}
+
+int usb_bulk_read(usb_hwnd han, uint8_t ep, void *buffer, size_t sz, uint32_t *read_bytes, int timeout_ms) {
+       ULONG rb = 0;
+       if(read_bytes != NULL)
+               *read_bytes = 0;
+       if(han == NULL)
+               return USB_ERR_INVALID;
+
+       if(timeout_ms != han->eps[USB_DIR_IN].last_timeout) {
+               han->eps[USB_DIR_IN].last_timeout = timeout_ms;
+               if(!WinUsb_SetPipePolicy(han->winUsbHan, ep, PIPE_TRANSFER_TIMEOUT,
+                       sizeof(ULONG), &han->eps[USB_DIR_IN].last_timeout)) {
+                       last_bulk_errcode = GetLastError();
+                       wperror("WinUsb_SetPipePolicy");
+                       return USB_ERR_FAILED;
+               }
+       }
+       if(sz == 0)
+               return USB_ERR_NONE;
+       if(!WinUsb_ReadPipe(han->winUsbHan, ep, buffer, (ULONG)sz, &rb, NULL)) {
+               last_bulk_errcode = GetLastError();
+               if(last_bulk_errcode == ERROR_SEM_TIMEOUT)
+                       return USB_ERR_TIMEOUT;
+               wperror("WinUsb_ReadPipe");
+               return USB_ERR_FAILED;
+       }
+       last_bulk_errcode = 0;
+       if(read_bytes != NULL)
+               *read_bytes = rb;
+       return USB_ERR_NONE;
+}
+
+void usb_free_device(usb_dev dev) {
+       if(dev != NULL)
+               free(dev);
+}
+
+void usb_close_device(usb_hwnd han) {
+       if(han == NULL)
+               return;
+       WinUsb_Free(han->winUsbHan);
+       CloseHandle(han->devHan);
+       free(han);
+}
+
+const char *usb_last_bulk_errmsg(void) {
+       return format_win32_msg(last_bulk_errcode);
+}
+
+void usb_set_msgfile(FILE *file) {
+       msgfile = file;
+}
+
+void usb_set_verbose(int value) {
+       verbose = value;
+}
+
+void usb_set_ignoreerrors(int value) {
+       ignore_errors = value;
+}
diff --git a/inference-engine/thirdparty/movidius/USB_WIN/usb_winusb.h b/inference-engine/thirdparty/movidius/USB_WIN/usb_winusb.h
new file mode 100644
index 0000000..ae5adc3
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+* Copyright 2017-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+// Common USB API
+
+#ifndef _USB_COMMON_H
+#define _USB_COMMON_H
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#ifndef _USB_SOURCE
+typedef void *usb_dev;
+typedef void *usb_han;
+#endif
+
+enum
+{
+       LIBUSB_ERROR_NO_DEVICE = -4,
+       LIBUSB_ERROR_TIMEOUT = -7
+};
+
+
+typedef void libusb_device;
+typedef struct _usb_han *usb_hwnd;
+typedef struct _usb_han libusb_device_handle;
+extern int usb_init(void);
+extern void usb_shutdown(void);
+
+extern int usb_can_find_by_guid(void);
+extern int usb_list_devices(uint16_t vid, uint16_t pid, uint8_t dev_des[][2 + 2 + 4 * 7 + 7]);
+extern void *  enumerate_usb_device(uint16_t vid, uint16_t pid, const char *addr, int loud);
+extern void *  usb_find_device_by_guid(int loud);
+extern int usb_check_connected(usb_dev dev);
+extern void * usb_open_device(usb_dev dev, uint8_t *ep, uint8_t intfaceno, char *err_string_buff);
+extern uint8_t usb_get_bulk_endpoint(usb_han han, int dir);
+extern size_t usb_get_endpoint_size(usb_han han, uint8_t ep);
+extern int usb_bulk_write(usb_han han, uint8_t ep, const void *buffer, size_t sz, uint32_t *wrote_bytes, int timeout_ms);
+extern int usb_bulk_read(usb_han han, uint8_t ep, void *buffer, size_t sz, uint32_t *read_bytes, int timeout_ms);
+extern void usb_free_device(usb_dev dev);
+extern void usb_close_device(usb_han han);
+
+extern const char *usb_last_bulk_errmsg(void);
+extern void usb_set_msgfile(FILE *file);
+extern void usb_set_verbose(int value);
+extern void usb_set_ignoreerrors(int value);
+
+extern const char* libusb_strerror(int x);
+
+#endif  // _USB_COMMON_H
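For orientation, a minimal caller of the API above might look like the following sketch. This is hypothetical host code, not part of this commit: it assumes a booted device is attached and that USB_DEV_NONE and USB_HAN_NONE (defined on the implementation side) are NULL.

    #include <stdio.h>
    #include "usb_winusb.h"

    int send_hello(void)
    {
        char errbuf[128] = "";
        uint8_t ep_out = 0;
        uint32_t wrote = 0;
        const char msg[] = "hello";

        usb_init();
        usb_dev dev = usb_find_device_by_guid(1);      /* tries Myriad 2, then Myriad X */
        if (dev == NULL)                               /* USB_DEV_NONE */
            return -1;

        usb_han han = usb_open_device(dev, &ep_out, 0, errbuf);
        if (han == NULL) {                             /* USB_HAN_NONE */
            fprintf(stderr, "open failed: %s", errbuf);
            usb_free_device(dev);
            usb_shutdown();
            return -1;
        }

        usb_bulk_write(han, ep_out, msg, sizeof(msg), &wrote, 2000);
        printf("wrote %u of %u bytes\n", wrote, (unsigned)sizeof(msg));

        usb_close_device(han);
        usb_free_device(dev);
        usb_shutdown();
        return 0;
    }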
diff --git a/inference-engine/thirdparty/movidius/WinPthread/win_pthread.c b/inference-engine/thirdparty/movidius/WinPthread/win_pthread.c
new file mode 100644
index 0000000..94533b4
--- /dev/null
@@ -0,0 +1,177 @@
+/*
+* Copyright 2018-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+#include "win_pthread.h"
+
+
+//Mutex implementation
+int pthread_mutex_lock(pthread_mutex_t *mutex)
+{
+    EnterCriticalSection(mutex);
+    return 0;
+}
+
+int pthread_mutex_unlock(pthread_mutex_t *mutex)
+{
+    LeaveCriticalSection(mutex);
+    return 0;
+}
+
+int pthread_mutex_init(pthread_mutex_t *mutex,
+                    pthread_mutexattr_t *attr)
+{
+    InitializeCriticalSection(mutex);
+
+    return 0;
+}
+
+int pthread_mutex_destroy(pthread_mutex_t *mutex)
+{
+    DeleteCriticalSection(mutex);
+    return 0;
+}
+
+//State implementation
+unsigned _pthread_get_state(pthread_attr_t *attr, unsigned flag)
+{
+    return attr->pthread_state & flag;
+}
+
+int _pthread_set_state(pthread_attr_t *attr, unsigned flag, unsigned val)
+{
+    if (~flag & val) return EINVAL;
+    attr->pthread_state &= ~flag;
+    attr->pthread_state |= val;
+
+    return 0;
+}
+
+//Attribute implementation
+int pthread_attr_init(pthread_attr_t *attr)
+{
+    attr->pthread_state = PTHREAD_CANCEL_ENABLE;
+    attr->stack = NULL;
+    attr->stack_size = 0;
+    return 0;
+}
+
+int pthread_attr_destroy(pthread_attr_t *attr)
+{
+    return 0;
+}
+
+int pthread_attr_setinheritsched(pthread_attr_t *attr, int flag)
+{
+    return _pthread_set_state(attr, PTHREAD_INHERIT_SCHED, flag);
+}
+
+int pthread_attr_getinheritsched(pthread_attr_t *attr, int *flag)
+{
+    *flag = _pthread_get_state(attr, PTHREAD_INHERIT_SCHED);
+    return 0;
+}
+
+#define pthread_attr_getschedpolicy(ATTR, POLICY) ENOTSUP
+#define pthread_attr_setschedpolicy(ATTR, POLICY) ENOTSUP
+
+
+//Pthread creation
+unsigned int __stdcall _pthread_start_routine(void *args)
+{
+    pthread_t *thread = args;
+    thread->tid = GetCurrentThreadId();
+    thread->arg = thread->start_routine(thread->arg);
+    return 0;
+}
+
+int pthread_create(pthread_t *thread, pthread_attr_t *attr,
+                    void *(*start_routine)(void *), void *arg)
+{
+    unsigned stack_size = 0;
+
+    /* Save data in pthread_t */
+    thread->arg = arg;
+    thread->start_routine = start_routine;
+    thread->pthread_state = PTHREAD_CANCEL_ENABLE;
+    thread->handle = (HANDLE)-1;
+    _ReadWriteBarrier();
+
+    if (attr)
+    {
+        thread->pthread_state = attr->pthread_state;
+        stack_size = attr->stack_size;
+    }
+
+    thread->handle = (HANDLE)_beginthreadex((void *)NULL, stack_size, _pthread_start_routine, thread, 0, NULL);
+
+    /* Failed */
+    if (!thread->handle)
+        return 1;
+
+    return 0;
+}
+
+int pthread_detach(pthread_t thread)
+{
+    CloseHandle(thread.handle);
+    _ReadWriteBarrier();
+    thread.handle = 0;
+
+    return 0;
+}
+
+int _pthread_join(pthread_t *thread, void **res)
+{
+    DWORD result = WaitForSingleObject(thread->handle, INFINITE);
+
+    switch (result) {
+    case WAIT_OBJECT_0:
+        if (res)
+            *res = thread->arg;
+        /* the thread has finished; release its handle */
+        CloseHandle(thread->handle);
+        return 0;
+    case WAIT_ABANDONED:
+        return EINVAL;
+    default:
+        return 1;
+    }
+}
+
+pthread_t pthread_self(void)
+{
+    pthread_t t = { 0 };
+
+    t.tid = GetCurrentThreadId();
+    t.arg = NULL;
+    t.start_routine = NULL;
+    t.pthread_state = PTHREAD_CANCEL_ENABLE;
+    t.handle = GetCurrentThread();
+
+    return t;
+}
+
+void pthread_exit(void *res)
+{
+    if(res)
+    {
+        _endthreadex(*(int *)res);
+    }
+    else
+        _endthreadex(0);
+}
+
diff --git a/inference-engine/thirdparty/movidius/WinPthread/win_pthread.h b/inference-engine/thirdparty/movidius/WinPthread/win_pthread.h
new file mode 100644
index 0000000..03a8d05
--- /dev/null
@@ -0,0 +1,109 @@
+/*
+* Copyright 2018-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+#ifndef WIN_PTHREADS
+#define WIN_PTHREADS
+
+#include <windows.h>
+#include <setjmp.h>
+#include <errno.h>
+#include <sys/timeb.h>
+#include <process.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef ETIMEDOUT
+#define ETIMEDOUT   110
+#define ENOTSUP     134
+#endif
+
+//State related definition
+#define PTHREAD_CANCEL_DISABLE 0
+#define PTHREAD_CANCEL_ENABLE 0x01
+#define PTHREAD_CREATE_DETACHED 0x04
+#define PTHREAD_INHERIT_SCHED 0x08
+
+
+//Mutex related definition
+
+#define PTHREAD_MUTEX_INITIALIZER {(RTL_CRITICAL_SECTION_DEBUG*)-1,-1,0,0,0,0}
+
+
+#if (_MSC_VER == 1800)
+struct timespec
+{
+    /* long long in windows is the same as long in unix for 64bit */
+    long long tv_sec;
+    long long tv_nsec;
+};
+#elif (_MSC_VER >= 1800)
+#include "time.h"
+#endif
+
+typedef struct
+{
+    HANDLE handle;
+    void *arg;
+    void *(*start_routine)(void *);
+    DWORD tid;
+    unsigned pthread_state;
+}pthread_t;
+
+typedef struct
+{
+    unsigned pthread_state;
+    void *stack;
+    size_t stack_size;
+}pthread_attr_t;
+
+
+typedef unsigned pthread_mutexattr_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
+
+int pthread_mutex_lock(pthread_mutex_t *mutex);
+int pthread_mutex_unlock(pthread_mutex_t *mutex);
+int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *attr);
+int pthread_mutex_destroy(pthread_mutex_t *mutex);
+
+
+unsigned _pthread_get_state(pthread_attr_t *attr, unsigned flag);
+int _pthread_set_state(pthread_attr_t *attr, unsigned flag, unsigned val);
+
+int pthread_attr_init(pthread_attr_t *attr);
+int pthread_attr_setinheritsched(pthread_attr_t *attr, int flag);
+int pthread_attr_getinheritsched(pthread_attr_t *attr, int *flag);
+int pthread_attr_destroy(pthread_attr_t *attr);
+
+#define pthread_equal(t1, t2) ((t1).tid == (t2).tid)
+unsigned int __stdcall _pthread_start_routine(void *args);
+int pthread_create(pthread_t *thread, pthread_attr_t *attr, void *(*func)(void *), void *arg);
+#define pthread_join(a, b) _pthread_join(&(a), (b))
+int _pthread_join(pthread_t *thread, void **res);
+pthread_t pthread_self(void);
+void pthread_exit(void *res);
+
+int pthread_detach(pthread_t thread);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* WIN_PTHREADS */
\ No newline at end of file
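As a usage illustration for the shim as a whole (hypothetical code, not part of this commit): the subset implemented here is enough for the classic create/join pattern, with the worker's return value surfaced through the join result pointer.

    #include <stdio.h>
    #include "win_pthread.h"

    static void *worker(void *arg)
    {
        *(int *)arg *= 2;     /* do some work */
        return arg;           /* stored by _pthread_start_routine, read back on join */
    }

    int main(void)
    {
        pthread_t t;
        int value = 21;
        void *res = NULL;

        if (pthread_create(&t, NULL, worker, &value) != 0)
            return 1;
        pthread_join(t, &res);                 /* expands to _pthread_join(&t, &res) */
        printf("result: %d\n", *(int *)res);   /* prints 42 */
        return 0;
    }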
diff --git a/inference-engine/thirdparty/movidius/WinPthread/win_semaphore.c b/inference-engine/thirdparty/movidius/WinPthread/win_semaphore.c
new file mode 100644
index 0000000..ded50fd
--- /dev/null
@@ -0,0 +1,132 @@
+/*
+* Copyright 2017-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+// Lightweight semaphore wrapper
+
+#include "win_semaphore.h"
+#include "gettime.h"
+
+
+static int ls_set_errno(int result){
+       if (result != 0) {
+               errno = result;
+               return -1;
+       }
+       return 0;
+}
+
+
+//Create a semaphore.
+int sem_init(sem_t *sem, int pshared, unsigned int value){
+       sem_t s = NULL;
+
+       if (sem == NULL || value > (unsigned int) SEM_VALUE_MAX){
+               return ls_set_errno(EINVAL);
+       }
+
+       if (NULL == (s = (sem_t)calloc(1, sizeof(*s)))){
+               return ls_set_errno(ENOMEM);
+       }
+
+       if (pshared != 0){
+           free(s);
+               //share between processes
+               return ls_set_errno(EPERM);
+       }
+
+       if ((s->handle = CreateSemaphoreA(NULL, value, SEM_VALUE_MAX, NULL)) == NULL){
+               free(s);
+               return ls_set_errno(ENOSPC);
+       }
+
+       *sem = s;
+       return 0;
+}
+
+
+//Wait for a semaphore
+int sem_wait(sem_t *sem){
+       if (sem == NULL || *sem == NULL) {
+               return ls_set_errno(EINVAL);
+       }
+
+       sem_t s = *sem;
+       if (WaitForSingleObject(s->handle, INFINITE) != WAIT_OBJECT_0){
+               return ls_set_errno(EINVAL);
+       }
+
+       return 0;
+}
+
+
+//Wait for a semaphore until the absolute CLOCK_REALTIME deadline *ts passes
+int sem_timedwait(sem_t *sem, const struct timespec *ts) {
+       if (sem == NULL || *sem == NULL) {
+               return ls_set_errno(EINVAL);
+       }
+
+       sem_t s = *sem;
+
+       struct timespec cts;
+       if (clock_gettime(CLOCK_REALTIME, &cts) == -1) {
+               return ls_set_errno(EINVAL);
+       }
+
+       long long t = (ts->tv_sec - cts.tv_sec) * 1000;
+       t += (ts->tv_nsec - cts.tv_nsec) / 1000000;
+       if (t < 0)
+               t = 0;  /* deadline already passed: poll once */
+
+       if (WaitForSingleObject(s->handle, (DWORD)t) != WAIT_OBJECT_0) {
+               return ls_set_errno(EINVAL);
+       }
+
+       return 0;
+}
+
+
+//Release a semaphore
+int sem_post(sem_t *sem){
+       if (sem == NULL || *sem == NULL){
+               return ls_set_errno(EINVAL);
+       }
+
+       sem_t s = *sem;
+       if (ReleaseSemaphore(s->handle, 1, NULL) == 0){
+               return ls_set_errno(EINVAL);
+       }
+
+       return 0;
+}
+
+
+//Destroy a semaphore
+int sem_destroy(sem_t *sem){
+       if (sem == NULL || *sem == NULL){
+               return ls_set_errno(EINVAL);
+       }
+
+       sem_t s = *sem;
+       if (CloseHandle(s->handle) == 0){
+               return ls_set_errno(EINVAL);
+       }
+
+       free(s);
+       *sem = NULL;
+       return 0;
+}
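For illustration (hypothetical code, not part of this commit): sem_timedwait above takes an absolute CLOCK_REALTIME deadline and converts it to a relative millisecond wait, so a caller builds the deadline the usual POSIX way. clock_gettime is assumed to come from the bundled gettime.h, as in this file.

    int wait_up_to_two_seconds(sem_t *sem)
    {
        struct timespec deadline;
        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_sec += 2;                    /* absolute deadline: now + 2s */

        if (sem_timedwait(sem, &deadline) != 0)  /* errno set to EINVAL on failure */
            return -1;
        return 0;
    }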
diff --git a/inference-engine/thirdparty/movidius/WinPthread/win_semaphore.h b/inference-engine/thirdparty/movidius/WinPthread/win_semaphore.h
new file mode 100644
index 0000000..4c851dc
--- /dev/null
@@ -0,0 +1,77 @@
+/*
+* Copyright 2017-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+
+#ifndef _SEMAPHORE_H_
+#define _SEMAPHORE_H_ 
+
+
+#include <errno.h>
+#include <fcntl.h>
+#include <windows.h>
+#include <stdio.h>
+
+
+#if !defined(malloc)
+#include <malloc.h>
+#endif
+#if !defined(INT_MAX)
+#include <limits.h>
+#endif
+
+
+#ifndef SEM_VALUE_MAX
+#define SEM_VALUE_MAX           INT_MAX
+#endif
+
+
+#if (_MSC_VER == 1800)
+struct timespec
+{
+       /* long long in windows is the same as long in unix for 64bit */
+       long long tv_sec;
+       long long tv_nsec;
+};
+#elif (_MSC_VER >= 1800)
+#include "time.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct sem_t_
+{
+    HANDLE handle;
+};
+typedef struct sem_t_ * sem_t;
+
+
+int sem_init(sem_t *sem, int pshared, unsigned int value);
+int sem_wait(sem_t *sem);
+int sem_timedwait(sem_t *sem, const struct timespec *ts);
+int sem_post(sem_t *sem);
+int sem_destroy(sem_t *sem);
+
+
+#ifdef __cplusplus
+       }
+#endif
+
+
+#endif /* _SEMAPHORE_H_ */
diff --git a/inference-engine/thirdparty/movidius/XLink/CMakeLists.txt b/inference-engine/thirdparty/movidius/XLink/CMakeLists.txt
new file mode 100644
index 0000000..faac30a
--- /dev/null
@@ -0,0 +1,74 @@
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set(TARGET_NAME "XLink")
+
+if(NOT WIN32)
+    find_package(Threads REQUIRED)
+
+    find_path(LIBUSB_INCLUDE_DIR NAMES libusb.h PATH_SUFFIXES "include" "libusb" "libusb-1.0")
+    find_library(LIBUSB_LIBRARY NAMES usb-1.0 PATH_SUFFIXES "lib")
+
+    if(NOT LIBUSB_INCLUDE_DIR OR NOT LIBUSB_LIBRARY)
+        message(FATAL_ERROR "libusb is required")
+    endif()
+endif()
+
+file(GLOB_RECURSE SOURCES *.c *.h)
+
+# FIXME: WIN_PTHREAD also should be built as a library
+if(WIN32)
+    file(GLOB USB_WIN_SOURCES "../USB_WIN/*")
+    file(GLOB WIN_PTHREAD_SOURCES "../WinPthread/*")
+    list(APPEND SOURCES ${USB_WIN_SOURCES} ${WIN_PTHREAD_SOURCES})
+endif()
+
+add_library(${TARGET_NAME} STATIC ${SOURCES})
+
+if(WIN32)
+    target_include_directories(${TARGET_NAME}
+            PRIVATE
+            "../USB_WIN"
+            "../WinPthread")
+else()
+    target_include_directories(${TARGET_NAME}
+            PRIVATE
+            "${LIBUSB_INCLUDE_DIR}")
+endif()
+
+target_include_directories(${TARGET_NAME}
+        PUBLIC
+            "shared"
+            "../shared/include"
+            "pc")
+
+if(NOT WIN32)
+    target_link_libraries(${TARGET_NAME}
+            PUBLIC
+                Threads::Threads
+                ${LIBUSB_LIBRARY})
+endif()
+
+target_compile_definitions(${TARGET_NAME}
+        PRIVATE
+            __PC__
+            HAVE_STRUCT_TIMESPEC
+            _CRT_SECURE_NO_WARNINGS
+        )
+
+if (ENABLE_MYX_PCIE)
+    target_compile_definitions(${TARGET_NAME}
+        PRIVATE
+            USE_PCIE)
+elseif (ENABLE_MYRIAD_NO_BOOT)
+    target_compile_definitions(${TARGET_NAME}
+        PRIVATE
+            NO_BOOT
+            USE_USB_VSC)
+else()
+    target_compile_definitions(${TARGET_NAME}
+        PRIVATE
+            USE_USB_VSC)
+endif()
+
diff --git a/inference-engine/thirdparty/movidius/XLink/pc/XLinkPlatform.c b/inference-engine/thirdparty/movidius/XLink/pc/XLinkPlatform.c
new file mode 100644
index 0000000..d74118d
--- /dev/null
@@ -0,0 +1,736 @@
+/*
+* Copyright 2017-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/timeb.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "XLinkPlatform.h"
+#include "usb_boot.h"
+#include "pcie_host.h"
+
+#if (defined(_WIN32) || defined(_WIN64))
+#include "usb_winusb.h"
+#include "gettime.h"
+#include "win_pthread.h"
+extern void initialize_usb_boot();
+#else
+#include <unistd.h>
+#include <sys/wait.h>
+#include <sys/un.h>
+#include <sys/ioctl.h>
+#include <pthread.h>
+#include <termios.h>
+#include <libusb.h>
+#endif  /*defined(_WIN32) || defined(_WIN64)*/
+
+#ifdef USE_LINK_JTAG
+#include <sys/types.h>          /* See NOTES */
+#include <sys/socket.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#endif  /*USE_LINK_JTAG*/
+
+#define USBLINK_ERROR_PRINT
+
+#ifdef USBLINK_ERROR_PRINT
+#define USBLINK_ERROR(...) printf(__VA_ARGS__)
+#else
+#define USBLINK_ERROR(...) (void)0
+#endif  /*USBLINK_ERROR_PRINT*/
+
+#ifdef USBLINKDEBUG
+#define USBLINK_PRINT(...) printf(__VA_ARGS__)
+#else
+#define USBLINK_PRINT(...) (void)0
+#endif  /*USBLINKDEBUG*/
+
+#ifdef USBLINKWARN
+#define USBLINK_WARN(...) printf(__VA_ARGS__)
+#else
+#define USBLINK_WARN(...) (void)0
+#endif  /*USBLINKWARN*/
+
+#define USB_LINK_SOCKET_PORT 5678
+#define MAX_EVENTS 64
+#define USB_ENDPOINT_IN 0x81
+#define USB_ENDPOINT_OUT 0x01
+
+#ifndef USE_USB_VSC
+int usbFdWrite = -1;
+int usbFdRead = -1;
+#endif  /*USE_USB_VSC*/
+
+static int statuswaittimeout = 5;
+
+pthread_t readerThreadId;
+
+// Communication protocol in use
+// A few functions can be called before XLink is initialized
+#ifdef USE_PCIE
+int gl_protocol = PCIE;
+#else   // use USB as default
+int gl_protocol = USB_VSC;
+#endif
+
+
+static xLinkPlatformErrorCode_t parseUsbBootError(usbBootError_t rc) {
+    switch (rc) {
+        case USB_BOOT_SUCCESS:
+            return X_LINK_PLATFORM_SUCCESS;
+        case USB_BOOT_DEVICE_NOT_FOUND:
+            return X_LINK_PLATFORM_DEVICE_NOT_FOUND;
+        case USB_BOOT_TIMEOUT:
+            return X_LINK_PLATFORM_TIMEOUT;
+        default:
+            return X_LINK_PLATFORM_ERROR;
+    }
+}
+
+static int usb_write(libusb_device_handle *f, const void *data, size_t size, unsigned int timeout)
+{
+    while(size > 0)
+    {
+        int bt, ss = size;
+        if(ss > 1024*1024*5)
+            ss = 1024*1024*5;
+#if (defined(_WIN32) || defined(_WIN64) )
+        int rc = usb_bulk_write(f, USB_ENDPOINT_OUT, (unsigned char *)data, ss, &bt, timeout);
+#else
+        int rc = libusb_bulk_transfer(f, USB_ENDPOINT_OUT, (unsigned char *)data, ss, &bt, timeout);
+#endif
+        if(rc)
+            return rc;
+        data = (char *)data + bt;
+        size -= bt;
+    }
+    return 0;
+}
+
+
+static int usb_read(libusb_device_handle *f, void *data, size_t size, unsigned int timeout)
+{
+    while(size > 0)
+    {
+        int bt, ss = size;
+        if(ss > 1024*1024*5)
+            ss = 1024*1024*5;
+#if (defined(_WIN32) || defined(_WIN64))
+        int rc = usb_bulk_read(f, USB_ENDPOINT_IN, (unsigned char *)data, ss, &bt, timeout);
+#else
+        int rc = libusb_bulk_transfer(f, USB_ENDPOINT_IN,(unsigned char *)data, ss, &bt, timeout);
+#endif
+        if(rc)
+            return rc;
+        data = ((char *)data) + bt;
+        size -= bt;
+    }
+    return 0;
+}
+
+
+static double seconds()
+{
+    static double s;
+    struct timespec ts;
+
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    if(!s)
+        s = ts.tv_sec + ts.tv_nsec * 1e-9;
+    return ts.tv_sec + ts.tv_nsec * 1e-9 - s;
+}
+
+libusb_device_handle *usblink_open(const char *path)
+{
+    if (path == NULL) {
+        return 0;
+    }
+
+    usbBootError_t rc = USB_BOOT_DEVICE_NOT_FOUND;
+    libusb_device_handle *h = NULL;
+    libusb_device *dev = NULL;
+    double waittm = seconds() + statuswaittimeout;
+    while(seconds() < waittm){
+        int size = strlen(path);
+
+#if (!defined(_WIN32) && !defined(_WIN64))
+        uint16_t  bcdusb = -1;
+        rc = usb_find_device_with_bcd(0, (char *)path, size, (void **)&dev, DEFAULT_OPENVID, DEFAULT_OPENPID,&bcdusb);
+#else
+        rc = usb_find_device(0, (char *)path, size, (void **)&dev, DEFAULT_OPENVID, DEFAULT_OPENPID);
+#endif
+        if(rc == USB_BOOT_SUCCESS)
+            break;
+        usleep(1000);
+    }
+    if (rc == USB_BOOT_TIMEOUT || rc == USB_BOOT_DEVICE_NOT_FOUND) // Timed out or no device found
+        return 0;
+#if (defined(_WIN32) || defined(_WIN64) )
+    char errmsg[128] = "";
+    h = usb_open_device(dev, NULL, 0, errmsg);
+    int libusb_rc = ((h != NULL) ? (0) : (-1));
+    if (libusb_rc < 0)
+    {
+        if (errmsg[0])
+            fprintf(stderr, "usb_open_device: %s", errmsg);
+        usb_free_device(dev);
+        return 0;
+    }
+    usb_free_device(dev);
+#else
+    int libusb_rc = libusb_open(dev, &h);
+    if (libusb_rc < 0)
+    {
+        libusb_unref_device(dev);
+        return 0;
+    }
+    libusb_unref_device(dev);
+    libusb_detach_kernel_driver(h, 0);
+    libusb_rc = libusb_claim_interface(h, 0);
+    if(libusb_rc < 0)
+    {
+        libusb_close(h);
+        return 0;
+    }
+#endif
+    return h;
+}
+
+void usblink_close(libusb_device_handle *f)
+{
+#if (defined(_WIN32) || defined(_WIN64))
+    usb_close_device(f);
+#else
+    libusb_release_interface(f, 0);
+    libusb_close(f);
+#endif
+}
+
+int USBLinkWrite(void* fd, void* data, int size, unsigned int timeout)
+{
+    int rc = 0;
+#ifndef USE_USB_VSC
+    int byteCount = 0;
+#ifdef USE_LINK_JTAG
+    while (byteCount < size){
+        byteCount += write(usbFdWrite, &((char*)data)[byteCount], size - byteCount);
+        printf("write %d %d\n", byteCount, size);
+    }
+#else
+    if(usbFdWrite < 0)
+    {
+        return -1;
+    }
+    while(byteCount < size)
+    {
+       int toWrite = (PACKET_LENGTH && (size - byteCount > PACKET_LENGTH)) \
+                        ? PACKET_LENGTH:size - byteCount;
+       int wc = write(usbFdWrite, ((char*)data) + byteCount, toWrite);
+
+       if ( wc != toWrite)
+       {
+           return -2;
+       }
+
+       byteCount += toWrite;
+       unsigned char acknowledge;
+       int rc;
+       rc = read(usbFdWrite, &acknowledge, sizeof(acknowledge));
+
+       if ( rc < 0)
+       {
+           return -2;
+       }
+
+       if (acknowledge != 0xEF)
+       {
+           return -2;
+       }
+    }
+#endif  /*USE_LINK_JTAG*/
+#else
+    rc = usb_write((libusb_device_handle *) fd, data, size, timeout);
+#endif  /*USE_USB_VSC*/
+    return rc;
+}
+
+int USBLinkRead(void* fd, void* data, int size, unsigned int timeout)
+{
+    // FIXME USE_LINK_JTAG not compiled
+#ifndef USE_PCIE
+    int rc = 0;
+#ifndef USE_USB_VSC
+    int nread =  0;
+#ifdef USE_LINK_JTAG
+    while (nread < size){
+        nread += read(usbFdWrite, &((char*)data)[nread], size - nread);
+        printf("read %d %d\n", nread, size);
+    }
+#else
+    if(usbFdRead < 0)
+    {
+        return -1;
+    }
+
+    while(nread < size)
+    {
+        int toRead = (PACKET_LENGTH && (size - nread > PACKET_LENGTH)) \
+                        ? PACKET_LENGTH : size - nread;
+
+        while(toRead > 0)
+        {
+            rc = read(usbFdRead, &((char*)data)[nread], toRead);
+            if ( rc < 0)
+            {
+                return -2;
+            }
+            toRead -=rc;
+            nread += rc;
+        }
+        unsigned char acknowledge = 0xEF;
+        int wc = write(usbFdRead, &acknowledge, sizeof(acknowledge));
+        if (wc != sizeof(acknowledge))
+        {
+            return -2;
+        }
+    }
+#endif  /*USE_LINK_JTAG*/
+#else
+    rc = usb_read((libusb_device_handle *) fd, data, size, timeout);
+#endif  /*USE_USB_VSC*/
+    return rc;
+#endif  // USE_PCIE
+    return 0;
+}
+
+int USBLinkPlatformResetRemote(void* fd)
+{
+
+#ifndef USE_USB_VSC
+#ifdef USE_LINK_JTAG
+    /*Nothing*/
+#else
+    if (usbFdRead != -1){
+        close(usbFdRead);
+        usbFdRead = -1;
+    }
+    if (usbFdWrite != -1){
+        close(usbFdWrite);
+        usbFdWrite = -1;
+    }
+#endif  /*USE_LINK_JTAG*/
+#else
+    usblink_close((libusb_device_handle *) fd);
+#endif  /*USE_USB_VSC*/
+    return -1;
+}
+
+int UsbLinkPlatformConnect(const char* devPathRead, const char* devPathWrite, void** fd)
+{
+#if (!defined(USE_USB_VSC) && !defined(USE_PCIE))
+#ifdef USE_LINK_JTAG
+    struct sockaddr_in serv_addr;
+    usbFdWrite = socket(AF_INET, SOCK_STREAM, 0);
+    usbFdRead = socket(AF_INET, SOCK_STREAM, 0);
+    assert(usbFdWrite >=0);
+    assert(usbFdRead >=0);
+    memset(&serv_addr, 0, sizeof(serv_addr));
+
+    serv_addr.sin_family = AF_INET;
+    serv_addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+    serv_addr.sin_port = htons(USB_LINK_SOCKET_PORT);
+
+    if (connect(usbFdWrite, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) < 0)
+    {
+        perror("ERROR connecting");
+        exit(1);
+    }
+    printf("this is working\n");
+    return 0;
+
+#else
+    usbFdRead = open(devPathRead, O_RDWR);
+    if(usbFdRead < 0)
+    {
+        return -1;
+    }
+    // set tty to raw mode
+    struct termios  tty;
+    speed_t     spd;
+    int rc;
+    rc = tcgetattr(usbFdRead, &tty);
+    if (rc < 0) {
+        close(usbFdRead);
+        usbFdRead = -1;
+        return -2;
+    }
+
+    spd = B115200;
+    cfsetospeed(&tty, (speed_t)spd);
+    cfsetispeed(&tty, (speed_t)spd);
+    cfmakeraw(&tty);
+
+    rc = tcsetattr(usbFdRead, TCSANOW, &tty);
+    if (rc < 0) {
+        close(usbFdRead);
+        usbFdRead = -1;
+        return -2;
+    }
+
+    usbFdWrite = open(devPathWrite, O_RDWR);
+    if(usbFdWrite < 0)
+    {
+        close(usbFdRead);
+        usbFdRead = -1;
+        return -2;
+    }
+    // set tty to raw mode
+    rc = tcgetattr(usbFdWrite, &tty);
+    if (rc < 0) {
+        close(usbFdRead);
+        close(usbFdWrite);
+        usbFdRead = -1;
+        usbFdWrite = -1;
+        return -2;
+    }
+
+    spd = B115200;
+    cfsetospeed(&tty, (speed_t)spd);
+    cfsetispeed(&tty, (speed_t)spd);
+    cfmakeraw(&tty);
+
+    rc = tcsetattr(usbFdWrite, TCSANOW, &tty);
+    if (rc < 0) {
+        close(usbFdRead);
+        close(usbFdWrite);
+        usbFdRead = -1;
+        usbFdWrite = -1;
+        return -2;
+    }
+    return 0;
+#endif  /*USE_LINK_JTAG*/
+#else
+    *fd = usblink_open(devPathWrite);
+    if (*fd == 0)
+    {
+        /* could fail due to a port name change */
+        return -1;
+    }
+    return 0;
+#endif  /*USE_USB_VSC*/
+}
+
+int UsbLinkPlatformInit(int loglevel)
+{
+    usb_loglevel = loglevel;
+#if (defined(_WIN32) || defined(_WIN64))
+    initialize_usb_boot();
+#endif
+    return 0;
+}
+
+void deallocateData(void* ptr,uint32_t size, uint32_t alignment)
+{
+    if (!ptr)
+        return;
+#if (defined(_WIN32) || defined(_WIN64) )
+    _aligned_free(ptr);
+#else
+    free(ptr);
+#endif
+}
+
+void* allocateData(uint32_t size, uint32_t alignment)
+{
+    void* ret = NULL;
+#if (defined(_WIN32) || defined(_WIN64) )
+    ret = _aligned_malloc(size, alignment);
+#else
+    if (posix_memalign(&ret, alignment, size) != 0) {
+        perror("memalign failed");
+    }
+#endif
+    return ret;
+}
+
+
+/*#################################################################################
+################################### PCIe FUNCTIONS ################################
+##################################################################################*/
+
+static int write_pending = 0;
+static int read_pending = 0;
+
+
+static int pcie_host_write(void *f,
+                           void *data, int size,
+                           unsigned int timeout)
+{
+#define CHUNK_SIZE_BYTES (5ULL * 1024ULL * 1024ULL)
+
+    while (size)
+    {
+        write_pending = 1;
+
+        size_t chunk = size < CHUNK_SIZE_BYTES ? size : CHUNK_SIZE_BYTES;
+        int num_written = pcie_write(f, data, chunk, timeout);
+
+        write_pending = 0;
+
+        if (num_written == -EAGAIN)  {
+            // Let read commands be submitted
+            if (read_pending > 0) {
+                usleep(1000);
+            }
+            continue;
+        }
+
+        if (num_written < 0) {
+            return num_written;
+        }
+
+        data = ((char*) data) + num_written;
+        /* num_written never exceeds size */
+        size -= num_written;
+    }
+
+    return 0;
+#undef CHUNK_SIZE_BYTES
+}
+
+static int pcie_host_read(void *f,
+                          void *data, int size,
+                          unsigned int timeout)
+{
+    while (size)
+    {
+        read_pending = 1;
+
+        int num_read = pcie_read(f, data, size, timeout);
+
+        read_pending = 0;
+
+        if (num_read == -EAGAIN)  {
+            // Let write commands be submitted
+            if (write_pending > 0) {
+                usleep(1000);
+            }
+            continue;
+        }
+
+        if(num_read < 0) {
+            return num_read;
+        }
+
+        data = ((char *)data) + num_read;
+        /* num_read never exceeds size */
+        size -= num_read;
+    }
+
+    return 0;
+}
+
+static int pcie_host_open(const char* devPathRead,
+                          const char* devPathWrite,
+                          void** fd )
+{
+    return pcie_init(devPathWrite, fd);
+}
+
+static int pcie_host_close(void *f)
+{
+    pcie_close(f);
+    return 0;
+}
+
+/*############################### FUNCTION ARRAYS #################################*/
+/*These arrays hold the write/read/open/close operation functions
+specific for each communication protocol.
+Add more functions if adding another protocol*/
+int (*write_fcts[NMB_OF_PROTOCOLS])(void*, void*, int, unsigned int) = \
+                            {USBLinkWrite, USBLinkWrite, pcie_host_write};
+int (*read_fcts[NMB_OF_PROTOCOLS])(void*, void*, int, unsigned int) = \
+                            {USBLinkRead, USBLinkRead, pcie_host_read};
+int (*open_fcts[NMB_OF_PROTOCOLS])(const char*, const char*, void**) = \
+                            {UsbLinkPlatformConnect, UsbLinkPlatformConnect, pcie_host_open};
+int (*close_fcts[NMB_OF_PROTOCOLS])(void*) = \
+                            {USBLinkPlatformResetRemote, USBLinkPlatformResetRemote, pcie_host_close};
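Because these tables are indexed by gl_protocol, wiring in another transport is mechanical: one handler per table, at the index matching the new XLinkProtocol_t value. A hypothetical sketch follows (the spi_host_* names are invented for illustration and do not exist in this commit):

    /* Hypothetical only: none of these functions exist in this commit. */
    int spi_host_open (const char *devPathRead, const char *devPathWrite, void **fd);
    int spi_host_write(void *fd, void *data, int size, unsigned int timeout);
    int spi_host_read (void *fd, void *data, int size, unsigned int timeout);
    int spi_host_close(void *fd);

    /* ...then each table above gains a trailing entry, e.g.
           {USBLinkWrite, USBLinkWrite, pcie_host_write, spi_host_write}
       and NMB_OF_PROTOCOLS grows by one in the shared header. */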
+
+
+/*#################################################################################
+###################################################################################
+###################################### EXTERNAL ###################################
+###################################################################################
+##################################################################################*/
+int XLinkPlatformConnect(const char* devPathRead, const char* devPathWrite, void** fd)
+{
+    return open_fcts[gl_protocol](devPathRead, devPathWrite, fd);
+}
+
+int XLinkWrite(void* fd, void* data, int size, unsigned int timeout)
+{
+    return write_fcts[gl_protocol](fd, data, size, timeout);
+}
+
+int XLinkRead(void* fd, void* data, int size, unsigned int timeout)
+{
+    return read_fcts[gl_protocol](fd, data, size, timeout);
+}
+
+int XLinkPlatformCloseRemote(void *fd)
+{
+    return close_fcts[gl_protocol](fd);
+}
+
+int XLinkPlatformInit(XLinkProtocol_t protocol, int loglevel)
+{
+    gl_protocol = protocol;
+    usb_loglevel = loglevel;
+#if (defined(_WIN32) || defined(_WIN64))
+    initialize_usb_boot();
+#endif
+    return 0;
+}
+
+static int getDeviceName(int index, char* name, int nameSize , int pid)
+{
+    if (index < 0) {
+        fprintf(stderr, "Incorrect index value\n");
+        return X_LINK_PLATFORM_ERROR;
+    }
+    switch (gl_protocol) {
+        case PCIE: {
+            return pcie_find_device_port(index, name, nameSize);
+        }
+        case IPC:
+            fprintf(stderr, "IPC not supported, falling back to USB\n");
+            break;
+        case USB_CDC:
+            fprintf(stderr, "USB_CDC not supported, falling back to USB\n");
+            break;
+        case USB_VSC:
+            /*should have common device(temporary moved to 'default')*/
+        default:
+        {
+            // At the moment there is no situation where a non-standard vid is needed
+            int vid = AUTO_VID;
+
+#if (!defined(_WIN32) && !defined(_WIN64))
+            uint16_t  bcdusb = -1;
+            usbBootError_t rc = usb_find_device_with_bcd(index, name, nameSize, 0, vid, pid, &bcdusb);
+#else
+            usbBootError_t rc = usb_find_device(index, name, nameSize, 0, vid, pid);
+#endif
+            return parseUsbBootError(rc);
+        }
+    }
+    return X_LINK_PLATFORM_SUCCESS;
+}
+
+int XLinkPlatformGetDeviceName(int index, char* name, int nameSize)
+{
+    return getDeviceName(index, name, nameSize, AUTO_PID);
+}
+
+int XLinkPlatformGetDeviceNameExtended(int index, char* name, int nameSize, int pid)
+{
+    return getDeviceName(index, name, nameSize, pid);
+}
+
+int XLinkPlatformBootRemote(const char* deviceName, const char* binaryPath)
+{
+/* Don't try to boot FW if PCIe */
+#ifdef USE_PCIE
+    return 0;
+#else
+    long filesize;
+    FILE *fp;
+    char *tx_buf;
+    char subaddr[28+2];
+    int rc;
+
+#ifndef USE_USB_VSC
+    if (usbFdRead != -1){
+        close(usbFdRead);
+        usbFdRead = -1;
+    }
+    if (usbFdWrite != -1){
+        close(usbFdWrite);
+        usbFdWrite = -1;
+    }
+#endif  /*USE_USB_VSC*/
+
+    // Load the executable
+    fp = fopen(binaryPath, "rb");
+    if(fp == NULL)
+    {
+        if(usb_loglevel)
+            perror(binaryPath);
+        return -7;
+    }
+    fseek(fp, 0, SEEK_END);
+    filesize = ftell(fp);
+    rewind(fp);    
+    if(filesize <= 0 || !(tx_buf = (char*)malloc(filesize)))
+    {
+        if(usb_loglevel)
+            perror("buffer");
+        fclose(fp);
+        return -3;
+    }
+    if(fread(tx_buf, 1, filesize, fp) != filesize)
+    {
+        if(usb_loglevel)
+            perror(binaryPath);
+        fclose(fp);
+        free(tx_buf);
+        return -7;
+    }
+    fclose(fp);
+
+    // This will be the string to search for in /sys/dev/char links
+    int chars_to_write = snprintf(subaddr, 28, "-%s:", deviceName);
+    if(chars_to_write >= 28) {
+        printf("Path to your boot util is too long for the char array here!\n");
+    }
+    // Boot it
+    rc = usb_boot(deviceName, tx_buf, filesize);
+    free(tx_buf);
+    if(rc)
+    {
+        return rc;
+    }
+    if(usb_loglevel > 1)
+        fprintf(stderr, "Boot successful, device address %s\n", deviceName);
+    return 0;
+#endif  // USE_PCIE
+}
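Taken together, a host application would drive this layer roughly as follows. This is a hypothetical sketch, not part of this commit: "myriad.mvcmd" is a placeholder firmware path, error handling is minimal, and re-discovery after boot (the device re-enumerates) is elided.

    int talk_to_first_device(void)
    {
        char name[64] = "";
        char cmd[16] = {0};
        void *fd = NULL;

        XLinkPlatformInit(USB_VSC, 1);
        if (XLinkPlatformGetDeviceName(0, name, sizeof(name)) != X_LINK_PLATFORM_SUCCESS)
            return -1;
        if (XLinkPlatformBootRemote(name, "myriad.mvcmd") != 0)  /* no-op under USE_PCIE */
            return -1;
        /* a real caller re-runs discovery here, then connects */
        if (XLinkPlatformConnect(NULL, name, &fd) != 0)
            return -1;

        XLinkWrite(fd, cmd, sizeof(cmd), 2000);
        XLinkRead(fd, cmd, sizeof(cmd), 2000);
        return XLinkPlatformCloseRemote(fd);
    }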
diff --git a/inference-engine/thirdparty/movidius/XLink/pc/pcie_host.c b/inference-engine/thirdparty/movidius/XLink/pc/pcie_host.c
new file mode 100644
index 0000000..b3114de
--- /dev/null
@@ -0,0 +1,251 @@
+/*
+* Copyright 2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+#include "XLinkPlatform.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include <errno.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <ctype.h>
+
+#if (defined(_WIN32) || defined(_WIN64))
+#include <windows.h>
+#include "gettime.h"
+#include <setupapi.h>
+#include <strsafe.h>
+#include <cfgmgr32.h>
+#include <tchar.h>
+#else
+#include <sys/types.h>
+#include <unistd.h>
+#include <dirent.h>
+#endif
+
+#define MVLOG_UNIT_NAME PCIe
+#include "mvLog.h"
+
+#define PCIE_DEVICE_ID 0x6200
+#define PCIE_VENDOR_ID 0x8086
+
+#if (defined(_WIN32) || defined(_WIN64))
+int pcie_write(HANDLE fd, void * buf, size_t bufSize, int timeout)
+{
+    DWORD bytesWritten;
+    HANDLE dev = fd;
+
+    BOOL ret = WriteFile(dev, buf, (DWORD)bufSize, &bytesWritten, 0);
+
+    if (ret == FALSE)
+        return -errno;
+
+    return bytesWritten;
+}
+#else
+int pcie_write(void *fd, void * buf, size_t bufSize, int timeout)
+{
+    int ret = write(*((int*)fd), buf, bufSize);
+
+    if (ret < 0) {
+        return -errno;
+    }
+    return ret;
+}
+#endif  // (defined(_WIN32) || defined(_WIN64))
+
+#if (defined(_WIN32) || defined(_WIN64))
+int pcie_read(HANDLE fd, void * buf, size_t bufSize, int timeout)
+{
+    DWORD bytesRead;
+    HANDLE dev = fd;
+    BOOL ret = ReadFile(dev, buf, (DWORD)bufSize, &bytesRead, 0);
+
+    if (ret == FALSE) {
+        return -errno;
+    }
+
+    return bytesRead;
+}
+#else
+int pcie_read(void *fd, void *buf, size_t bufSize, int timeout)
+{
+    int ret = read(*((int*)fd), buf, bufSize);
+
+    if (ret < 0) {
+        return -errno;
+    }
+    return ret;
+}
+#endif
+
+
+#if (defined(_WIN32) || defined(_WIN64))
+int pcie_init(const char *slot, HANDLE *fd)
+{
+    HANDLE hDevice = CreateFile(slot,
+        GENERIC_READ | GENERIC_WRITE,
+        FILE_SHARE_READ | FILE_SHARE_WRITE,
+        NULL,
+        OPEN_EXISTING,
+        0,
+        NULL);
+
+    if (hDevice == INVALID_HANDLE_VALUE) {
+        mvLog(MVLOG_ERROR, "Failed to open device. Error %d", GetLastError());
+        return -1;
+    }
+
+    *fd = hDevice;
+
+    return 0;
+}
+#else
+int pcie_init(const char *slot, void **fd)
+{
+    int mx_fd = open(slot, O_RDWR);
+
+    if (mx_fd == -1) {
+        return -1;
+    } else {
+        if (!(*fd)) {
+            *fd = (int *) malloc(sizeof(int));
+        }
+
+        if (!(*fd)) {
+            mvLog(MVLOG_ERROR, "Memory allocation failed");
+            close(mx_fd);
+            return -1;
+        }
+        *((int*)*fd) = mx_fd;
+    }
+
+    return 0;
+}
+#endif
+
+int pcie_close(void *fd)
+{
+#if (defined(_WIN32) || defined(_WIN64))
+    return 0;
+#else
+    if (!fd) {
+        mvLog(MVLOG_ERROR, "Incorrect device filedescriptor");
+        return -1;
+    }
+    int mx_fd = *((int*) fd);
+    close(mx_fd);
+    free(fd);
+    
+    return 0;
+#endif
+}
+
+#if (defined(_WIN32) || defined(_WIN64))
+int pci_count_devices(uint16_t vid, uint16_t pid)
+{
+    int i;
+    int deviceCnt = 0;
+
+    HDEVINFO hDevInfo;
+    SP_DEVINFO_DATA DeviceInfoData;
+    char hwid_buff[256];
+    DeviceInfoData.cbSize = sizeof(DeviceInfoData);
+
+    // List all connected PCI devices
+    hDevInfo = SetupDiGetClassDevs(NULL, TEXT("PCI"), NULL, DIGCF_PRESENT | DIGCF_ALLCLASSES);
+    if (hDevInfo == INVALID_HANDLE_VALUE)
+        return -1;
+
+
+    for (i = 0; SetupDiEnumDeviceInfo(hDevInfo, i, &DeviceInfoData); i++)
+    {
+        if (!SetupDiGetDeviceRegistryPropertyA(hDevInfo, &DeviceInfoData, SPDRP_HARDWAREID, NULL, (PBYTE)hwid_buff, sizeof(hwid_buff), NULL)) {
+            continue;
+        }
+
+        uint16_t venid, devid;
+        if (sscanf_s(hwid_buff, "PCI\\VEN_%hx&DEV_%hx", (int16_t *)&venid, (int16_t *)&devid) != 2) {
+            continue;
+        }
+        if (venid == vid && devid == pid)
+        {
+            deviceCnt++;
+        }
+    }
+    SetupDiDestroyDeviceInfoList(hDevInfo);
+    return deviceCnt;
+}
+#endif  // (defined(_WIN32) || defined(_WIN64))
+
+xLinkPlatformErrorCode_t pcie_find_device_port(int index, char* port_name, int size) {
+#if (defined(_WIN32) || defined(_WIN64))
+    snprintf(port_name, size, "%s%d", "\\\\.\\mxlink", index);
+
+    int num_devices = pci_count_devices(PCIE_VENDOR_ID, PCIE_DEVICE_ID);
+    if (num_devices == 0) {
+        mvLog(MVLOG_WARN, "No PCIe device(s) with Vendor ID: 0x%hX and Device ID: 0x%hX found",
+                PCIE_VENDOR_ID, PCIE_DEVICE_ID);
+        return X_LINK_PLATFORM_DEVICE_NOT_FOUND;
+    }
+
+    if (index >= num_devices) {
+        return X_LINK_PLATFORM_DEVICE_NOT_FOUND;
+    }
+
+    return X_LINK_PLATFORM_SUCCESS;
+
+#else
+    xLinkPlatformErrorCode_t rc = X_LINK_PLATFORM_DEVICE_NOT_FOUND;
+    struct dirent *entry;
+    DIR *dp;
+    if (port_name == NULL)
+        return X_LINK_PLATFORM_ERROR;
+
+    dp = opendir("/sys/class/mxlk/");
+    if (dp == NULL)
+    {
+        mvLog(MVLOG_ERROR, "Unable to find a PCIe device. Make sure the driver is installed correctly.");
+        return X_LINK_PLATFORM_DRIVER_NOT_LOADED;
+    }
+
+    // All entries in this (virtual) directory are generated when the driver
+    // is loaded, and correspond 1:1 to entries in /dev/
+    int device_cnt = 0;
+    while((entry = readdir(dp))) {
+        // Compare the beginning of the name to make sure it is a device name
+        if (strncmp(entry->d_name, "mxlk", 4) == 0)
+        {
+            if (device_cnt == index)
+            {
+                snprintf(port_name, size, "/dev/%s", entry->d_name);
+                rc = X_LINK_PLATFORM_SUCCESS;
+                break;
+            }
+            device_cnt++;
+        }
+    }
+    closedir(dp);
+
+    return rc;
+#endif  // (!defined(_WIN32) && !defined(_WIN64))
+}
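For illustration (hypothetical code, not part of this commit), enumerating every reachable port is just a matter of walking indices until the lookup fails; on Linux this reflects the /sys/class/mxlk entries, on Windows the synthesized \\.\mxlinkN names.

    #include <stdio.h>
    #include "pcie_host.h"

    static void list_pcie_devices(void)
    {
        char port[64];
        int i;
        for (i = 0; pcie_find_device_port(i, port, sizeof(port)) == X_LINK_PLATFORM_SUCCESS; i++)
            printf("PCIe device %d: %s\n", i, port);   /* e.g. "/dev/mxlk0" on Linux */
    }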
diff --git a/inference-engine/thirdparty/movidius/XLink/pc/pcie_host.h b/inference-engine/thirdparty/movidius/XLink/pc/pcie_host.h
new file mode 100644
index 0000000..9562ae0
--- /dev/null
@@ -0,0 +1,30 @@
+/*
+* Copyright 2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+#ifndef PCIE_HOST_H
+#define PCIE_HOST_H
+
+#include "XLinkPlatform.h"
+
+int pcie_init(const char *slot, void **fd);
+int pcie_write(void *fd, void * buf, size_t bufSize, int timeout);
+int pcie_read(void *fd, void *buf, size_t bufSize, int timeout);
+int pcie_close(void *fd);
+xLinkPlatformErrorCode_t pcie_find_device_port(int index, char* port_name, int size);
+
+#endif  // PCIE_HOST_H
diff --git a/inference-engine/thirdparty/movidius/XLink/pc/usb_boot.c b/inference-engine/thirdparty/movidius/XLink/pc/usb_boot.c
new file mode 100644
index 0000000..e857e5d
--- /dev/null
@@ -0,0 +1,673 @@
+/*
+* Copyright 2017-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+// USB utility for use with Myriad2v2 ROM
+// Very heavily modified from Sabre version of usb_boot
+// Copyright(C) 2015 Movidius Ltd.
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <time.h>
+#include <string.h>
+#include <errno.h>
+#include <ctype.h>
+#include <sys/stat.h>
+#if (defined(_WIN32) || defined(_WIN64) )
+#include "usb_winusb.h"
+#include "gettime.h"
+#include "win_pthread.h"
+#else
+#include <unistd.h>
+#include <getopt.h>
+#include <libusb.h>
+#include <pthread.h>
+#endif
+#include "usb_boot.h"
+
+
+#define DEFAULT_VID                0x03E7
+
+#define DEFAULT_WRITE_TIMEOUT      2000
+#define DEFAULT_CONNECT_TIMEOUT    20000
+#define DEFAULT_SEND_FILE_TIMEOUT  10000
+#define DEFAULT_CHUNKSZ            1024*1024
+#define USB1_CHUNKSZ               64
+
+#define OPEN_DEV_ERROR_MESSAGE_LENGTH 128
+
+static unsigned int bulk_chunklen = DEFAULT_CHUNKSZ;
+static int write_timeout = DEFAULT_WRITE_TIMEOUT;
+static int connect_timeout = DEFAULT_CONNECT_TIMEOUT;
+static int initialized;
+
+typedef struct {
+    int pid;
+    char name[10];
+} deviceBootInfo_t;
+
+static deviceBootInfo_t supportedDevices[] = {
+    {
+        .pid = 0x2150,
+        .name = "ma2450"
+    },
+    {
+        .pid = 0x2485,
+        .name = "ma2480"
+    },
+    {
+        //To support the case where the port name changes, or the device is already booted
+        .pid = DEFAULT_OPENPID,
+        .name = ""
+    }
+};
+// for now we'll only use the loglevel for usb boot. can bring it into
+// the rest of usblink later
+// use same levels as mvnc_loglevel for now
+int usb_loglevel;
+#if (defined(_WIN32) || defined(_WIN64) )
+void initialize_usb_boot()
+{
+    if (initialized == 0)
+    {
+        usb_init();
+    }
+    // We sanitize the situation by trying to reset the devices that have been left open
+    initialized = 1;
+}
+#else
+void __attribute__((constructor)) usb_library_load()
+{
+    initialized = !libusb_init(NULL);
+}
+
+void __attribute__((destructor)) usb_library_unload()
+{
+    if(initialized)
+        libusb_exit(NULL);
+}
+#endif
+
+typedef struct timespec highres_time_t;
+
+static inline void highres_gettime(highres_time_t *ptr) {
+    clock_gettime(CLOCK_REALTIME, ptr);
+}
+
+static inline double highres_elapsed_ms(highres_time_t *start, highres_time_t *end) {
+    struct timespec temp;
+    if((end->tv_nsec - start->tv_nsec) < 0) {
+        temp.tv_sec = end->tv_sec - start->tv_sec - 1;
+        temp.tv_nsec = 1000000000 + end->tv_nsec-start->tv_nsec;
+    } else {
+        temp.tv_sec = end->tv_sec - start->tv_sec;
+        temp.tv_nsec = end->tv_nsec - start->tv_nsec;
+    }
+    return (double)(temp.tv_sec * 1000) + (((double)temp.tv_nsec) * 0.000001);
+}
+
+static const char *get_pid_name(int pid)
+{
+    int n = sizeof(supportedDevices)/sizeof(supportedDevices[0]);
+    int i;
+
+    for (i = 0; i < n; i++)
+    {
+        if (supportedDevices[i].pid == pid)
+            return supportedDevices[i].name;
+    }
+
+    if(usb_loglevel)
+        fprintf(stderr, "%s(): Error pid:=%i not supported\n", __func__, pid);
+
+    return NULL;
+}
+
+const char * usb_get_pid_name(int pid)
+{
+    return get_pid_name(pid);
+}
+
+static int get_pid_by_name(const char* name)
+{
+    char* p = strchr(name, '-');
+    if (p == NULL) {
+        fprintf(stderr, "%s(): Error name (%s) not supported\n", __func__, name);
+        return -1;
+    }
+    p++; //advance to point to the name
+    int i;
+    int n = sizeof(supportedDevices)/sizeof(supportedDevices[0]);
+
+    for (i = 0; i < n; i++)
+    {
+        if (strcmp(supportedDevices[i].name, p) == 0)
+            return supportedDevices[i].pid;
+    }
+    return -1;
+}
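+
+/* Example (illustrative): addresses produced by gen_addr() end in "-maXXXX",
+ * so get_pid_by_name("3.1-ma2480") skips past the '-' and matches the
+ * supportedDevices table, returning 0x2485; a name without a '-' yields -1. */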
+
+static int is_pid_supported(int pid)
+{
+    int n = sizeof(supportedDevices)/sizeof(supportedDevices[0]);
+    int i;
+    for (i = 0; i < n; i++) {
+        if (supportedDevices[i].pid == pid)
+            return 1;
+    }
+    return 0;
+}
+
+static int isMyriadDevice(const int idVendor, const int idProduct) {
+    // Device is Myriad and pid supported
+    if (idVendor == DEFAULT_VID && is_pid_supported(idProduct) == 1)
+        return 1;
+    // Device is Myriad and device booted
+    if (idVendor == DEFAULT_OPENVID && idProduct == DEFAULT_OPENPID)
+        return 1;
+    return 0;
+}
+
+static int isBootedMyriadDevice(const int idVendor, const int idProduct) {
+    // Device is Myriad, booted device pid
+    if (idVendor == DEFAULT_VID && idProduct == DEFAULT_OPENPID) {
+        return 1;
+    }
+    return 0;
+}
+static int isNotBootedMyriadDevice(const int idVendor, const int idProduct) {
+    // Device is Myriad, its pid is supported, and it is not a booted device
+    if (idVendor == DEFAULT_VID && is_pid_supported(idProduct) == 1
+            && idProduct != DEFAULT_OPENPID) {
+        return 1;
+    }
+    return 0;
+}
+
+#if (!defined(_WIN32) && !defined(_WIN64) )
+static const char *gen_addr(libusb_device *dev, int pid)
+{
+    static char buff[4 * 7 + 7];    // '255-' x 7 (also gives us nul-terminator for last entry)
+                                    // 7 => to add "-maXXXX"
+    uint8_t pnums[7];
+    int pnum_cnt, i;
+    char *p;
+
+    pnum_cnt = libusb_get_port_numbers(dev, pnums, 7);
+    if (pnum_cnt == LIBUSB_ERROR_OVERFLOW) {
+        // shouldn't happen!
+        strcpy(buff, "<error>");
+        return buff;
+    }
+    p = buff;
+
+    uint8_t bus = libusb_get_bus_number(dev);
+    p += sprintf(p, "%u.", bus);
+
+    for (i = 0; i < pnum_cnt - 1; i++)
+        p += sprintf(p, "%u.", pnums[i]);
+
+    p += sprintf(p, "%u", pnums[i]);
+    const char* dev_name = get_pid_name(pid);
+
+    if (dev_name != NULL) {
+        sprintf(p, "-%s", dev_name);
+    } else {
+        strcpy(buff, "<error>");
+        return buff;
+    }
+
+    return buff;
+}
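+
+/* The generated address has the form "<bus>.<port>[.<port>...]-<name>",
+ * e.g. "3.1.2-ma2450" for a device on bus 3 behind ports 1 and 2. */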
+
+static pthread_mutex_t globalMutex = PTHREAD_MUTEX_INITIALIZER;
+
+/**
+ * @brief Find usb device address
+ * @param addr      Device name (address) that will be returned
+ *
+ * @details
+ * Find any device (device = 0):
+ * <br> 1) Any myriad device:                    vid = AUTO_VID & pid = AUTO_PID
+ * <br> 2) Any not booted myriad device:         vid = AUTO_VID & pid = AUTO_UNBOOTED_PID
+ * <br> 3) Any booted myriad device:             vid = AUTO_VID & pid = DEFAULT_OPENPID
+ * <br> 4) Specific Myriad 2 or Myriad X device: vid = AUTO_VID & pid = DEFAULT_UNBOOTPID_2485 or DEFAULT_UNBOOTPID_2150
+ * <br><br> Find specific device (device != 0):
+ * <br> device arg should be not null, search by addr (name) and return device struct
+ *
+ * @note
+ * The index can be used to iterate through all connected myriad devices and save their names.
+ * It loops only over the suitable devices specified by vid and pid
+ */
+usbBootError_t usb_find_device_with_bcd(unsigned idx, char *addr,
+        unsigned addrsize, void **device, int vid, int pid, uint16_t* bcdusb) {
+    if (pthread_mutex_lock(&globalMutex)) {
+        fprintf(stderr, "Mutex lock failed\n");
+        return USB_BOOT_ERROR;
+    }
+
+    static libusb_device **devs = NULL;
+    libusb_device *dev = NULL;
+    struct libusb_device_descriptor desc;
+    int count = 0;
+    size_t i;
+    int res;
+
+    if (!initialized) {
+        if (usb_loglevel)
+            fprintf(stderr, "Library was not initialized at load time\n");
+        if (pthread_mutex_unlock(&globalMutex)) {
+            fprintf(stderr, "Mutex unlock failed\n");
+        }
+        return USB_BOOT_ERROR;
+    }
+
+    // Update the device list if it is empty or if idx is 0
+    if (!devs || idx == 0) {
+        if (devs) {
+            libusb_free_device_list(devs, 1);
+            devs = 0;
+        }
+        if ((res = libusb_get_device_list(NULL, &devs)) < 0) {
+            if (usb_loglevel)
+                fprintf(stderr, "Unable to get USB device list: %s\n", libusb_strerror(res));
+            if (pthread_mutex_unlock(&globalMutex)) {
+                fprintf(stderr, "Mutex unlock failed\n");
+            }
+            return USB_BOOT_ERROR;
+        }
+    }
+
+    // Loop over all usb devices, increase count only if myriad device
+    i = 0;
+    while ((dev = devs[i++]) != NULL) {
+        if ((res = libusb_get_device_descriptor(dev, &desc)) < 0) {
+            if (usb_loglevel)
+                fprintf(stderr, "Unable to get USB device descriptor: %s\n", libusb_strerror(res));
+            continue;
+        }
+
+        // If the found device has the same pid and vid as the input
+        if ( (desc.idVendor == vid && desc.idProduct == pid)
+                // Any myriad device
+                || (vid == AUTO_VID && pid == AUTO_PID
+                        && isMyriadDevice(desc.idVendor, desc.idProduct))
+                // Any not booted myriad device
+                || (vid == AUTO_VID && (pid == AUTO_UNBOOTED_PID)
+                        && isNotBootedMyriadDevice(desc.idVendor, desc.idProduct))
+                // Any not booted with specific pid
+                || (vid == AUTO_VID && pid == desc.idProduct
+                        && isNotBootedMyriadDevice(desc.idVendor, desc.idProduct))
+                // Any booted device
+                || (vid == AUTO_VID && pid == DEFAULT_OPENPID
+                        && isBootedMyriadDevice(desc.idVendor, desc.idProduct)) )
+        {
+            if (device) {
+                const char *caddr = gen_addr(dev, get_pid_by_name(addr));
+                // If the address is the same as the input
+                if (!strcmp(caddr, addr)) {
+                    if (usb_loglevel > 1) {
+                        fprintf(stderr, "Found Address: %s - VID/PID %04x:%04x\n",
+                         addr, desc.idVendor, desc.idProduct);
+                    }
+                    libusb_ref_device(dev);
+                    libusb_free_device_list(devs, 1);
+                    if (bcdusb)
+                        *bcdusb = desc.bcdUSB;
+                    *device = dev;
+                    devs = 0;
+                    if (pthread_mutex_unlock(&globalMutex)) {
+                        fprintf(stderr, "Mutex unlock failed\n");
+                    }
+                    return USB_BOOT_SUCCESS;
+                }
+            } else if (idx == count) {
+                const char *caddr = gen_addr(dev, desc.idProduct);
+                if (usb_loglevel > 1)
+                    fprintf(stderr, "Device %d Address: %s - VID/PID %04x:%04x\n",
+                            idx, caddr, desc.idVendor, desc.idProduct);
+                strncpy(addr, caddr, addrsize - 1);
+                addr[addrsize - 1] = '\0';
+                if (pthread_mutex_unlock(&globalMutex)) {
+                    fprintf(stderr, "Mutex unlock failed\n");
+                }
+                return USB_BOOT_SUCCESS;
+            }
+            count++;
+        }
+    }
+    libusb_free_device_list(devs, 1);
+    devs = 0;
+    if (pthread_mutex_unlock(&globalMutex)) {
+        fprintf(stderr, "Mutex unlock failed\n");
+    }
+    return USB_BOOT_DEVICE_NOT_FOUND;
+}
+#endif
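+
+/* Usage sketch (hypothetical caller, not part of this file): iterate over all
+ * connected Myriad devices and print their addresses. With device == NULL the
+ * function fills addr for the device at position idx; scanning stops at
+ * USB_BOOT_DEVICE_NOT_FOUND.
+ *
+ *     char name[64];
+ *     unsigned idx;
+ *     for (idx = 0; ; idx++) {
+ *         if (usb_find_device_with_bcd(idx, name, sizeof(name), NULL,
+ *                 AUTO_VID, AUTO_PID, NULL) != USB_BOOT_SUCCESS)
+ *             break;
+ *         printf("device %u: %s\n", idx, name);
+ *     }
+ */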
+
+#if (defined(_WIN32) || defined(_WIN64) )
+usbBootError_t usb_find_device(unsigned idx, char *addr, unsigned addrsize, void **device, int vid, int pid)
+{
+    // TODO: There is no global mutex here, unlike in the Linux version
+    int res;
+    // 2 => vid
+    // 2 => pid
+    // '255-' x 7 (also gives us nul-terminator for last entry)
+    // 7 => to add "-maXXXX"
+    static uint8_t devs[128][2 + 2 + 4 * 7 + 7] = { 0 }; // stores vid, pid and address for each device
+    static int devs_cnt = 0;
+    int count = 0;
+    size_t i;
+
+    if(!initialized)
+    {
+        if(usb_loglevel)
+            fprintf(stderr, "Library was not initialized at load time\n");
+        return USB_BOOT_ERROR;
+    }
+    if (devs_cnt == 0 || idx == 0) {
+        devs_cnt = 0;
+        if (((res = usb_list_devices(vid, pid, &devs)) < 0)) {
+            if (usb_loglevel)
+                fprintf(stderr, "Unable to get USB device list: %s\n", libusb_strerror(res));
+            return USB_BOOT_ERROR;
+        }
+        devs_cnt = res;
+    } else {
+        res = devs_cnt;
+    }
+    i = 0;
+
+    while (res-- > 0) {
+        int idVendor = (int)(devs[res][0] << 8 | devs[res][1]);
+        int idProduct = (devs[res][2] << 8 | devs[res][3]);
+
+        // If the found device has the same pid and vid as the input
+        if ( (idVendor == vid && idProduct == pid)
+                // Any myriad device
+                || (vid == AUTO_VID && pid == AUTO_PID
+                        && isMyriadDevice(idVendor, idProduct))
+                // Any unbooted myriad device
+                || (vid == AUTO_VID && (pid == AUTO_UNBOOTED_PID)
+                        && isNotBootedMyriadDevice(idVendor, idProduct))
+                // Any unbooted device with the same pid
+                || (vid == AUTO_VID && pid == idProduct
+                        && isNotBootedMyriadDevice(idVendor, idProduct))
+                // Any booted device
+                || (vid == AUTO_VID && pid == DEFAULT_OPENPID
+                        && isBootedMyriadDevice(idVendor, idProduct)) )
+        {
+            if (device) {
+                const char *caddr = &devs[res][4];
+                if (strstr(addr, caddr))
+                {
+                    if (usb_loglevel > 1)
+                        fprintf(stderr, "Found Address: %s - VID/PID %04x:%04x\n", caddr, (int)(devs[res][0] << 8 | devs[res][1]), (int)(devs[res][2] << 8 | devs[res][3]));
+                    *device = enumerate_usb_device(vid, pid, caddr, 0);
+                    devs_cnt = 0;
+                    return USB_BOOT_SUCCESS;
+                }
+            }
+            else if (idx == count)
+            {
+                const char *caddr = &devs[res][4];
+                if (usb_loglevel > 1)
+                    fprintf(stderr, "Device %d Address: %s - VID/PID %04x:%04x\n", idx, caddr, (int)(devs[res][0] << 8 | devs[res][1]), (int)(devs[res][2] << 8 | devs[res][3]));
+                strncpy(addr, caddr, addrsize - 1);
+                addr[addrsize - 1] = '\0';
+                return USB_BOOT_SUCCESS;
+            }
+            count++;
+        }
+    }
+    devs_cnt = 0;
+    return USB_BOOT_DEVICE_NOT_FOUND;
+}
+#endif
+
+
+#if (!defined(_WIN32) && !defined(_WIN64) )
+static libusb_device_handle *usb_open_device(libusb_device *dev, uint8_t *endpoint, char *err_string_buff, int err_max_len)
+{
+    struct libusb_config_descriptor *cdesc;
+    const struct libusb_interface_descriptor *ifdesc;
+    libusb_device_handle *h = NULL;
+    int res, i;
+
+    if((res = libusb_open(dev, &h)) < 0)
+    {
+        snprintf(err_string_buff, err_max_len, "cannot open device: %s\n", libusb_strerror(res));
+        return 0;
+    }
+    if((res = libusb_set_configuration(h, 1)) < 0)
+    {
+        snprintf(err_string_buff, err_max_len, "setting config 1 failed: %s\n", libusb_strerror(res));
+        libusb_close(h);
+        return 0;
+    }
+    if((res = libusb_claim_interface(h, 0)) < 0)
+    {
+        snprintf(err_string_buff, err_max_len, "claiming interface 0 failed: %s\n", libusb_strerror(res));
+        libusb_close(h);
+        return 0;
+    }
+    if((res = libusb_get_config_descriptor(dev, 0, &cdesc)) < 0)
+    {
+        snprintf(err_string_buff, err_max_len, "Unable to get USB config descriptor: %s\n", libusb_strerror(res));
+        libusb_close(h);
+        return 0;
+    }
+    ifdesc = cdesc->interface->altsetting;
+    for(i=0; i<ifdesc->bNumEndpoints; i++)
+    {
+        if(usb_loglevel > 1)
+            fprintf(stderr, "Found EP 0x%02x : max packet size is %u bytes\n",
+                ifdesc->endpoint[i].bEndpointAddress, ifdesc->endpoint[i].wMaxPacketSize);
+        if((ifdesc->endpoint[i].bmAttributes & LIBUSB_TRANSFER_TYPE_MASK) != LIBUSB_TRANSFER_TYPE_BULK)
+            continue;
+        if( !(ifdesc->endpoint[i].bEndpointAddress & LIBUSB_ENDPOINT_DIR_MASK) )
+        {
+            *endpoint = ifdesc->endpoint[i].bEndpointAddress;
+            bulk_chunklen = ifdesc->endpoint[i].wMaxPacketSize;
+            libusb_free_config_descriptor(cdesc);
+            return h;
+        }
+    }
+    libusb_free_config_descriptor(cdesc);
+    strcpy(err_string_buff, "Unable to find BULK OUT endpoint\n");
+    libusb_close(h);
+    return 0;
+}
+#endif
+// timeout: -1 = no (infinite) timeout, 0 = must happen immediately
+
+
+#if (!defined(_WIN32) && !defined(_WIN64) )
+static int wait_findopen(const char *device_address, int timeout, libusb_device **dev, libusb_device_handle **devh, uint8_t *endpoint,uint16_t* bcdusb)
+#else
+static int wait_findopen(const char *device_address, int timeout, libusb_device **dev, libusb_device_handle **devh, uint8_t *endpoint)
+#endif
+{
+    int i, rc;
+    char last_open_dev_err[OPEN_DEV_ERROR_MESSAGE_LENGTH];
+    double elapsedTime = 0;
+    highres_time_t t1, t2;
+
+    if (device_address == NULL) {
+        return USB_BOOT_ERROR;
+    }
+
+    usleep(100000);
+    if(usb_loglevel > 1)
+    {
+        if(timeout == -1)
+            fprintf(stderr, "Starting wait for connect, no timeout\n");
+        else if(timeout == 0)
+            fprintf(stderr, "Trying to connect\n");
+        else fprintf(stderr, "Starting wait for connect with %ums timeout\n", timeout);
+    }
+    last_open_dev_err[0] = 0;
+    i = 0;
+    for(;;)
+    {
+        highres_gettime(&t1);
+        int addr_size = strlen(device_address);
+#if (!defined(_WIN32) && !defined(_WIN64) )
+        rc = usb_find_device_with_bcd(0, (char*)device_address, addr_size, (void**)dev,
+            DEFAULT_VID, get_pid_by_name(device_address), bcdusb);
+#else
+        rc = usb_find_device(0, (char *)device_address, addr_size, (void **)dev,
+            DEFAULT_VID, get_pid_by_name(device_address));
+#endif
+        if(rc < 0)
+            return USB_BOOT_ERROR;
+        if(!rc)
+        {
+#if (!defined(_WIN32) && !defined(_WIN64) )
+            *devh = usb_open_device(*dev, endpoint, last_open_dev_err, OPEN_DEV_ERROR_MESSAGE_LENGTH);
+#else
+            *devh = usb_open_device(*dev, endpoint, 0, last_open_dev_err);
+#endif
+            if(*devh != NULL)
+            {
+                if(usb_loglevel > 1)
+                    fprintf(stderr, "Found and opened device\n");
+                return 0;
+            }
+#if (!defined(_WIN32) && !defined(_WIN64) )
+            libusb_unref_device(*dev);
+#endif
+        }
+        highres_gettime(&t2);
+        elapsedTime += highres_elapsed_ms(&t1, &t2);
+
+        // Stop retrying only once the caller's timeout (in ms) has expired;
+        // timeout == -1 means wait forever
+        if(timeout != -1 && elapsedTime > (double)timeout)
+        {
+            if(usb_loglevel)
+            {
+                if(last_open_dev_err[0])
+                    fprintf(stderr, "%s", last_open_dev_err);
+                fprintf(stderr, "error: device not found!\n");
+            }
+            return rc ? USB_BOOT_DEVICE_NOT_FOUND : USB_BOOT_TIMEOUT;
+        }
+        i++;
+        usleep(100000);
+    }
+    return 0;
+}
+
+#if (!defined(_WIN32) && !defined(_WIN64) )
+static int send_file(libusb_device_handle* h, uint8_t endpoint, const uint8_t* tx_buf, unsigned filesize,uint16_t bcdusb)
+#else
+static int send_file(libusb_device_handle *h, uint8_t endpoint, const uint8_t *tx_buf, unsigned filesize)
+#endif
+{
+    const uint8_t *p;
+    int rc;
+    int wb, twb, wbr;
+    double elapsedTime;
+    highres_time_t t1, t2;
+    unsigned int bulk_chunklen = DEFAULT_CHUNKSZ;
+    elapsedTime = 0;
+    twb = 0;
+    p = tx_buf;
+
+#if (!defined(_WIN32) && !defined(_WIN64) )
+    if(bcdusb < 0x200) {
+        bulk_chunklen = USB1_CHUNKSZ;
+    }
+#endif
+    if(usb_loglevel > 1)
+        fprintf(stderr, "Performing bulk write of %u bytes...\n", filesize);
+    while(twb < filesize)
+    {
+        highres_gettime(&t1);
+        wb = filesize - twb;
+        if(wb > bulk_chunklen)
+            wb = bulk_chunklen;
+        wbr = 0;
+#if (!defined(_WIN32) && !defined(_WIN64) )
+        rc = libusb_bulk_transfer(h, endpoint, (void *)p, wb, &wbr, write_timeout);
+#else
+        rc = usb_bulk_write(h, endpoint, (void *)p, wb, &wbr, write_timeout);
+#endif
+        if(rc || (wb != wbr))
+        {
+            if(rc == LIBUSB_ERROR_NO_DEVICE)
+                break;
+            if(usb_loglevel)
+                fprintf(stderr, "bulk write: %s (%d bytes written, %d bytes to write)\n", libusb_strerror(rc), wbr, wb);
+            if(rc == LIBUSB_ERROR_TIMEOUT)
+                return USB_BOOT_TIMEOUT;
+            else return USB_BOOT_ERROR;
+        }
+        highres_gettime(&t2);
+        elapsedTime += highres_elapsed_ms(&t1, &t2);
+        if (elapsedTime > DEFAULT_SEND_FILE_TIMEOUT) {
+            return USB_BOOT_TIMEOUT;
+        }
+        twb += wbr;
+        p += wbr;
+    }
+    if(usb_loglevel > 1)
+    {
+        double MBpS = ((double)filesize / 1048576.) / (elapsedTime * 0.001);
+        fprintf(stderr, "Successfully sent %u bytes of data in %lf ms (%lf MB/s)\n", filesize, elapsedTime, MBpS);
+    }
+    return 0;
+}
+
+int usb_boot(const char *addr, const void *mvcmd, unsigned size)
+{
+    int rc = 0;
+    uint8_t endpoint;
+
+#if (defined(_WIN32) || defined(_WIN64) )
+    void *dev = NULL;
+    struct _usb_han *h;
+
+    rc = wait_findopen(addr, connect_timeout, &dev, &h, &endpoint);
+    if(rc) {
+        usb_close_device(h);
+        usb_free_device(dev);
+        return rc;
+    }
+    rc = send_file(h, endpoint, mvcmd, size);
+    usb_close_device(h);
+    usb_free_device(dev);
+#else
+    libusb_device *dev;
+    libusb_device_handle *h;
+    uint16_t bcdusb = -1;
+
+    rc = wait_findopen(addr, connect_timeout, &dev, &h, &endpoint, &bcdusb);
+
+    if(rc) {
+        return rc;
+    }
+    rc = send_file(h, endpoint, mvcmd, size, bcdusb);
+    libusb_release_interface(h, 0);
+    libusb_close(h);
+    libusb_unref_device(dev);
+#endif
+    return rc;
+}
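+
+/* Usage sketch (hypothetical caller): find any unbooted stick, then boot it
+ * with an mvcmd firmware image already read into memory (fw_data/fw_size are
+ * assumed to be provided by the caller).
+ *
+ *     char addr[64];
+ *     if (usb_find_device_with_bcd(0, addr, sizeof(addr), NULL,
+ *             AUTO_VID, AUTO_UNBOOTED_PID, NULL) == USB_BOOT_SUCCESS)
+ *         usb_boot(addr, fw_data, fw_size);
+ */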
diff --git a/inference-engine/thirdparty/movidius/XLink/pc/usb_boot.h b/inference-engine/thirdparty/movidius/XLink/pc/usb_boot.h
new file mode 100644 (file)
index 0000000..6dd0273
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+* Copyright 2018-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int usb_loglevel;
+
+#define AUTO_VID                    0
+#define AUTO_PID                    0
+#define AUTO_UNBOOTED_PID           -1
+
+#define DEFAULT_OPENVID             0x03E7
+#define DEFAULT_OPENPID             0xf63b      // Once opened in VSC mode, the VID/PID change
+
+#define DEFAULT_UNBOOTVID           0x03E7
+#define DEFAULT_UNBOOTPID_2485      0x2485
+#define DEFAULT_UNBOOTPID_2150      0x2150
+
+
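+// Lifecycle note (summarized from the constants above): an unbooted stick
+// enumerates as 03e7:2150 (ma2450) or 03e7:2485 (ma2480); after usb_boot()
+// uploads firmware, the device re-enumerates with the "booted" VSC pid
+// 0xf63b matched by DEFAULT_OPENPID.
+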
+typedef enum usbBootError {
+    USB_BOOT_SUCCESS = 0,
+    USB_BOOT_ERROR,
+    USB_BOOT_DEVICE_NOT_FOUND,
+    USB_BOOT_TIMEOUT
+} usbBootError_t;
+
+#if (!defined(_WIN32) && !defined(_WIN64))
+usbBootError_t usb_find_device_with_bcd(unsigned idx, char *addr, unsigned addrsize, void **device, int vid, int pid, unsigned short* bcdusb);
+#else
+usbBootError_t usb_find_device(unsigned idx, char *addr, unsigned addrsize, void **device, int vid, int pid);
+#endif
+int usb_boot(const char *addr, const void *mvcmd, unsigned size);
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/inference-engine/thirdparty/movidius/XLink/shared/XLink.c b/inference-engine/thirdparty/movidius/XLink/shared/XLink.c
new file mode 100644 (file)
index 0000000..2f44701
--- /dev/null
@@ -0,0 +1,1499 @@
+/*
+* Copyright 2018-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+///
+/// @file
+///
+/// @brief     XLink API implementation
+///
+
+#include "XLink.h"
+
+#include "stdio.h"
+#include "stdint.h"
+#include "string.h"
+#include "time.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#if (defined(_WIN32) || defined(_WIN64))
+#include "win_pthread.h"
+#include "win_semaphore.h"
+#include "gettime.h"
+#else
+#include <pthread.h>
+#include <semaphore.h>
+#endif
+#include "mvMacros.h"
+#include "XLinkPlatform.h"
+#include "XLinkDispatcher.h"
+#define _XLINK_ENABLE_PRIVATE_INCLUDE_
+#include "XLinkPrivateDefines.h"
+
+#define MVLOG_UNIT_NAME xLink
+#include "mvLog.h"
+
+#define USB_DATA_TIMEOUT 10000
+#define CIRCULAR_INCREMENT(x,maxVal) \
+    { \
+         x++; \
+         if (x == maxVal) \
+             x = 0; \
+    }
+// Avoid problems with unsigned wrap-around: first compare, then assign the new value
+#define CIRCULAR_DECREMENT(x,maxVal) \
+{ \
+    if (x == 0) \
+        x = maxVal; \
+    else \
+        x--; \
+}
+#define EXTRACT_IDS(streamId, linkId) \
+{ \
+    linkId = (streamId >> 24) & 0XFF; \
+    streamId = streamId & 0xFFFFFF; \
+}
+
+#define COMBIN_IDS(streamId, linkid) \
+     streamId = streamId | ((linkid & 0xFF) << 24);
+
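+// Worked example: for streamId 0x000123 on link 5, COMBIN_IDS gives
+// 0x05000123; EXTRACT_IDS recovers streamId 0x000123 and linkId 5.
+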
+#define DEFAULT_TIMEOUT ((unsigned int)-1)
+#define MAX_PATH_LENGTH (255)
+
+static unsigned int glCommonTimeOutMsec = 1000;
+static unsigned int glDeviceOpenTimeOutMsec = 5000;
+static unsigned int glAllocateGraphTimeOutMsec = 12000;
+
+
+XLinkError_t XLinkSetCommonTimeOutMsec(unsigned int msec) {
+    glCommonTimeOutMsec = msec;
+    return X_LINK_SUCCESS;
+}
+
+XLinkError_t XLinkSetDeviceOpenTimeOutMsec(unsigned int msec) {
+    glDeviceOpenTimeOutMsec = msec;
+    return X_LINK_SUCCESS;
+}
+
+XLinkError_t XLinkSetAllocateGraphTimeOutMsec(unsigned int msec) {
+    glAllocateGraphTimeOutMsec = msec;
+    return X_LINK_SUCCESS;
+}
+
+int XLinkWaitSem(sem_t* sem)
+{
+    ASSERT_X_LINK_R(sem != NULL, -1);
+
+    if (glCommonTimeOutMsec == 0)
+    {
+        return sem_wait(sem);
+    }
+    else
+    {
+        struct timespec ts;
+        uint64_t timeout_counter =  (uint64_t)glCommonTimeOutMsec * 1000 * 1000;
+        uint64_t overflow;
+
+        if (clock_gettime(CLOCK_REALTIME, &ts) == -1)
+        {
+            return -1;
+        }
+        overflow = timeout_counter + ts.tv_nsec;
+        ts.tv_sec += overflow / 1000000000ul;
+        ts.tv_nsec = overflow % 1000000000ul;
+
+        return sem_timedwait(sem, &ts);
+    }
+}
+
+int XLinkWaitSemUserMode(sem_t* sem, unsigned int timeout)
+{
+    ASSERT_X_LINK_R(sem != NULL, -1);
+
+    if (timeout == 0)
+    {
+        return sem_wait(sem);
+    }
+    else if (timeout == DEFAULT_TIMEOUT)
+    {
+        return XLinkWaitSem(sem);
+    }
+    else
+    {
+        struct timespec ts;
+        uint64_t timeout_counter =  (uint64_t)timeout * 1000 * 1000;
+        uint64_t overflow;
+
+        if (clock_gettime(CLOCK_REALTIME, &ts) == -1)
+        {
+            return -1;
+        }
+        overflow = timeout_counter + ts.tv_nsec;
+        ts.tv_sec += overflow / 1000000000ul;
+        ts.tv_nsec = overflow % 1000000000ul;
+
+        return sem_timedwait(sem, &ts);
+    }
+}
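+
+// Summary of the timeout argument above: 0 blocks forever (plain sem_wait),
+// DEFAULT_TIMEOUT falls back to the global glCommonTimeOutMsec, and any other
+// value is a per-call timeout in milliseconds.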
+
+int dispatcherLocalEventGetResponse(xLinkEvent_t* event, xLinkEvent_t* response);
+int dispatcherRemoteEventGetResponse(xLinkEvent_t* event, xLinkEvent_t* response);
+//sends an event (header, plus payload for write requests) to the remote peer
+int dispatcherEventSend(xLinkEvent_t* event);
+streamDesc_t* getStreamById(void* fd, streamId_t id);
+void releaseStream(streamDesc_t*);
+int addNewPacketToStream(streamDesc_t* stream, void* buffer, uint32_t size);
+
+static XLinkError_t checkEventHeader(xLinkEventHeader_t header);
+
+struct dispatcherControlFunctions controlFunctionTbl;
+XLinkGlobalHandler_t* glHandler; //TODO need to either protect this with a semaphore
+                                 // or make profiling data per device
+linkId_t nextUniqueLinkId = 0; //incremental number, doesn't get decremented.
+
+xLinkDesc_t availableXLinks[MAX_LINKS];
+
+sem_t  pingSem; //to be used by myriad
+
+static void copy_string(char *dst, const char *src, size_t maxLength) {
+    strncpy(dst, src, maxLength - 1);
+    dst[maxLength - 1] = '\0';
+}
+
+char* TypeToStr(int type)
+{
+    switch(type)
+    {
+        case XLINK_WRITE_REQ:     return "XLINK_WRITE_REQ";
+        case XLINK_READ_REQ:      return "XLINK_READ_REQ";
+        case XLINK_READ_REL_REQ:  return "XLINK_READ_REL_REQ";
+        case XLINK_CREATE_STREAM_REQ:return "XLINK_CREATE_STREAM_REQ";
+        case XLINK_CLOSE_STREAM_REQ: return "XLINK_CLOSE_STREAM_REQ";
+        case XLINK_PING_REQ:         return "XLINK_PING_REQ";
+        case XLINK_RESET_REQ:        return "XLINK_RESET_REQ";
+        case XLINK_REQUEST_LAST:     return "XLINK_REQUEST_LAST";
+        case XLINK_WRITE_RESP:   return "XLINK_WRITE_RESP";
+        case XLINK_READ_RESP:     return "XLINK_READ_RESP";
+        case XLINK_READ_REL_RESP: return "XLINK_READ_REL_RESP";
+        case XLINK_CREATE_STREAM_RESP: return "XLINK_CREATE_STREAM_RESP";
+        case XLINK_CLOSE_STREAM_RESP:  return "XLINK_CLOSE_STREAM_RESP";
+        case XLINK_PING_RESP:  return "XLINK_PING_RESP";
+        case XLINK_RESET_RESP: return "XLINK_RESET_RESP";
+        case XLINK_RESP_LAST:  return "XLINK_RESP_LAST";
+        default:
+        break;
+    }
+    return "";
+}
+
+static XLinkError_t parseUsbLinkPlatformError(xLinkPlatformErrorCode_t rc) {
+    switch (rc) {
+        case X_LINK_PLATFORM_SUCCESS:
+            return X_LINK_SUCCESS;
+        case X_LINK_PLATFORM_DEVICE_NOT_FOUND:
+            return X_LINK_DEVICE_NOT_FOUND;
+        case X_LINK_PLATFORM_TIMEOUT:
+            return X_LINK_TIMEOUT;
+        default:
+            return X_LINK_ERROR;
+    }
+}
+
+const char* XLinkErrorToStr(XLinkError_t rc) {
+    switch (rc) {
+        case X_LINK_SUCCESS:
+            return "X_LINK_SUCCESS";
+        case X_LINK_ALREADY_OPEN:
+            return "X_LINK_ALREADY_OPEN";
+        case X_LINK_DEVICE_NOT_FOUND:
+            return "X_LINK_DEVICE_NOT_FOUND";
+        case X_LINK_TIMEOUT:
+            return "X_LINK_TIMEOUT";
+        case X_LINK_OUT_OF_MEMORY:
+            return "X_LINK_OUT_OF_MEMORY";
+        case X_LINK_ERROR:
+        default:
+            return "X_LINK_ERROR";
+    }
+}
+
+/*#################################################################################
+###################################### INTERNAL ###################################
+##################################################################################*/
+
+static float timespec_diff(struct timespec *start, struct timespec *stop)
+{
+    if ((stop->tv_nsec - start->tv_nsec) < 0) {
+        start->tv_sec = stop->tv_sec - start->tv_sec - 1;
+        start->tv_nsec = stop->tv_nsec - start->tv_nsec + 1000000000;
+    } else {
+        start->tv_sec = stop->tv_sec - start->tv_sec;
+        start->tv_nsec = stop->tv_nsec - start->tv_nsec;
+    }
+
+    return start->tv_nsec/ 1000000000.0 + start->tv_sec;
+}
+
+int handleIncomingEvent(xLinkEvent_t* event){
+    //this function's behavior depends on whether this peer is a client or a remote,
+    //and it performs the actions specific to this peer
+    mvLog(MVLOG_DEBUG, "%s, size %d, streamId %d.\n", TypeToStr(event->header.type), event->header.size, event->header.streamId);
+    void* buffer;
+    streamDesc_t* stream;
+    switch (event->header.type){
+    case XLINK_WRITE_REQ:
+        /*If we got here, we will read the data no matter what happens.
+          If we encounter any problems we will still read the data to keep
+          the communication working but send a NACK.*/
+        stream = getStreamById(event->xLinkFD, event->header.streamId);
+        ASSERT_X_LINK(stream);
+
+        stream->localFillLevel += event->header.size;
+        mvLog(MVLOG_DEBUG,"Got write, current local fill level is %u out of %u %u\n", stream->localFillLevel, stream->readSize, stream->writeSize);
+
+        buffer = allocateData(ALIGN_UP(event->header.size, __CACHE_LINE_SIZE), __CACHE_LINE_SIZE);
+        if (buffer == NULL){
+            mvLog(MVLOG_FATAL,"out of memory\n");
+            ASSERT_X_LINK(0);
+        }
+        int sc = XLinkRead(event->xLinkFD, buffer, event->header.size, USB_DATA_TIMEOUT);
+        if(sc < 0){
+            mvLog(MVLOG_ERROR,"%s() Read failed %d\n", __func__, (int)sc);
+            deallocateData(buffer, ALIGN_UP(event->header.size, __CACHE_LINE_SIZE), __CACHE_LINE_SIZE);
+            ASSERT_X_LINK(0);
+        }
+
+        event->data = buffer;
+        if (addNewPacketToStream(stream, buffer, event->header.size)){
+            mvLog(MVLOG_WARN,"No more space in the stream, releasing the packet\n");
+            deallocateData(buffer, ALIGN_UP(event->header.size, __CACHE_LINE_SIZE), __CACHE_LINE_SIZE);
+            event->header.flags.bitField.ack = 0;
+            event->header.flags.bitField.nack = 1;
+            assert(0);
+        }
+        releaseStream(stream);
+        break;
+    case XLINK_READ_REQ:
+        break;
+    case XLINK_READ_REL_REQ:
+        break;
+    case XLINK_CREATE_STREAM_REQ:
+        break;
+    case XLINK_CLOSE_STREAM_REQ:
+        break;
+    case XLINK_PING_REQ:
+        break;
+    case XLINK_RESET_REQ:
+        break;
+    case XLINK_WRITE_RESP:
+        break;
+    case XLINK_READ_RESP:
+        break;
+    case XLINK_READ_REL_RESP:
+        break;
+    case XLINK_CREATE_STREAM_RESP:
+        break;
+    case XLINK_CLOSE_STREAM_RESP:
+        break;
+    case XLINK_PING_RESP:
+        break;
+    case XLINK_RESET_RESP:
+        break;
+    default:
+        ASSERT_X_LINK(0);
+    }
+    //adding event for the scheduler. We let it know that this is a remote event
+    dispatcherAddEvent(EVENT_REMOTE, event);
+    return 0;
+}
+
+int dispatcherEventReceive(xLinkEvent_t* event){
+    static xLinkEvent_t prevEvent = {0};
+
+    int sc = XLinkRead(event->xLinkFD, &event->header, sizeof(event->header), 0);
+
+    mvLog(MVLOG_DEBUG,"Incoming event %p: %s %d %p prevEvent: %s %d %p\n",
+                                event,
+                                TypeToStr(event->header.type),
+                                (int)event->header.id,
+                                event->xLinkFD,
+                                TypeToStr(prevEvent.header.type),
+                                (int)prevEvent.header.id,
+                                prevEvent.xLinkFD);
+
+    if(sc < 0 && event->header.type == XLINK_RESET_RESP) {
+        return sc;
+    }
+
+    if(sc < 0){
+        mvLog(MVLOG_ERROR,"%s() Read failed %d | event %p %s\n", __func__, (int)sc, event, TypeToStr(event->header.type));
+        return sc;
+    }
+
+    if (prevEvent.header.id == event->header.id &&
+            prevEvent.header.type == event->header.type &&
+            prevEvent.xLinkFD == event->xLinkFD)
+    {
+        mvLog(MVLOG_FATAL,"Duplicate id detected. \n");
+    }
+
+    prevEvent = *event;
+    if (handleIncomingEvent(event) != 0) {
+        mvLog(MVLOG_WARN,"Failed to handle incoming event");
+    }
+
+    if(event->header.type == XLINK_RESET_REQ)
+    {
+#ifdef USE_PCIE
+        mvLog(MVLOG_DEBUG,"XLINK_RESET_REQ received - doing nothing, we don't want to reset the device");
+#else
+        return -1;
+#endif // USE_PCIE
+    }
+
+    return 0;
+}
+
+int getLinkIndex(void* fd)
+{
+    int i;
+    for (i = 0; i < MAX_LINKS; i++)
+        if (availableXLinks[i].fd == fd)
+            return i;
+    return -1;
+}
+
+xLinkDesc_t* getLinkById(linkId_t id)
+{
+    int i;
+    for (i = 0; i < MAX_LINKS; i++)
+        if (availableXLinks[i].id == id)
+            return &availableXLinks[i];
+    return NULL;
+}
+xLinkDesc_t* getLink(void* fd)
+{
+    int i;
+    for (i = 0; i < MAX_LINKS; i++)
+        if (availableXLinks[i].fd == fd)
+            return &availableXLinks[i];
+    return NULL;
+}
+
+static linkId_t getNextAvailableLinkUniqueId()
+{
+    linkId_t start = nextUniqueLinkId;
+    do
+    {
+        int i;
+        nextUniqueLinkId++;
+        if (nextUniqueLinkId == INVALID_LINK_ID)
+        {
+            nextUniqueLinkId = 0;
+        }
+        for (i = 0; i < MAX_LINKS; i++)
+        {
+            if (availableXLinks[i].id != INVALID_LINK_ID &&
+                availableXLinks[i].id == nextUniqueLinkId)
+                break;
+        }
+        if (i >= MAX_LINKS)
+        {
+            return nextUniqueLinkId;
+        }
+    } while (start != nextUniqueLinkId);
+    mvLog(MVLOG_ERROR, "%s():- no next available link!\n", __func__);
+    return INVALID_LINK_ID;
+}
+
+static int getNextAvailableLinkIndex()
+{
+    int i;
+    for (i = 0; i < MAX_LINKS; i++)
+        if (availableXLinks[i].id == INVALID_LINK_ID)
+            return i;
+
+    mvLog(MVLOG_ERROR,"%s():- no next available link!\n", __func__);
+    return -1;
+}
+
+int getNextAvailableStreamIndex(xLinkDesc_t* link)
+{
+    if (link == NULL)
+        return -1;
+
+    int idx;
+    for (idx = 0; idx < XLINK_MAX_STREAMS; idx++) {
+        if (link->availableStreams[idx].id == INVALID_STREAM_ID)
+            return idx;
+    }
+
+    mvLog(MVLOG_DEBUG,"%s(): - no next available stream!\n", __func__);
+    return -1;
+}
+
+streamDesc_t* getStreamById(void* fd, streamId_t id)
+{
+    xLinkDesc_t* link = getLink(fd);
+    ASSERT_X_LINK_R(link != NULL, NULL);
+    int stream;
+    for (stream = 0; stream < XLINK_MAX_STREAMS; stream++) {
+        if (link->availableStreams[stream].id == id) {
+            if (XLinkWaitSem(&link->availableStreams[stream].sem))
+                return NULL;
+            return &link->availableStreams[stream];
+        }
+    }
+    return NULL;
+}
+
+streamDesc_t* getStreamByName(xLinkDesc_t* link, const char* name)
+{
+    ASSERT_X_LINK_R(link != NULL, NULL);
+    int stream;
+    for (stream = 0; stream < XLINK_MAX_STREAMS; stream++) {
+        if (link->availableStreams[stream].id != INVALID_STREAM_ID &&
+            strcmp(link->availableStreams[stream].name, name) == 0) {
+                if (XLinkWaitSem(&link->availableStreams[stream].sem))
+                    return NULL;
+                return &link->availableStreams[stream];
+        }
+    }
+    return NULL;
+}
+
+void releaseStream(streamDesc_t* stream)
+{
+    if (stream && stream->id != INVALID_STREAM_ID) {
+        sem_post(&stream->sem);
+    }
+    else {
+        mvLog(MVLOG_DEBUG,"trying to release a semaphore for a released stream\n");
+    }
+}
+
+streamId_t getStreamIdByName(xLinkDesc_t* link, const char* name)
+{
+    streamDesc_t* stream = getStreamByName(link, name);
+    streamId_t id;
+    if (stream) {
+        id = stream->id;
+        releaseStream(stream);
+        return id;
+    }
+    else
+        return INVALID_STREAM_ID;
+}
+
+streamPacketDesc_t* getPacketFromStream(streamDesc_t* stream)
+{
+    streamPacketDesc_t* ret = NULL;
+    if (stream->availablePackets)
+    {
+        ret = &stream->packets[stream->firstPacketUnused];
+        stream->availablePackets--;
+        CIRCULAR_INCREMENT(stream->firstPacketUnused,
+                            XLINK_MAX_PACKETS_PER_STREAM);
+        stream->blockedPackets++;
+    }
+    return ret;
+}
+
+void deallocateStream(streamDesc_t* stream)
+{
+    if (stream && stream->id != INVALID_STREAM_ID)
+    {
+        if (stream->readSize)
+        {
+            stream->readSize = 0;
+            stream->closeStreamInitiated = 0;
+        }
+    }
+}
+
+
+int releasePacketFromStream(streamDesc_t* stream, uint32_t* releasedSize)
+{
+    streamPacketDesc_t* currPack = &stream->packets[stream->firstPacket];
+    if(stream->blockedPackets == 0){
+        mvLog(MVLOG_ERROR,"There is no packet to release\n");
+        return 0; // ignore this, although this is a big problem on application side
+    }
+    stream->localFillLevel -= currPack->length;
+    mvLog(MVLOG_DEBUG,"Got release, current local fill level is %u out of %u %u\n", stream->localFillLevel, stream->readSize, stream->writeSize);
+
+    deallocateData(currPack->data, ALIGN_UP_INT32((int32_t)currPack->length, __CACHE_LINE_SIZE), __CACHE_LINE_SIZE);
+    CIRCULAR_INCREMENT(stream->firstPacket, XLINK_MAX_PACKETS_PER_STREAM);
+    stream->blockedPackets--;
+    *releasedSize = currPack->length;
+    return 0;
+}
+
+int isStreamSpaceEnoughFor(streamDesc_t* stream, uint32_t size)
+{
+    if(stream->remoteFillPacketLevel >= XLINK_MAX_PACKETS_PER_STREAM ||
+        stream->remoteFillLevel + size > stream->writeSize){
+        mvLog(MVLOG_DEBUG, "S%d: Not enough space in stream for %u: PKT %u, FILL %u SIZE %u\n",
+            stream->id, size, stream->remoteFillPacketLevel, stream->remoteFillLevel, stream->writeSize);
+        return 0;
+    }
+    else
+        return 1;
+}
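+
+// Flow-control summary (derived from the fields used above): the writer tracks
+// how much it has pushed to the peer in bytes (remoteFillLevel) and in packets
+// (remoteFillPacketLevel). A write blocks when either counter would exceed the
+// stream's budget (writeSize bytes / XLINK_MAX_PACKETS_PER_STREAM packets) and
+// is unblocked when the peer releases packets via XLINK_READ_REL_REQ.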
+
+int addNewPacketToStream(streamDesc_t* stream, void* buffer, uint32_t size){
+    if (stream->availablePackets + stream->blockedPackets < XLINK_MAX_PACKETS_PER_STREAM)
+    {
+        stream->packets[stream->firstPacketFree].data = buffer;
+        stream->packets[stream->firstPacketFree].length = size;
+        CIRCULAR_INCREMENT(stream->firstPacketFree, XLINK_MAX_PACKETS_PER_STREAM);
+        stream->availablePackets++;
+        return 0;
+    }
+    return -1;
+}
+
+streamId_t allocateNewStream(void* fd,
+                            const char* name,
+                            uint32_t writeSize,
+                            uint32_t readSize,
+                            streamId_t forcedId)
+{
+    streamId_t streamId;
+    streamDesc_t* stream;
+    xLinkDesc_t* link = getLink(fd);
+    ASSERT_X_LINK_R(link != NULL, INVALID_STREAM_ID);
+
+    stream = getStreamByName(link, name);
+
+    if (stream != NULL)
+    {
+        /*the stream already exists*/
+        if ((writeSize > stream->writeSize && stream->writeSize != 0) ||
+            (readSize > stream->readSize && stream->readSize != 0))
+        {
+            mvLog(MVLOG_ERROR, "%s(): stream name already exists, id %d\n", __func__, (int)stream->id);
+            return INVALID_STREAM_ID;
+        }
+    }
+    else
+    {
+        int idx = getNextAvailableStreamIndex(link);
+
+        if (idx == -1)
+        {
+            return INVALID_STREAM_ID;
+        }
+        stream = &link->availableStreams[idx];
+        if (forcedId == INVALID_STREAM_ID)
+            stream->id = link->nextUniqueStreamId;
+        else
+            stream->id = forcedId;
+        link->nextUniqueStreamId++; //even if we didn't use a new one, we need to stay aligned with the total number of unique streams
+        int sem_initiated = strlen(stream->name) != 0;
+        strncpy(stream->name, name, MAX_NAME_LENGTH - 1);
+        stream->name[MAX_NAME_LENGTH - 1] = '\0';
+        stream->readSize = 0;
+        stream->writeSize = 0;
+        stream->remoteFillLevel = 0;
+        stream->remoteFillPacketLevel = 0;
+
+        stream->localFillLevel = 0;
+        stream->closeStreamInitiated = 0;
+        if (!sem_initiated) //calling sem_init on an already initialized semaphore is undefined behavior
+            sem_init(&stream->sem, 0, 0);
+    }
+    if (readSize && !stream->readSize)
+    {
+        stream->readSize = readSize;
+    }
+    if (writeSize && !stream->writeSize)
+    {
+        stream->writeSize = writeSize;
+    }
+    streamId = stream->id;
+    releaseStream(stream);
+    return streamId;
+}
+
+static void setEventFailed(xLinkEvent_t * event )
+{
+    event->header.flags.bitField.localServe = 1;
+    event->header.flags.bitField.ack = 0;
+    event->header.flags.bitField.nack = 1;
+}
+
+//this function should be called only for local requests
+int dispatcherLocalEventGetResponse(xLinkEvent_t* event, xLinkEvent_t* response)
+{
+    streamDesc_t* stream;
+    response->header.id = event->header.id;
+    mvLog(MVLOG_DEBUG, "%s\n",TypeToStr(event->header.type));
+    switch (event->header.type){
+    case XLINK_WRITE_REQ:
+        //in case local tries to write after it issues close (writeSize is zero)
+        stream = getStreamById(event->xLinkFD, event->header.streamId);
+        if(!stream){
+            mvLog(MVLOG_DEBUG, "stream %d has been closed!\n", event->header.streamId);
+            setEventFailed(event);
+            break;
+        }
+        if (stream->writeSize == 0)
+        {
+            event->header.flags.bitField.nack = 1;
+            event->header.flags.bitField.ack = 0;
+            // return -1 so that it is not even sent to the remote
+            releaseStream(stream);
+            return -1;
+        }
+        event->header.flags.bitField.ack = 1;
+        event->header.flags.bitField.nack = 0;
+        event->header.flags.bitField.localServe = 0;
+
+        if(!isStreamSpaceEnoughFor(stream, event->header.size)){
+            mvLog(MVLOG_DEBUG,"local NACK RTS. stream is full\n");
+            event->header.flags.bitField.block = 1;
+            event->header.flags.bitField.localServe = 1;
+        }else{
+            event->header.flags.bitField.block = 0;
+            stream->remoteFillLevel += event->header.size;
+            stream->remoteFillPacketLevel++;
+
+            mvLog(MVLOG_DEBUG,"Got local write remote fill level %u out of %u %u\n", stream->remoteFillLevel, stream->writeSize, stream->readSize);
+        }
+        releaseStream(stream);
+        break;
+    case XLINK_READ_REQ:
+        stream = getStreamById(event->xLinkFD, event->header.streamId);
+        if(!stream){
+            mvLog(MVLOG_DEBUG, "stream %d has been closed!\n", event->header.streamId);
+            setEventFailed(event);
+            break;
+        }
+        streamPacketDesc_t* packet = getPacketFromStream(stream);
+        if (packet){
+            //the read can be served with this packet
+            event->data = packet;
+            event->header.flags.bitField.ack = 1;
+            event->header.flags.bitField.nack = 0;
+            event->header.flags.bitField.block = 0;
+        }
+        else{
+            event->header.flags.bitField.block = 1;
+        }
+        releaseStream(stream);
+        event->header.flags.bitField.localServe = 1;
+        break;
+    case XLINK_READ_REL_REQ:
+        stream = getStreamById(event->xLinkFD, event->header.streamId);
+        ASSERT_X_LINK(stream);
+        uint32_t releasedSize = 0;
+        releasePacketFromStream(stream, &releasedSize);
+        event->header.size = releasedSize;
+        releaseStream(stream);
+        break;
+    case XLINK_CREATE_STREAM_REQ:
+        break;
+    case XLINK_CLOSE_STREAM_REQ:
+        stream = getStreamById(event->xLinkFD, event->header.streamId);
+
+        ASSERT_X_LINK(stream);
+        if (stream->remoteFillLevel != 0){
+            stream->closeStreamInitiated = 1;
+            event->header.flags.bitField.block = 1;
+            event->header.flags.bitField.localServe = 1;
+        }else{
+            event->header.flags.bitField.block = 0;
+            event->header.flags.bitField.localServe = 0;
+        }
+        releaseStream(stream);
+        break;
+    case XLINK_RESET_REQ:
+        mvLog(MVLOG_DEBUG,"XLINK_RESET_REQ - do nothing\n");
+        break;
+    case XLINK_PING_REQ:
+    case XLINK_WRITE_RESP:
+    case XLINK_READ_RESP:
+    case XLINK_READ_REL_RESP:
+    case XLINK_CREATE_STREAM_RESP:
+    case XLINK_CLOSE_STREAM_RESP:
+    case XLINK_PING_RESP:
+        break;
+    case XLINK_RESET_RESP:
+        //should not happen
+        event->header.flags.bitField.localServe = 1;
+        break;
+    default:
+        ASSERT_X_LINK(0);
+    }
+    return 0;
+}
+
+//this function should be called only for remote requests
+int dispatcherRemoteEventGetResponse(xLinkEvent_t* event, xLinkEvent_t* response)
+{
+    streamDesc_t* stream;
+    response->header.id = event->header.id;
+    response->header.flags.raw = 0;
+    mvLog(MVLOG_DEBUG, "%s\n",TypeToStr(event->header.type));
+
+    switch (event->header.type)
+    {
+        case XLINK_WRITE_REQ:
+            //let remote write immediately as we have a local buffer for the data
+            response->header.type = XLINK_WRITE_RESP;
+            response->header.size = event->header.size;
+            response->header.streamId = event->header.streamId;
+            response->header.flags.bitField.ack = 1;
+            response->xLinkFD = event->xLinkFD;
+
+            // we got some data. We should unblock a blocked read
+            int xxx = dispatcherUnblockEvent(-1,
+                                             XLINK_READ_REQ,
+                                             response->header.streamId,
+                                             event->xLinkFD);
+            (void) xxx;
+            mvLog(MVLOG_DEBUG,"unblocked from stream %d %d\n",
+                  (int)response->header.streamId, (int)xxx);
+            break;
+        case XLINK_READ_REQ:
+            break;
+        case XLINK_READ_REL_REQ:
+            response->header.flags.bitField.ack = 1;
+            response->header.flags.bitField.nack = 0;
+            response->header.type = XLINK_READ_REL_RESP;
+            response->xLinkFD = event->xLinkFD;
+            stream = getStreamById(event->xLinkFD,
+                                   event->header.streamId);
+            ASSERT_X_LINK(stream);
+            stream->remoteFillLevel -= event->header.size;
+            stream->remoteFillPacketLevel--;
+
+            mvLog(MVLOG_DEBUG,"Got remote release %u, remote fill level %u out of %u %u\n",
+                  event->header.size, stream->remoteFillLevel, stream->writeSize, stream->readSize);
+            releaseStream(stream);
+
+            dispatcherUnblockEvent(-1, XLINK_WRITE_REQ, event->header.streamId,
+                                    event->xLinkFD);
+            //with every released packet check if the stream is already marked for close
+            if (stream->closeStreamInitiated && stream->localFillLevel == 0)
+            {
+                mvLog(MVLOG_DEBUG,"%s() Unblock close STREAM\n", __func__);
+                int xxx = dispatcherUnblockEvent(-1,
+                                                 XLINK_CLOSE_STREAM_REQ,
+                                                 event->header.streamId,
+                                                 event->xLinkFD);
+                (void) xxx;
+            }
+            break;
+        case XLINK_CREATE_STREAM_REQ:
+            response->header.flags.bitField.ack = 1;
+            response->header.type = XLINK_CREATE_STREAM_RESP;
+            //write size from remote means read size for this peer
+            response->header.streamId = allocateNewStream(event->xLinkFD,
+                                                          event->header.streamName,
+                                                          0, event->header.size,
+                                                          INVALID_STREAM_ID);
+            response->xLinkFD = event->xLinkFD;
+            copy_string(response->header.streamName, event->header.streamName, MAX_NAME_LENGTH);
+            response->header.size = event->header.size;
+            mvLog(MVLOG_DEBUG,"creating stream %x\n", (int)response->header.streamId);
+            break;
+        case XLINK_CLOSE_STREAM_REQ:
+            {
+                response->header.type = XLINK_CLOSE_STREAM_RESP;
+                response->header.streamId = event->header.streamId;
+                response->xLinkFD = event->xLinkFD;
+
+                streamDesc_t* stream = getStreamById(event->xLinkFD,
+                                                     event->header.streamId);
+                if (!stream) {
+                    //if we have sent a NACK before, when the event gets unblocked
+                    //the stream might already be unavailable
+                    response->header.flags.bitField.ack = 1; //All is good, we are done
+                    response->header.flags.bitField.nack = 0;
+                    mvLog(MVLOG_DEBUG,"%s() got a close stream on an already closed stream\n", __func__);
+                } else {
+                    if (stream->localFillLevel == 0)
+                    {
+                        response->header.flags.bitField.ack = 1;
+                        response->header.flags.bitField.nack = 0;
+
+                        deallocateStream(stream);
+                        if (!stream->writeSize) {
+                            stream->id = INVALID_STREAM_ID;
+                        }
+                    }
+                    else
+                    {
+                        mvLog(MVLOG_DEBUG,"%s():fifo is NOT empty returning NACK \n", __func__);
+                        response->header.flags.bitField.nack = 1;
+                        stream->closeStreamInitiated = 1;
+                    }
+
+                    releaseStream(stream);
+                }
+                break;
+            }
+        case XLINK_PING_REQ:
+            response->header.type = XLINK_PING_RESP;
+            response->header.flags.bitField.ack = 1;
+            response->xLinkFD = event->xLinkFD;
+            sem_post(&pingSem);
+            break;
+        case XLINK_RESET_REQ:
+            mvLog(MVLOG_DEBUG,"reset request\n");
+            response->header.flags.bitField.ack = 1;
+            response->header.flags.bitField.nack = 0;
+            response->header.type = XLINK_RESET_RESP;
+            response->xLinkFD = event->xLinkFD;
+            // need to send the response, serve the event and then reset
+            break;
+        case XLINK_WRITE_RESP:
+            break;
+        case XLINK_READ_RESP:
+            break;
+        case XLINK_READ_REL_RESP:
+            break;
+        case XLINK_CREATE_STREAM_RESP:
+        {
+            // the write size from the response is the size of the buffer on the remote side
+            response->header.streamId = allocateNewStream(event->xLinkFD,
+                                                          event->header.streamName,
+                                                          event->header.size,0,
+                                                          event->header.streamId);
+            response->xLinkFD = event->xLinkFD;
+            break;
+        }
+        case XLINK_CLOSE_STREAM_RESP:
+        {
+            streamDesc_t* stream = getStreamById(event->xLinkFD,
+                                                 event->header.streamId);
+
+            if (!stream){
+                response->header.flags.bitField.nack = 1;
+                response->header.flags.bitField.ack = 0;
+                break;
+            }
+            stream->writeSize = 0;
+            if (!stream->readSize) {
+                response->header.flags.bitField.nack = 1;
+                response->header.flags.bitField.ack = 0;
+                stream->id = INVALID_STREAM_ID;
+                break;
+            }
+            releaseStream(stream);
+            break;
+        }
+        case XLINK_PING_RESP:
+            break;
+        case XLINK_RESET_RESP:
+            break;
+        default:
+            ASSERT_X_LINK(0);
+    }
+    return 0;
+}
+
+//sends an event (header, plus payload for write requests) to the remote peer
+int dispatcherEventSend(xLinkEvent_t *event)
+{
+    mvLog(MVLOG_DEBUG, "%s, size %d, streamId %d.\n", TypeToStr(event->header.type), event->header.size, event->header.streamId);
+    int rc = XLinkWrite(event->xLinkFD, &event->header, sizeof(event->header), USB_DATA_TIMEOUT);
+    if(rc < 0)
+    {
+        mvLog(MVLOG_ERROR,"Write failed header %d | event %s\n", rc, TypeToStr(event->header.type));
+        return rc;
+    }
+    if (event->header.type == XLINK_WRITE_REQ)
+    {
+        //write requested data
+        rc = XLinkWrite(event->xLinkFD, event->data,
+                          event->header.size, USB_DATA_TIMEOUT);
+
+        if(rc < 0) {
+            mvLog(MVLOG_ERROR,"Write failed event %d\n", rc);
+        }
+    }
+    // this function will send events to the remote node
+    return 0;
+}
+
+static xLinkState_t getXLinkState(xLinkDesc_t* link)
+{
+    ASSERT_X_LINK_R(link != NULL, XLINK_NOT_INIT);
+    mvLog(MVLOG_DEBUG,"%s() link %p link->peerState %d\n", __func__,link, link->peerState);
+    return link->peerState;
+}
+
+void dispatcherCloseLink(void* fd)
+{
+    xLinkDesc_t* link = getLink(fd);
+
+    if (!link) {
+        mvLog(MVLOG_WARN, "Dispatcher link is null");
+        return;
+    }
+
+    link->peerState = XLINK_NOT_INIT;
+    link->id = INVALID_LINK_ID;
+    link->fd = NULL;
+    link->nextUniqueStreamId = 0;
+
+    int index;
+    uint32_t release_size = 0;
+    streamDesc_t* stream;
+    for (index = 0; index < XLINK_MAX_STREAMS; index++)
+    {
+        stream = &link->availableStreams[index];
+        while (NULL != getPacketFromStream(stream))
+        {
+            releasePacketFromStream(stream, &release_size);
+        }
+        while (stream->blockedPackets != 0)
+        {
+            releasePacketFromStream(stream, &release_size);
+        }
+        if (stream->name[0] != '\0')
+        {
+            sem_destroy(&stream->sem); // ignore any error from an unused semaphore
+            stream->name[0] = '\0';
+        }
+        stream->id = INVALID_STREAM_ID;
+    }
+}
+
+void dispatcherCloseDeviceFd(void *fd)
+{
+    XLinkPlatformCloseRemote(fd);
+}
+
+
+/*#################################################################################
+###################################### EXTERNAL ###################################
+##################################################################################*/
+//Called only from app - per device
+XLinkError_t XLinkConnect(XLinkHandler_t* handler)
+{
+    ASSERT_X_LINK(handler);
+    if (strnlen(handler->devicePath, MAX_PATH_LENGTH) < 2) {
+        mvLog(MVLOG_ERROR, "Device path is incorrect");
+        return X_LINK_ERROR;
+    }
+
+    int index = getNextAvailableLinkIndex();
+    ASSERT_X_LINK(index != -1);
+
+    xLinkDesc_t* link = &availableXLinks[index];
+    mvLog(MVLOG_DEBUG,"%s() device name %s glHandler %p protocol %d\n", __func__, handler->devicePath, glHandler, glHandler->protocol);
+
+    if (XLinkPlatformConnect(handler->devicePath2, handler->devicePath, &link->fd) < 0) {
+        return X_LINK_ERROR;
+    }
+
+    if (dispatcherStart(link->fd))
+        return X_LINK_TIMEOUT;
+
+    xLinkEvent_t event = {0};
+    event.header.type = XLINK_PING_REQ;
+    event.xLinkFD = link->fd;
+    dispatcherAddEvent(EVENT_LOCAL, &event);
+
+    if (dispatcherWaitEventComplete(link->fd, glDeviceOpenTimeOutMsec)) {
+        dispatcherClean(link->fd);
+        return X_LINK_TIMEOUT;
+    }
+
+    link->id = getNextAvailableLinkUniqueId();
+    link->peerState = XLINK_UP;
+    handler->linkId = link->id;
+
+    return X_LINK_SUCCESS;
+}
+
+XLinkError_t XLinkInitialize(XLinkGlobalHandler_t* handler)
+{
+    ASSERT_X_LINK(XLINK_MAX_STREAMS <= MAX_POOLS_ALLOC);
+    glHandler = handler;
+    sem_init(&pingSem,0,0);
+    int i;
+
+#if (defined(_WIN32) || defined(_WIN64))
+    if (glHandler->protocol != USB_VSC && glHandler->protocol != PCIE) {
+        mvLog(MVLOG_ERROR, "Windows only support USB_VSC or PCIe!");
+        return X_LINK_COMMUNICATION_NOT_OPEN;
+    }
+#endif
+
+    int sc = XLinkPlatformInit(glHandler->protocol, glHandler->loglevel);
+    if (sc != X_LINK_SUCCESS) {
+       return X_LINK_COMMUNICATION_NOT_OPEN;
+    }
+
+    // FIXME Get rid of this workaround 
+    XLinkProtocol_t protocol = glHandler->protocol;
+    memset((void*)handler, 0, sizeof(XLinkGlobalHandler_t));
+    handler->protocol = protocol;
+
+    //initialize availableStreams
+    xLinkDesc_t* link;
+    for (i = 0; i < MAX_LINKS; i++) {
+        link = &availableXLinks[i];
+        link->id = INVALID_LINK_ID;
+        link->fd = NULL;
+        link->peerState = XLINK_NOT_INIT;
+        int stream;
+        for (stream = 0; stream < XLINK_MAX_STREAMS; stream++)
+            link->availableStreams[stream].id = INVALID_STREAM_ID;
+    }
+
+    controlFunctionTbl.eventReceive = &dispatcherEventReceive;
+    controlFunctionTbl.eventSend = &dispatcherEventSend;
+    controlFunctionTbl.localGetResponse = &dispatcherLocalEventGetResponse;
+    controlFunctionTbl.remoteGetResponse = &dispatcherRemoteEventGetResponse;
+    controlFunctionTbl.closeLink = &dispatcherCloseLink;
+    controlFunctionTbl.closeDeviceFd = &dispatcherCloseDeviceFd;
+
+    if (dispatcherInitialize(&controlFunctionTbl))
+        return X_LINK_TIMEOUT;
+
+    return X_LINK_SUCCESS;
+}
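+
+/* A minimal bring-up sketch (illustrative only; error handling is elided and
+ * the device path is hypothetical, real paths come from XLinkGetDeviceName):
+ *
+ *     XLinkGlobalHandler_t ghandler = { .protocol = USB_VSC };
+ *     if (XLinkInitialize(&ghandler) != X_LINK_SUCCESS) { ... }
+ *
+ *     XLinkHandler_t handler = { .devicePath = "1.1-ma2480" };
+ *     if (XLinkConnect(&handler) != X_LINK_SUCCESS) { ... }  // pings the device
+ *     // handler.linkId now identifies this link for stream operations
+ */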
+
+
+XLinkError_t XLinkGetFillLevel(streamId_t streamId, int isRemote, int* fillLevel)
+{
+    linkId_t id;
+    EXTRACT_IDS(streamId,id);
+    xLinkDesc_t* link = getLinkById(id);
+    streamDesc_t* stream;
+
+    ASSERT_X_LINK(link != NULL);
+    if (getXLinkState(link) != XLINK_UP)
+    {
+        return X_LINK_COMMUNICATION_NOT_OPEN;
+    }
+    stream = getStreamById(link->fd, streamId);
+    ASSERT_X_LINK(stream);
+
+    if (isRemote)
+        *fillLevel = stream->remoteFillLevel;
+    else
+        *fillLevel = stream->localFillLevel;
+    releaseStream(stream);
+    return X_LINK_SUCCESS;
+}
+
+streamId_t XLinkOpenStream(linkId_t id, const char* name, int stream_write_size)
+{
+    xLinkEvent_t event = {0};
+    xLinkDesc_t* link = getLinkById(id);
+    mvLog(MVLOG_DEBUG,"%s() id %d link %p\n", __func__, id, link);
+    ASSERT_X_LINK_R(link != NULL, INVALID_STREAM_ID);
+    if (getXLinkState(link) != XLINK_UP) {
+        /*no link*/
+        mvLog(MVLOG_DEBUG,"%s() no link up\n", __func__);
+        return INVALID_STREAM_ID;
+    }
+
+    if(strlen(name) > MAX_NAME_LENGTH)
+    {
+        mvLog(MVLOG_WARN,"name too long\n");
+        return INVALID_STREAM_ID;
+    }
+
+    if(stream_write_size > 0)
+    {
+        stream_write_size = ALIGN_UP(stream_write_size, __CACHE_LINE_SIZE);
+        event.header.type = XLINK_CREATE_STREAM_REQ;
+        strncpy(event.header.streamName, name, MAX_NAME_LENGTH - 1);
+        event.header.streamName[MAX_NAME_LENGTH - 1] = '\0';
+        event.header.size = stream_write_size;
+        event.header.streamId = INVALID_STREAM_ID;
+        event.xLinkFD = link->fd;
+
+        dispatcherAddEvent(EVENT_LOCAL, &event);
+        if (dispatcherWaitEventComplete(link->fd, DEFAULT_TIMEOUT))
+            return INVALID_STREAM_ID;
+
+        XLinkError_t eventStatus = checkEventHeader(event.header);
+        if (eventStatus != X_LINK_SUCCESS) {
+            mvLog(MVLOG_ERROR, "Got wrong package from device, error code = %s", XLinkErrorToStr(eventStatus));
+            // FIXME: not good solution, but seems the only in the case of such XLink API
+            if (eventStatus == X_LINK_OUT_OF_MEMORY) {
+                return INVALID_STREAM_ID_OUT_OF_MEMORY;
+            } else {
+                return INVALID_STREAM_ID;
+            }
+        }
+    }
+    streamId_t streamId = getStreamIdByName(link, name);
+
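+    // COMBIN_IDS (defined elsewhere) packs the link id into the upper bits of
+    // the 32-bit stream id, so raw stream ids above 0x0FFFFFFF cannot be
+    // represented; the check below also catches a failed lookup (INVALID_STREAM_ID).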
+    if (streamId > 0x0FFFFFFF) {
+        mvLog(MVLOG_ERROR, "Cannot find stream id by the \"%s\" name", name);
+        mvLog(MVLOG_ERROR,"Max streamId reached!");
+        return INVALID_STREAM_ID;
+    }
+    COMBIN_IDS(streamId, id);
+    return streamId;
+}
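+
+/* Stream round-trip sketch (illustrative; the stream name and sizes are
+ * arbitrary, error handling elided):
+ *
+ *     streamId_t sid = XLinkOpenStream(handler.linkId, "gdata", 8 * 1024);
+ *     if (sid == INVALID_STREAM_ID) { ... }
+ *     XLinkWriteData(sid, buffer, bufferSize);     // host -> device
+ *     streamPacketDesc_t* packet = NULL;
+ *     XLinkReadData(sid, &packet);                 // device -> host
+ *     XLinkReleaseData(sid);                       // hand the packet back
+ *     XLinkCloseStream(sid);
+ */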
+
+XLinkError_t checkEventHeader(xLinkEventHeader_t header) {
+    mvLog(MVLOG_DEBUG, "header.flags.bitField: ack:%u, nack:%u, sizeTooBig:%u, block:%u, bufferFull:%u, localServe:%u, noSuchStream:%u, terminate:%u",
+          header.flags.bitField.ack,
+          header.flags.bitField.nack,
+          header.flags.bitField.sizeTooBig,
+          header.flags.bitField.block,
+          header.flags.bitField.bufferFull,
+          header.flags.bitField.localServe,
+          header.flags.bitField.noSuchStream,
+          header.flags.bitField.terminate);
+
+
+    if (header.flags.bitField.ack) {
+        return X_LINK_SUCCESS;
+    } else if (header.flags.bitField.nack) {
+        return X_LINK_COMMUNICATION_FAIL;
+    } else if (header.flags.bitField.sizeTooBig) {
+        return X_LINK_OUT_OF_MEMORY;
+    } else {
+        return X_LINK_ERROR;
+    }
+}
+
+
+// Just like open stream: when closeStream is called,
+// on the local side we reset the writeSize,
+// and on the remote side we free the read buffer
+XLinkError_t XLinkCloseStream(streamId_t streamId)
+{
+    linkId_t id;
+    EXTRACT_IDS(streamId,id);
+    xLinkDesc_t* link = getLinkById(id);
+    ASSERT_X_LINK(link != NULL);
+
+    mvLog(MVLOG_DEBUG,"%s(): streamId %d\n", __func__, (int)streamId);
+    if (getXLinkState(link) != XLINK_UP)
+        return X_LINK_COMMUNICATION_NOT_OPEN;
+
+    xLinkEvent_t event = {0};
+    event.header.type = XLINK_CLOSE_STREAM_REQ;
+    event.header.streamId = streamId;
+    event.xLinkFD = link->fd;
+    if (dispatcherAddEvent(EVENT_LOCAL, &event) == NULL) {
+        mvLog(MVLOG_ERROR, "Dispatcher failed on adding event");
+        return X_LINK_ERROR;
+    }
+
+    if (dispatcherWaitEventComplete(link->fd, DEFAULT_TIMEOUT))
+        return X_LINK_TIMEOUT;
+
+    if (event.header.flags.bitField.ack == 1)
+        return X_LINK_SUCCESS;
+    else
+        return X_LINK_COMMUNICATION_FAIL;
+}
+
+
+XLinkError_t XLinkGetAvailableStreams(linkId_t id)
+{
+    xLinkDesc_t* link = getLinkById(id);
+    ASSERT_X_LINK(link != NULL);
+    if (getXLinkState(link) != XLINK_UP)
+    {
+        return X_LINK_COMMUNICATION_NOT_OPEN;
+    }
+    return X_LINK_SUCCESS;
+}
+
+XLinkError_t XLinkGetDeviceName(int index, char* name, int nameSize, int pid)
+{
+    xLinkPlatformErrorCode_t rc;
+    rc = XLinkPlatformGetDeviceNameExtended(index, name, nameSize, pid);
+    return parseUsbLinkPlatformError(rc);
+}
+
+static XLinkError_t writeData(streamId_t streamId, const uint8_t* buffer,
+                            int size, unsigned int timeout)
+{
+    linkId_t id;
+    EXTRACT_IDS(streamId,id);
+    xLinkDesc_t* link = getLinkById(id);
+    ASSERT_X_LINK(link != NULL);
+    if (getXLinkState(link) != XLINK_UP)
+    {
+        return X_LINK_COMMUNICATION_NOT_OPEN;
+    }
+    struct timespec start, end;
+    clock_gettime(CLOCK_REALTIME, &start);
+
+    xLinkEvent_t event = {0};
+    event.header.type = XLINK_WRITE_REQ;
+    event.header.size = size;
+    event.header.streamId = streamId;
+    event.xLinkFD = link->fd;
+    event.data = (void*)buffer;
+
+    if (dispatcherAddEvent(EVENT_LOCAL, &event) == NULL) {
+        mvLog(MVLOG_ERROR, "Dispatcher failed on adding event");
+        return X_LINK_ERROR;
+    }
+    if (dispatcherWaitEventComplete(link->fd, timeout))
+        return X_LINK_TIMEOUT;
+
+    clock_gettime(CLOCK_REALTIME, &end);
+
+    if (event.header.flags.bitField.ack == 1)
+    {
+         //profile only on success
+        if( glHandler->profEnable)
+        {
+            glHandler->profilingData.totalWriteBytes += size;
+            glHandler->profilingData.totalWriteTime += timespec_diff(&start, &end);
+        }
+        return X_LINK_SUCCESS;
+    }
+    else
+        return X_LINK_COMMUNICATION_FAIL;
+}
+
+XLinkError_t XLinkWriteData(streamId_t streamId, const uint8_t* buffer,
+                            int size)
+{
+    return writeData(streamId, buffer, size, DEFAULT_TIMEOUT);
+}
+
+XLinkError_t XLinkWriteDataWithTimeout(streamId_t streamId, const uint8_t* buffer,
+                            int size, unsigned int timeout)
+{
+    return writeData(streamId, buffer, size, timeout);
+}
+
+
+XLinkError_t XLinkWriteGraphData(streamId_t streamId, const uint8_t* buffer, int size)
+{
+    return writeData(streamId, buffer, size, glAllocateGraphTimeOutMsec);
+}
+
+XLinkError_t XLinkAsyncWriteData()
+{
+    if (getXLinkState(NULL) != XLINK_UP)
+    {
+        return X_LINK_COMMUNICATION_NOT_OPEN;
+    }
+    return X_LINK_SUCCESS;
+}
+
+XLinkError_t XLinkReadData(streamId_t streamId, streamPacketDesc_t** packet)
+{
+    return XLinkReadDataWithTimeOut(streamId, packet, DEFAULT_TIMEOUT);
+}
+
+XLinkError_t XLinkReadDataWithTimeOut(streamId_t streamId, streamPacketDesc_t** packet, unsigned int timeout)
+{
+    linkId_t id;
+    EXTRACT_IDS(streamId,id);
+    xLinkDesc_t* link = getLinkById(id);
+    ASSERT_X_LINK(link != NULL);
+    if (getXLinkState(link) != XLINK_UP)
+    {
+        return X_LINK_COMMUNICATION_NOT_OPEN;
+    }
+
+    xLinkEvent_t event = {0};
+    struct timespec start, end;
+
+    event.header.type = XLINK_READ_REQ;
+    event.header.size = 0;
+    event.header.streamId = streamId;
+    event.xLinkFD = link->fd;
+    event.data = NULL;
+
+    clock_gettime(CLOCK_REALTIME, &start);
+
+    if (dispatcherAddEvent(EVENT_LOCAL, &event) == NULL) {
+        mvLog(MVLOG_ERROR, "Dispatcher failed on adding event");
+        return X_LINK_ERROR;
+    }
+    if (dispatcherWaitEventComplete(link->fd, timeout))
+        return X_LINK_TIMEOUT;
+
+    if (event.data == NULL) {
+        mvLog(MVLOG_ERROR, "Event data is invalid");
+        return X_LINK_ERROR;
+    }
+
+    *packet = (streamPacketDesc_t *)event.data;
+    clock_gettime(CLOCK_REALTIME, &end);
+
+    if (event.header.flags.bitField.ack == 1)
+    {
+        if( glHandler->profEnable)
+        {
+            glHandler->profilingData.totalReadBytes += (*packet)->length;
+            glHandler->profilingData.totalReadTime += timespec_diff(&start, &end);
+        }
+        return X_LINK_SUCCESS;
+    }
+    else
+        return X_LINK_COMMUNICATION_FAIL;
+}
+
+XLinkError_t XLinkReleaseData(streamId_t streamId)
+{
+    linkId_t id;
+    EXTRACT_IDS(streamId,id);
+    xLinkDesc_t* link = getLinkById(id);
+    ASSERT_X_LINK(link != NULL);
+    if (getXLinkState(link) != XLINK_UP)
+    {
+        return X_LINK_COMMUNICATION_NOT_OPEN;
+    }
+
+    xLinkEvent_t event = {0};
+    event.header.type = XLINK_READ_REL_REQ;
+    event.header.streamId = streamId;
+    event.xLinkFD = link->fd;
+
+    if (dispatcherAddEvent(EVENT_LOCAL, &event) == NULL) {
+        mvLog(MVLOG_ERROR, "Dispatcher failed on adding event");
+        return X_LINK_ERROR;
+    }
+    if (dispatcherWaitEventComplete(link->fd, DEFAULT_TIMEOUT))
+        return X_LINK_TIMEOUT;
+
+    if (event.header.flags.bitField.ack == 1)
+        return X_LINK_SUCCESS;
+    else
+        return X_LINK_COMMUNICATION_FAIL;
+}
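+
+/* Note: each successful XLinkReadData should eventually be paired with an
+ * XLinkReleaseData on the same stream; the release frees the oldest
+ * still-held packet, so a simple consume loop looks like (sketch,
+ * consume() is a hypothetical callback):
+ *
+ *     streamPacketDesc_t* packet = NULL;
+ *     while (XLinkReadData(sid, &packet) == X_LINK_SUCCESS) {
+ *         consume(packet->data, packet->length);
+ *         XLinkReleaseData(sid);
+ *     }
+ */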
+
+XLinkError_t XLinkBootRemote(const char* deviceName, const char* binaryPath)
+{
+    if (XLinkPlatformBootRemote(deviceName, binaryPath) == 0)
+        return X_LINK_SUCCESS;
+    else
+        return X_LINK_COMMUNICATION_FAIL;
+}
+
+XLinkError_t XLinkResetRemote(linkId_t id)
+{
+    xLinkDesc_t* link = getLinkById(id);
+    ASSERT_X_LINK(link != NULL);
+    if (getXLinkState(link) != XLINK_UP)
+    {
+        mvLog(MVLOG_WARN, "Link is down, close connection to device without reset");
+        XLinkPlatformCloseRemote(link->fd);
+        return X_LINK_COMMUNICATION_NOT_OPEN;
+    }
+
+    // Add event to reset device. After sending it, dispatcher will close fd link
+    xLinkEvent_t event = {0};
+    event.header.type = XLINK_RESET_REQ;
+    event.xLinkFD = link->fd;
+    mvLog(MVLOG_DEBUG,"sending reset remote event\n");
+    dispatcherAddEvent(EVENT_LOCAL, &event);
+    if (dispatcherWaitEventComplete(link->fd, DEFAULT_TIMEOUT))
+        return X_LINK_TIMEOUT;
+
+    return X_LINK_SUCCESS;
+}
+
+XLinkError_t XLinkResetAll()
+{
+#if defined(USE_PCIE) || defined(NO_BOOT)
+    mvLog(MVLOG_INFO, "Devices will not be restarted for this configuration (PCIE or NO_BOOT)");
+#else
+    int i;
+    for (i = 0; i < MAX_LINKS; i++) {
+        if (availableXLinks[i].id != INVALID_LINK_ID) {
+            xLinkDesc_t* link = &availableXLinks[i];
+            int stream;
+            for (stream = 0; stream < XLINK_MAX_STREAMS; stream++) {
+                if (link->availableStreams[stream].id != INVALID_STREAM_ID) {
+                    streamId_t streamId = link->availableStreams[stream].id;
+                    mvLog(MVLOG_DEBUG,"%s() Closing stream (stream = %d) %d on link %d\n",
+                          __func__, stream, (int) streamId, (int) link->id);
+                    COMBIN_IDS(streamId, link->id);
+                    if (XLinkCloseStream(streamId) != X_LINK_SUCCESS) {
+                        mvLog(MVLOG_WARN,"Failed to close stream");
+                    }
+                }
+            }
+            if (XLinkResetRemote(link->id) != X_LINK_SUCCESS) {
+                mvLog(MVLOG_WARN,"Failed to reset");
+            }
+        }
+    }
+#endif
+    return X_LINK_SUCCESS;
+}
+
+XLinkError_t XLinkProfStart()
+{
+    glHandler->profEnable = 1;
+    glHandler->profilingData.totalReadBytes = 0;
+    glHandler->profilingData.totalWriteBytes = 0;
+    glHandler->profilingData.totalWriteTime = 0;
+    glHandler->profilingData.totalReadTime = 0;
+    glHandler->profilingData.totalBootCount = 0;
+    glHandler->profilingData.totalBootTime = 0;
+
+    return X_LINK_SUCCESS;
+}
+
+XLinkError_t XLinkProfStop()
+{
+    glHandler->profEnable = 0;
+    return X_LINK_SUCCESS;
+}
+
+XLinkError_t XLinkProfPrint()
+{
+    printf("XLink profiling results:\n");
+    if (glHandler->profilingData.totalWriteTime)
+    {
+        printf("Average write speed: %f MB/Sec\n",
+               glHandler->profilingData.totalWriteBytes /
+               glHandler->profilingData.totalWriteTime /
+               1024.0 /
+               1024.0 );
+    }
+    if (glHandler->profilingData.totalReadTime)
+    {
+        printf("Average read speed: %f MB/Sec\n",
+               glHandler->profilingData.totalReadBytes /
+               glHandler->profilingData.totalReadTime /
+               1024.0 /
+               1024.0);
+    }
+    if (glHandler->profilingData.totalBootCount)
+    {
+        printf("Average boot speed: %f sec\n",
+               glHandler->profilingData.totalBootTime /
+               glHandler->profilingData.totalBootCount);
+    }
+    return X_LINK_SUCCESS;
+}
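+
+/* Profiling usage sketch (illustrative):
+ *
+ *     XLinkProfStart();                 // reset counters, enable collection
+ *     XLinkWriteData(sid, buf, size);   // reads/writes now accumulate stats
+ *     XLinkProfStop();
+ *     XLinkProfPrint();                 // prints average throughput to stdout
+ */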
+/* end of file */
diff --git a/inference-engine/thirdparty/movidius/XLink/shared/XLink.h b/inference-engine/thirdparty/movidius/XLink/shared/XLink.h
new file mode 100644 (file)
index 0000000..cd6b6bf
--- /dev/null
@@ -0,0 +1,108 @@
+/*
+* Copyright 2018-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+///
+/// @file
+/// @brief     XLink API declarations
+///
+#ifndef _XLINK_H
+#define _XLINK_H
+#include "XLinkPublicDefines.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+// Set global common time out for all XLink operations.
+XLinkError_t XLinkSetCommonTimeOutMsec(unsigned int msec);
+
+// Set global device open time out for all XLink operations.
+XLinkError_t XLinkSetDeviceOpenTimeOutMsec(unsigned int msec);
+
+// Set global allocate graph time out for all XLink operations.
+XLinkError_t XLinkSetAllocateGraphTimeOutMsec(unsigned int msec);
+
+// Initializes XLink and scheduler
+XLinkError_t XLinkInitialize(XLinkGlobalHandler_t* handler);
+
+// Connects to specific device, starts dispatcher and pings remote
+XLinkError_t XLinkConnect(XLinkHandler_t* handler);
+
+// Opens a stream in the remote that can be written to by the local
+// Allocates stream_write_size (aligned up to 64 bytes) for that stream
+streamId_t XLinkOpenStream(linkId_t id, const char* name, int stream_write_size);
+
+// Close stream for any further data transfer
+// Stream will be deallocated when all pending data has been released
+XLinkError_t XLinkCloseStream(streamId_t streamId);
+
+// Currently useless
+XLinkError_t XLinkGetAvailableStreams(linkId_t id);
+
+/**
+ * @brief Return Myriad device name
+ * @param index Return device on index from suitable (matches pid argument) devices list
+ * @param pid   0x2485 for MX, 0x2150 for M2, 0 for any, -1 for any not booted 
+ */
+XLinkError_t XLinkGetDeviceName(int index, char* name, int nameSize, int pid);
+
+// Send a package to initiate the writing of data to a remote stream
+// Note that the actual size of the written data is ALIGN_UP(size, 64)
+XLinkError_t XLinkWriteData(streamId_t streamId, const uint8_t* buffer, int size);
+
+// Send a package to initiate the writing of data to a remote stream with specific timeout
+// Note that the actual size of the written data is ALIGN_UP(size, 64)
+XLinkError_t XLinkWriteDataWithTimeout(streamId_t streamId, const uint8_t* buffer, int size, unsigned int timeout);
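+// e.g. a 100-byte write actually transfers ALIGN_UP(100, 64) = 128 bytes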
+
+// Currently useless
+XLinkError_t XLinkAsyncWriteData();
+
+// Read data from local stream. Will only have something if it was written
+// to by the remote
+XLinkError_t XLinkReadData(streamId_t streamId, streamPacketDesc_t** packet);
+XLinkError_t XLinkReadDataWithTimeOut(streamId_t streamId, streamPacketDesc_t** packet, unsigned int timeout);
+
+// Release data from stream - This should be called after ReadData
+XLinkError_t XLinkReleaseData(streamId_t streamId);
+
+//Read fill level
+XLinkError_t XLinkGetFillLevel(streamId_t streamId, int isRemote, int* fillLevel);
+
+// Boot the remote (This is intended as an interface to boot the Myriad
+// from PC)
+XLinkError_t XLinkBootRemote(const char* deviceName, const char* binaryPath);
+
+// Reset the remote
+XLinkError_t XLinkResetRemote(linkId_t id);
+
+// Close all and release all memory
+XLinkError_t XLinkResetAll();
+
+// Profiling funcs - keeping them global for now
+XLinkError_t XLinkProfStart();
+XLinkError_t XLinkProfStop();
+XLinkError_t XLinkProfPrint();
+
+XLinkError_t XLinkWriteGraphData(streamId_t streamId, const uint8_t* buffer, int size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/inference-engine/thirdparty/movidius/XLink/shared/XLinkDispatcher.c b/inference-engine/thirdparty/movidius/XLink/shared/XLinkDispatcher.c
new file mode 100644 (file)
index 0000000..c0920f0
--- /dev/null
@@ -0,0 +1,895 @@
+/*
+* Copyright 2018-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+///
+/// @file
+///
+/// @brief     XLink event dispatcher implementation
+///
+#include "stdio.h"
+#include "stdint.h"
+#include "stdlib.h"
+#include "string.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#if (defined(_WIN32) || defined(_WIN64))
+#include "win_pthread.h"
+#include "win_semaphore.h"
+#else
+#include <pthread.h>
+#include <semaphore.h>
+#endif
+#include "XLinkDispatcher.h"
+#include "XLinkPrivateDefines.h"
+#include "XLink.h"
+
+#define MVLOG_UNIT_NAME xLink
+#include "mvLog.h"
+
+typedef enum {
+    EVENT_ALLOCATED,
+    EVENT_PENDING,
+    EVENT_BLOCKED,
+    EVENT_READY,
+    EVENT_SERVED,
+} xLinkEventState_t;
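+
+// Slot lifecycle (as used below): a queue slot starts as EVENT_SERVED (free),
+// becomes EVENT_ALLOCATED when an event is queued, then either EVENT_PENDING
+// (a local request awaiting its remote response), or EVENT_BLOCKED and later
+// EVENT_READY once unblocked, and finally EVENT_SERVED again when completed.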
+
+typedef struct xLinkEventPriv_t {
+    xLinkEvent_t packet;
+    xLinkEventState_t isServed;
+    xLinkEventOrigin_t origin;
+    sem_t* sem;
+    void* data;
+    xLinkEvent_t * retEv;
+    uint32_t pad;
+} xLinkEventPriv_t;
+
+typedef struct {
+    sem_t sem;
+    pthread_t threadId;
+    int refs;
+} localSem_t;
+
+typedef struct{
+    xLinkEventPriv_t* end;
+    xLinkEventPriv_t* base;
+
+    xLinkEventPriv_t* curProc;
+    xLinkEventPriv_t* cur;
+    __attribute__((aligned(64))) xLinkEventPriv_t q[MAX_EVENTS];
+
+}eventQueueHandler_t;
+/**
+ * @brief Scheduler for each device
+ */
+typedef struct {
+    void* xLinkFD; //will be device handler
+    int schedulerId;
+
+    sem_t addEventSem;
+    sem_t notifyDispatcherSem;
+    volatile uint32_t resetXLink;
+    uint32_t semaphores;
+    pthread_t xLinkThreadId;
+
+    eventQueueHandler_t lQueue; //local queue
+    eventQueueHandler_t rQueue; //remote queue
+    localSem_t eventSemaphores[MAXIMUM_SEMAPHORES];
+} xLinkSchedulerState_t;
+
+
+#define CIRCULAR_INCREMENT(x, maxVal, base) \
+    { \
+        x++; \
+        if (x == maxVal) \
+            x = base; \
+    }
+// avoid problems with unsigned arithmetic: first compare, then assign the new value
+#define CIRCULAR_DECREMENT(x, maxVal, base) \
+{ \
+    if (x == base) \
+        x = maxVal - 1; \
+    else \
+        x--; \
+}
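+
+// Example: with base == q and maxVal == q + MAX_EVENTS, CIRCULAR_INCREMENT
+// advances a queue cursor one slot and wraps from the last slot back to q.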
+
+extern char* TypeToStr(int type);
+
+#if (defined(_WIN32) || defined(_WIN64))
+static void* __cdecl eventSchedulerRun(void* ctx);
+#else
+static void* eventSchedulerRun(void*);
+#endif
+//These will be common for all, Initialized only once
+struct dispatcherControlFunctions* glControlFunc;
+int numSchedulers;
+xLinkSchedulerState_t schedulerState[MAX_SCHEDULERS];
+sem_t addSchedulerSem;
+
+int pthread_t_compare(pthread_t a, pthread_t b)
+{
+#if (defined(_WIN32) || defined(_WIN64))
+    return (a.tid == b.tid);
+#else
+    return (a == b);
+#endif
+}
+
+static int unrefSem(sem_t* sem,  xLinkSchedulerState_t* curr) {
+    ASSERT_X_LINK(curr != NULL);
+    localSem_t* temp = curr->eventSemaphores;
+    while (temp < curr->eventSemaphores + MAXIMUM_SEMAPHORES) {
+        if (&temp->sem == sem) {
+            temp->refs--;
+            if (temp->refs == 0) {
+                curr->semaphores--;
+                ASSERT_X_LINK(sem_destroy(&temp->sem) != -1);
+                temp->refs = -1;
+            }
+            return 1;
+        }
+        temp++;
+    }
+    mvLog(MVLOG_WARN,"unrefSem : sem wasn't found\n");
+    return 0;
+}
+static sem_t* getCurrentSem(pthread_t threadId, xLinkSchedulerState_t* curr, int inc_ref)
+{
+    ASSERT_X_LINK_R(curr != NULL, NULL);
+
+    localSem_t* sem = curr->eventSemaphores;
+    while (sem < curr->eventSemaphores + MAXIMUM_SEMAPHORES) {
+        if (pthread_t_compare(sem->threadId, threadId) && sem->refs > 0) {
+            sem->refs += inc_ref;
+            return &sem->sem;
+        }
+        sem++;
+    }
+    return NULL;
+}
+
+static sem_t* createSem(xLinkSchedulerState_t* curr)
+{
+    ASSERT_X_LINK_R(curr != NULL, NULL);
+
+
+    sem_t* sem = getCurrentSem(pthread_self(), curr, 0);
+    if (sem) // it already exists, error
+        return NULL;
+    else
+    {
+        if (curr->semaphores < MAXIMUM_SEMAPHORES) {
+            localSem_t* temp = curr->eventSemaphores;
+            while (temp < curr->eventSemaphores + MAXIMUM_SEMAPHORES) {
+                if (temp->refs < 0) {
+                    sem = &temp->sem;
+                    if (temp->refs == -1) {
+                        if (sem_init(sem, 0, 0))
+                            perror("Can't create semaphore\n");
+                    }
+                    curr->semaphores++;
+                    temp->refs = 1;
+                    temp->threadId = pthread_self();
+
+                    break;
+                }
+                temp++;
+            }
+            if (!sem)
+                return NULL;
+        }
+        else
+            return NULL;
+        return sem;
+    }
+}
+
+#if (defined(_WIN32) || defined(_WIN64))
+static void* __cdecl eventReader(void* ctx)
+#else
+static void* eventReader(void* ctx)
+#endif
+{
+    xLinkSchedulerState_t *curr = (xLinkSchedulerState_t*)ctx;
+    ASSERT_X_LINK_R(curr, NULL);
+
+    xLinkEvent_t event = { 0 };
+    event.header.id = -1;
+    event.xLinkFD = curr->xLinkFD;
+
+    mvLog(MVLOG_INFO,"eventReader started");
+
+    while (!curr->resetXLink) {
+        int sc = glControlFunc->eventReceive(&event);
+        mvLog(MVLOG_DEBUG,"Reading %s (scheduler %d, fd %p, event id %d, event stream_id %d, event size %d)\n",
+            TypeToStr(event.header.type), curr->schedulerId, event.xLinkFD, event.header.id, event.header.streamId, event.header.size);
+
+        if (event.header.type == XLINK_RESET_RESP) {
+            curr->resetXLink = 1;
+            mvLog(MVLOG_INFO,"eventReader stopped");
+            break;
+        }
+
+        if (sc) {
+            if (sem_post(&curr->notifyDispatcherSem)) {
+                mvLog(MVLOG_ERROR,"can't post semaphore\n"); // stop eventSchedulerRun thread
+            }
+            mvLog(MVLOG_ERROR,"eventReader stopped");
+            break;
+        }
+    }
+
+    return 0;
+}
+
+
+
+static int isEventTypeRequest(xLinkEventPriv_t* event)
+{
+    if (event->packet.header.type < XLINK_REQUEST_LAST)
+        return 1;
+    else
+        return 0;
+}
+
+static void markEventBlocked(xLinkEventPriv_t* event)
+{
+    event->isServed = EVENT_BLOCKED;
+}
+
+static void markEventReady(xLinkEventPriv_t* event)
+{
+    event->isServed = EVENT_READY;
+}
+
+static void markEventServed(xLinkEventPriv_t* event)
+{
+    if(event->retEv){
+        // the xLinkEventPriv_t slot pointed by "event" will be
+        // recycled as soon as we mark it as EVENT_SERVED,
+        // so before that, we copy the result event into XLink API layer
+        *(event->retEv) = event->packet;
+    }
+    if(event->sem){
+        if (sem_post(event->sem)) {
+            mvLog(MVLOG_ERROR,"can't post semaphore\n");
+        }
+    }
+    event->isServed = EVENT_SERVED;
+}
+
+
+static int dispatcherRequestServe(xLinkEventPriv_t * event, xLinkSchedulerState_t* curr){
+    ASSERT_X_LINK(curr != NULL);
+    ASSERT_X_LINK(isEventTypeRequest(event));
+    xLinkEventHeader_t *header = &event->packet.header;
+    if (header->flags.bitField.block){ //block is requested
+        markEventBlocked(event);
+    }else if(header->flags.bitField.localServe == 1 ||
+             (header->flags.bitField.ack == 0
+             && header->flags.bitField.nack == 1)){ //this event is served locally, or it is failed
+        markEventServed(event);
+    }else if (header->flags.bitField.ack == 1
+              && header->flags.bitField.nack == 0){
+        event->isServed = EVENT_PENDING;
+        mvLog(MVLOG_DEBUG,"------------------------UNserved %s\n",
+              TypeToStr(event->packet.header.type));
+    }else{
+        ASSERT_X_LINK(0);
+    }
+    return 0;
+}
+
+
+static int dispatcherResponseServe(xLinkEventPriv_t * event, xLinkSchedulerState_t* curr)
+{
+    int i = 0;
+    ASSERT_X_LINK(curr != NULL);
+    ASSERT_X_LINK(!isEventTypeRequest(event));
+    for (i = 0; i < MAX_EVENTS; i++)
+    {
+        xLinkEventHeader_t *header = &curr->lQueue.q[i].packet.header;
+        xLinkEventHeader_t *evHeader = &event->packet.header;
+
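+        // a response type maps back to its request: response enum values follow
+        // the requests, offset by XLINK_REQUEST_LAST + 1, e.g.
+        // XLINK_WRITE_RESP - XLINK_REQUEST_LAST - 1 == XLINK_WRITE_REQ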
+        if (curr->lQueue.q[i].isServed == EVENT_PENDING &&
+                        header->id == evHeader->id &&
+                        header->type == evHeader->type - XLINK_REQUEST_LAST - 1)
+        {
+            mvLog(MVLOG_DEBUG,"----------------------ISserved %s\n",
+                    TypeToStr(header->type));
+            //propagate back flags
+            header->flags = evHeader->flags;
+            markEventServed(&curr->lQueue.q[i]);
+            break;
+        }
+    }
+    if (i == MAX_EVENTS) {
+        mvLog(MVLOG_FATAL,"no request for this response: %s %d %d\n", TypeToStr(event->packet.header.type), event->origin, event->packet.header.id);
+        printf("#### (i == MAX_EVENTS) %s %d %d\n", TypeToStr(event->packet.header.type), event->origin, (int)event->packet.header.id);
+        for (i = 0; i < MAX_EVENTS; i++)
+        {
+            xLinkEventHeader_t *header = &curr->lQueue.q[i].packet.header;
+
+            printf("%d) header->id %i, header->type %s(%i), curr->lQueue.q[i].isServed %i, EVENT_PENDING %i\n", i, (int)header->id
+                     , TypeToStr(header->type), header->type, curr->lQueue.q[i].isServed, EVENT_PENDING);
+
+        }
+        ASSERT_X_LINK(0);
+    }
+    return 0;
+}
+
+static inline xLinkEventPriv_t* getNextElementWithState(xLinkEventPriv_t* base, xLinkEventPriv_t* end,
+                                                        xLinkEventPriv_t* start, xLinkEventState_t state){
+    xLinkEventPriv_t* tmp = start;
+    while (start->isServed != state){
+        CIRCULAR_INCREMENT(start, end, base);
+        if(tmp == start){
+            break;
+        }
+    }
+    if(start->isServed == state){
+        return start;
+    }else{
+        return NULL;
+    }
+}
+
+static xLinkEventPriv_t* searchForReadyEvent(xLinkSchedulerState_t* curr)
+{
+    ASSERT_X_LINK_R(curr != NULL, NULL);
+    xLinkEventPriv_t* ev = NULL;
+
+    ev = getNextElementWithState(curr->lQueue.base, curr->lQueue.end, curr->lQueue.base, EVENT_READY);
+    if(ev){
+        mvLog(MVLOG_DEBUG,"ready %s %d \n",
+              TypeToStr((int)ev->packet.header.type),
+              (int)ev->packet.header.id);
+    }
+    return ev;
+}
+
+static xLinkEventPriv_t* getNextQueueElemToProc(eventQueueHandler_t *q ){
+    xLinkEventPriv_t* event = NULL;
+    event = getNextElementWithState(q->base, q->end, q->curProc, EVENT_ALLOCATED);
+    if(event != NULL) {
+        q->curProc = event;
+        CIRCULAR_INCREMENT(q->curProc, q->end, q->base);
+    }
+    return event;
+}
+
+/**
+ * @brief Add event to Queue
+ * @note Called from dispatcherAddEvent
+ */
+static xLinkEvent_t* addNextQueueElemToProc(xLinkSchedulerState_t* curr,
+                                            eventQueueHandler_t *q, xLinkEvent_t* event,
+                                            sem_t* sem, xLinkEventOrigin_t o){
+    xLinkEvent_t* ev;
+    xLinkEventPriv_t* eventP = getNextElementWithState(q->base, q->end, q->cur, EVENT_SERVED);
+    if (eventP == NULL) {
+        mvLog(MVLOG_ERROR, "Can not get next element");
+        return NULL;
+    }
+    mvLog(MVLOG_DEBUG, "Received event %s %d", TypeToStr(event->header.type), o);
+    ev = &eventP->packet;
+    if (eventP->sem) {
+        if ((XLinkError_t)unrefSem(eventP->sem,  curr) == X_LINK_ERROR) {
+            mvLog(MVLOG_WARN, "Failed to unref sem");
+        }
+    }
+
+    eventP->sem = sem;
+    eventP->packet = *event;
+    eventP->origin = o;
+    if (o == EVENT_LOCAL) {
+        // buffer provided by the XLink API caller for returning the final result
+        eventP->retEv = event;
+    }else{
+        eventP->retEv = NULL;
+    }
+    // Mark eventP as ALLOCATED to prevent it from being allocated again
+    eventP->isServed = EVENT_ALLOCATED;
+    q->cur = eventP;
+    CIRCULAR_INCREMENT(q->cur, q->end, q->base);
+    return ev;
+}
+
+static xLinkEventPriv_t* dispatcherGetNextEvent(xLinkSchedulerState_t* curr)
+{
+    ASSERT_X_LINK_R(curr != NULL, NULL);
+
+    xLinkEventPriv_t* event = NULL;
+    event = searchForReadyEvent(curr);
+    if (event) {
+        return event;
+    }
+    if (XLinkWaitSem(&curr->notifyDispatcherSem)) {
+        mvLog(MVLOG_ERROR,"can't post semaphore\n");
+        return NULL;
+    }
+    event = getNextQueueElemToProc(&curr->lQueue);
+    if (event) {
+        return event;
+    }
+    event = getNextQueueElemToProc(&curr->rQueue);
+    return event;
+}
+
+static pthread_mutex_t reset_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static int isAvailableScheduler(xLinkSchedulerState_t* curr)
+{
+    if (curr->schedulerId == -1) {
+        mvLog(MVLOG_WARN,"Scheduler has already been reset or cleaned");
+        return 0; // already reset
+    }
+    return 1;
+}
+
+static void closeDeviceFdAndResetScheduler(xLinkSchedulerState_t* curr)
+{
+
+    mvLog(MVLOG_INFO, "Dispatcher Cleaning...");
+    glControlFunc->closeDeviceFd(curr->xLinkFD);
+    curr->schedulerId = -1;
+    curr->resetXLink = 1;
+    sem_destroy(&curr->addEventSem);
+    sem_destroy(&curr->notifyDispatcherSem);
+    localSem_t* temp = curr->eventSemaphores;
+    while (temp < curr->eventSemaphores + MAXIMUM_SEMAPHORES) {
+        // unblock potentially blocked event semaphores
+        sem_post(&temp->sem);
+        sem_destroy(&temp->sem);
+        temp->refs = -1;
+        temp++;
+    }
+    numSchedulers--;
+    mvLog(MVLOG_INFO,"Cleaning Successfully\n");
+
+}
+
+
+static int dispatcherReset(xLinkSchedulerState_t* curr)
+{
+    ASSERT_X_LINK(curr != NULL);
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&reset_mutex), 1);
+
+    if(!isAvailableScheduler(curr)) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&reset_mutex));
+        return 1;
+    }
+
+    mvLog(MVLOG_INFO, "Resetting...");
+
+    glControlFunc->closeLink(curr->xLinkFD);
+
+    // post notifyDispatcherSem once more so the dispatcher can fetch a NULL event instead of blocking
+    if (sem_post(&curr->notifyDispatcherSem)) {
+        mvLog(MVLOG_ERROR,"can't post semaphore\n"); //to allow us to get a NULL event
+    }
+
+    xLinkEventPriv_t* event = dispatcherGetNextEvent(curr);
+    while (event != NULL) {
+        mvLog(MVLOG_INFO, "dropped event is %s, status %d\n",
+              TypeToStr(event->packet.header.type), event->isServed);
+        // although this event will never be executed, mark it as served (without success);
+        // the caller will be informed and the internal event memory slot will be de-allocated
+        markEventServed(event);
+        event = dispatcherGetNextEvent(curr);
+    }
+
+    event = getNextElementWithState(curr->lQueue.base, curr->lQueue.end, curr->lQueue.base, EVENT_PENDING);
+    while (event != NULL) {
+        mvLog(MVLOG_INFO,"Pending event is %s, size is %d, Mark it served\n", TypeToStr(event->packet.header.type), event->packet.header.size);
+        markEventServed(event);
+        event = getNextElementWithState(curr->lQueue.base, curr->lQueue.end, curr->lQueue.base, EVENT_PENDING);
+    }
+    closeDeviceFdAndResetScheduler(curr);
+    CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&reset_mutex));
+    return 0;
+}
+#if (defined(_WIN32) || defined(_WIN64))
+static void* __cdecl eventSchedulerRun(void* ctx)
+#else
+static void* eventSchedulerRun(void* ctx)
+#endif
+{
+    int schedulerId = *((int*) ctx);
+    mvLog(MVLOG_DEBUG,"%s() schedulerId %d\n", __func__, schedulerId);
+    ASSERT_X_LINK_R(schedulerId < MAX_SCHEDULERS, NULL);
+
+    xLinkSchedulerState_t* curr = &schedulerState[schedulerId];
+    /* Create a thread for the reader.
+       It will notify the dispatcher of any incoming packets. */
+    pthread_t readerThreadId;
+    pthread_attr_t attr;
+    int sc;
+    int res;
+    if (pthread_attr_init(&attr) !=0) {
+        mvLog(MVLOG_ERROR,"pthread_attr_init error");
+        return NULL;
+    }
+
+    sc = pthread_create(&readerThreadId, &attr, eventReader, curr);
+    if (sc) {
+        mvLog(MVLOG_ERROR, "Thread creation failed");
+        if (pthread_attr_destroy(&attr) != 0) {
+            perror("Thread attr destroy failed\n");
+        }
+        return NULL;
+    }
+    sc = pthread_attr_destroy(&attr);
+    if (sc) {
+        mvLog(MVLOG_WARN, "Thread attr destroy failed");
+    }
+    xLinkEventPriv_t* event;
+    xLinkEventPriv_t response;
+
+    mvLog(MVLOG_INFO,"Scheduler thread started");
+
+    while (!curr->resetXLink) {
+        event = dispatcherGetNextEvent(curr);
+        if (event == NULL) {
+            break;
+        }
+
+        ASSERT_X_LINK_R(event->packet.xLinkFD == curr->xLinkFD, NULL);
+        getRespFunction getResp;
+        xLinkEvent_t* toSend;
+
+        if (event->origin == EVENT_LOCAL){
+            getResp = glControlFunc->localGetResponse;
+            toSend = &event->packet;
+        }else{
+            getResp = glControlFunc->remoteGetResponse;
+            toSend = &response.packet;
+        }
+
+        res = getResp(&event->packet, &response.packet);
+        if (isEventTypeRequest(event)){
+            if (event->origin == EVENT_LOCAL){ //we need to do this for locals only
+                dispatcherRequestServe(event, curr);
+            }
+            // For PCIe, and when connecting to an already booted device, don't send a reset request
+
+            if (res == 0 && event->packet.header.flags.bitField.localServe == 0){
+                // FIXME We shouldn't send a reset request for PCIE or when the "NO_BOOT" cmake option is enabled.
+                //  Also, we can't simply terminate the eventReader thread, as WinPthread has no suitable function
+                //  for such an emergency exit, so we pretend it is a ping request, which lets us close the
+                //  eventReader thread correctly
+#if defined(USE_PCIE) || defined(NO_BOOT)
+                if (toSend->header.type == XLINK_RESET_REQ) {
+                    toSend->header.type = XLINK_PING_REQ;
+                    curr->resetXLink = 1;
+                    mvLog(MVLOG_INFO, "Request for reboot not sent");
+                }
+#endif
+                if (glControlFunc->eventSend(toSend) != 0) {
+                    mvLog(MVLOG_ERROR, "Event sending failed");
+                }
+            }
+        } else {
+            if (event->origin == EVENT_REMOTE){ // match remote response with the local request
+                dispatcherResponseServe(event, curr);
+            }
+        }
+
+        //TODO: dispatcher shouldn't know about this packet. Seems to be easily move-able to protocol
+        if (event->packet.header.type == XLINK_RESET_REQ) {
+            curr->resetXLink = 1;
+        }
+
+        // remote event is served in one round
+        if (event->origin == EVENT_REMOTE){
+            event->isServed = EVENT_SERVED;
+        }
+    }
+
+    sc = pthread_join(readerThreadId, NULL);
+    if (sc) {
+        mvLog(MVLOG_ERROR, "Waiting for thread failed");
+    }
+
+    if (dispatcherReset(curr) != 0) {
+        mvLog(MVLOG_WARN, "Failed to reset");
+    }
+
+    if (curr->resetXLink != 1) {
+        mvLog(MVLOG_ERROR,"Scheduler thread stopped");
+    } else {
+        mvLog(MVLOG_INFO,"Scheduler thread stopped");
+    }
+
+    return NULL;
+}
+
+static int createUniqueID()
+{
+    static int id = 0xa;
+    return id++;
+}
+
+static xLinkSchedulerState_t* findCorrespondingScheduler(void* xLinkFD)
+{
+    int i;
+    if (xLinkFD == NULL) { //in case of myriad there should be one scheduler
+        if (numSchedulers == 1)
+            return &schedulerState[0];
+        else
+            return NULL;
+    }
+    for (i=0; i < MAX_SCHEDULERS; i++)
+        if (schedulerState[i].schedulerId != -1 &&
+            schedulerState[i].xLinkFD == xLinkFD)
+            return &schedulerState[i];
+
+    return NULL;
+}
+///////////////// External Interface //////////////////////////
+/*Adds a new event with parameters and returns event id*/
+xLinkEvent_t* dispatcherAddEvent(xLinkEventOrigin_t origin, xLinkEvent_t *event)
+{
+    xLinkSchedulerState_t* curr = findCorrespondingScheduler(event->xLinkFD);
+    ASSERT_X_LINK_R(curr != NULL, NULL);
+
+    if(curr->resetXLink) {
+        return NULL;
+    }
+
+    mvLog(MVLOG_DEBUG, "Receiving event %s %d\n", TypeToStr(event->header.type), origin);
+    if (XLinkWaitSem(&curr->addEventSem)) {
+        mvLog(MVLOG_ERROR,"can't wait semaphore\n");
+        return NULL;
+    }
+
+    sem_t *sem = NULL;
+    xLinkEvent_t* ev;
+    if (origin == EVENT_LOCAL) {
+        event->header.id = createUniqueID();
+        sem = getCurrentSem(pthread_self(), curr, 1);
+        if (!sem) {
+            sem = createSem(curr);
+        }
+        if (!sem) {
+            mvLog(MVLOG_WARN,"No more semaphores. Increase XLink or OS resources\n");
+            if (sem_post(&curr->addEventSem)) {
+                mvLog(MVLOG_ERROR,"can't post semaphore\n");
+            }
+            return NULL;
+        }
+        event->header.flags.raw = 0;
+        event->header.flags.bitField.ack = 1;
+        ev = addNextQueueElemToProc(curr, &curr->lQueue, event, sem, origin);
+    } else {
+        ev = addNextQueueElemToProc(curr, &curr->rQueue, event, NULL, origin);
+    }
+    if (sem_post(&curr->addEventSem)) {
+        mvLog(MVLOG_ERROR,"can't post semaphore\n");
+    }
+    if (sem_post(&curr->notifyDispatcherSem)) {
+        mvLog(MVLOG_ERROR, "can't post semaphore\n");
+    }
+    return ev;
+}
+
+int dispatcherWaitEventComplete(void* xLinkFD, unsigned int timeout)
+{
+    xLinkSchedulerState_t* curr = findCorrespondingScheduler(xLinkFD);
+    ASSERT_X_LINK(curr != NULL);
+
+    sem_t* id = getCurrentSem(pthread_self(), curr, 0);
+    if (id == NULL) {
+        return -1;
+    }
+
+    int rc = XLinkWaitSemUserMode(id, timeout);
+
+#if !defined(USE_PCIE)
+    if (rc) {
+        xLinkEvent_t event = {0};
+        event.header.type = XLINK_RESET_REQ;
+        event.xLinkFD = xLinkFD;
+        mvLog(MVLOG_ERROR,"waiting is timeout, sending reset remote event");
+        dispatcherAddEvent(EVENT_LOCAL, &event);
+        id = getCurrentSem(pthread_self(), curr, 0);
+        if (id == NULL || XLinkWaitSemUserMode(id, timeout)) {
+            dispatcherReset(curr);
+        }
+    }
+#endif
+
+    return rc;
+}
+
+int dispatcherUnblockEvent(eventId_t id, xLinkEventType_t type, streamId_t stream, void* xLinkFD)
+{
+    xLinkSchedulerState_t* curr = findCorrespondingScheduler(xLinkFD);
+    ASSERT_X_LINK(curr != NULL);
+
+    mvLog(MVLOG_DEBUG,"unblock\n");
+    xLinkEventPriv_t* blockedEvent;
+    for (blockedEvent = curr->lQueue.q;
+         blockedEvent < curr->lQueue.q + MAX_EVENTS;
+         blockedEvent++)
+    {
+        if (blockedEvent->isServed == EVENT_BLOCKED &&
+            ((blockedEvent->packet.header.id == id || id == -1)
+            && blockedEvent->packet.header.type == type
+            && blockedEvent->packet.header.streamId == stream))
+        {
+            mvLog(MVLOG_DEBUG,"unblocked**************** %d %s\n",
+                  (int)blockedEvent->packet.header.id,
+                  TypeToStr((int)blockedEvent->packet.header.type));
+            markEventReady(blockedEvent);
+            return 1;
+        } else {
+            mvLog(MVLOG_DEBUG,"%d %s\n",
+                  (int)blockedEvent->packet.header.id,
+                  TypeToStr((int)blockedEvent->packet.header.type));
+        }
+    }
+    return 0;
+}
+
+int findAvailableScheduler()
+{
+    int i;
+    for (i = 0; i < MAX_SCHEDULERS; i++)
+        if (schedulerState[i].schedulerId == -1)
+            return i;
+    return -1;
+}
+
+/**
+ * Initialize scheduler for device
+ */
+int dispatcherStart(void* fd)
+{
+    if (fd == NULL) {
+        mvLog(MVLOG_ERROR, "Invalid device filedescriptor");
+        return -1;
+    }
+
+    pthread_attr_t attr;
+    int eventIdx;
+    if (numSchedulers >= MAX_SCHEDULERS)
+    {
+        mvLog(MVLOG_ERROR,"Max number Schedulers reached!\n");
+        return -1;
+    }
+
+    int idx = findAvailableScheduler();
+    if (idx < 0) {
+        mvLog(MVLOG_ERROR,"Available sheduler not found");
+        return -1;
+    }
+
+    memset(&schedulerState[idx], 0, sizeof(xLinkSchedulerState_t));
+
+    schedulerState[idx].semaphores = 0;
+
+    schedulerState[idx].resetXLink = 0;
+    schedulerState[idx].xLinkFD = fd;
+    schedulerState[idx].schedulerId = idx;
+
+    schedulerState[idx].lQueue.cur = schedulerState[idx].lQueue.q;
+    schedulerState[idx].lQueue.curProc = schedulerState[idx].lQueue.q;
+    schedulerState[idx].lQueue.base = schedulerState[idx].lQueue.q;
+    schedulerState[idx].lQueue.end = &schedulerState[idx].lQueue.q[MAX_EVENTS];
+
+    schedulerState[idx].rQueue.cur = schedulerState[idx].rQueue.q;
+    schedulerState[idx].rQueue.curProc = schedulerState[idx].rQueue.q;
+    schedulerState[idx].rQueue.base = schedulerState[idx].rQueue.q;
+    schedulerState[idx].rQueue.end = &schedulerState[idx].rQueue.q[MAX_EVENTS];
+
+    for (eventIdx = 0 ; eventIdx < MAX_EVENTS; eventIdx++)
+    {
+        schedulerState[idx].rQueue.q[eventIdx].isServed = EVENT_SERVED;
+        schedulerState[idx].lQueue.q[eventIdx].isServed = EVENT_SERVED;
+    }
+
+    if (sem_init(&schedulerState[idx].addEventSem, 0, 1)) {
+        perror("Can't create semaphore\n");
+        return -1;
+    }
+    if (sem_init(&schedulerState[idx].notifyDispatcherSem, 0, 0)) {
+        perror("Can't create semaphore\n");
+        return -1;
+    }
+    localSem_t* temp = schedulerState[idx].eventSemaphores;
+    while (temp < schedulerState[idx].eventSemaphores + MAXIMUM_SEMAPHORES) {
+        temp->refs = -1;
+        temp++;
+    }
+    if (pthread_attr_init(&attr) != 0) {
+        mvLog(MVLOG_ERROR,"pthread_attr_init error");
+        return -1;
+    }
+
+    XLinkWaitSem(&addSchedulerSem);
+    mvLog(MVLOG_DEBUG,"%s() starting a new thread - schedulerId %d \n", __func__, idx);
+    int sc = pthread_create(&schedulerState[idx].xLinkThreadId,
+                            &attr,
+                            eventSchedulerRun,
+                            (void*)&schedulerState[idx].schedulerId);
+    if (sc) {
+        mvLog(MVLOG_ERROR,"Thread creation failed with error: %d", sc);
+        if (pthread_attr_destroy(&attr) != 0) {
+            perror("Thread attr destroy failed\n");
+        }
+        return -1;
+    }
+    pthread_detach(schedulerState[idx].xLinkThreadId);
+    numSchedulers++;
+
+    sc = pthread_attr_destroy(&attr);
+    if (sc) {
+        perror("Thread attr destroy failed");
+    }
+
+    sem_post(&addSchedulerSem);
+
+    return 0;
+}
+
+/**
+ * @brief Initialize dispatcher functions and reset all schedulers
+ */
+int dispatcherInitialize(struct dispatcherControlFunctions* controlFunc) {
+    int i;
+    if (!controlFunc ||
+        !controlFunc->eventReceive ||
+        !controlFunc->eventSend ||
+        !controlFunc->localGetResponse ||
+        !controlFunc->remoteGetResponse)
+    {
+        return -1;
+    }
+
+    glControlFunc = controlFunc;
+    if (sem_init(&addSchedulerSem, 0, 1)) {
+        perror("Can't create semaphore\n");
+    }
+    numSchedulers = 0;
+    for (i = 0; i < MAX_SCHEDULERS; i++){
+        schedulerState[i].schedulerId = -1;
+    }
+    return 0;
+}
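+
+/* Initialization order sketch (as used by XLink.c): dispatcherInitialize()
+ * is called once with the control-function table, then dispatcherStart(fd)
+ * is called per connected device to spawn that device's scheduler thread. */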
+
+int dispatcherClean(void* xLinkFD)
+{
+    xLinkSchedulerState_t* curr = findCorrespondingScheduler(xLinkFD);
+    ASSERT_X_LINK(curr != NULL);
+
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&reset_mutex), 1);
+    if(!isAvailableScheduler(curr)) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&reset_mutex));
+        return 1;
+    }
+    mvLog(MVLOG_INFO, "Start Clean Dispatcher...");
+    closeDeviceFdAndResetScheduler(curr);
+    mvLog(MVLOG_INFO, "Clean Dispatcher Successfully...");
+    CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&reset_mutex));
+    return 0;
+}
+
+
+
+/* end of file */
diff --git a/inference-engine/thirdparty/movidius/XLink/shared/XLinkDispatcher.h b/inference-engine/thirdparty/movidius/XLink/shared/XLinkDispatcher.h
new file mode 100644 (file)
index 0000000..ec61eee
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+* Copyright 2018-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+///
+/// @file
+///
+/// @brief     XLink event dispatcher interface
+///
+#ifndef _XLINKDISPATCHER_H
+#define _XLINKDISPATCHER_H
+#define _XLINK_ENABLE_PRIVATE_INCLUDE_
+#include "XLinkPrivateDefines.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+typedef int (*getRespFunction) (xLinkEvent_t*, xLinkEvent_t*);
+///Adds a new event with parameters and returns event.header.id
+xLinkEvent_t* dispatcherAddEvent(xLinkEventOrigin_t origin,
+                                    xLinkEvent_t *event);
+
+int dispatcherWaitEventComplete(void* xlinkFD, unsigned int timeout);
+int dispatcherUnblockEvent(eventId_t id,
+                            xLinkEventType_t type,
+                            streamId_t stream,
+                            void* xlinkFD);
+
+struct dispatcherControlFunctions {
+    int (*eventSend) (xLinkEvent_t*);
+    int (*eventReceive) (xLinkEvent_t*);
+    getRespFunction localGetResponse;
+    getRespFunction remoteGetResponse;
+    void (*closeLink) (void* fd);
+    void (*closeDeviceFd) (void* fd);
+};
+
+int dispatcherInitialize(struct dispatcherControlFunctions* controlFunc);
+int dispatcherStart(void* fd);
+int dispatcherClean(void* xLinkFD);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/inference-engine/thirdparty/movidius/XLink/shared/XLinkPlatform.h b/inference-engine/thirdparty/movidius/XLink/shared/XLinkPlatform.h
new file mode 100644 (file)
index 0000000..287756d
--- /dev/null
@@ -0,0 +1,80 @@
+/*
+* Copyright 2018-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+#ifndef _XLINK_LINKPLATFORM_H
+#define _XLINK_LINKPLATFORM_H
+#include <stdint.h>
+#include "XLinkPublicDefines.h"
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#define MAX_POOLS_ALLOC 32
+#define PACKET_LENGTH (64*1024)
+
+#define MAX_LINKS 32
+
+int XLinkWrite(void* fd, void* data, int size, unsigned int timeout);
+int XLinkRead(void* fd, void* data, int size, unsigned int timeout);
+int XLinkPlatformConnect(const char* devPathRead,
+                           const char* devPathWrite, void** fd);
+int XLinkPlatformInit(XLinkProtocol_t protocol, int loglevel);
+
+/**
+ * @brief      Return Myriad device name on index
+ * @param[in]  index Index of device in list of all Myriad devices
+ * @param[out] name device name, which would be found
+ */
+int XLinkPlatformGetDeviceName(int index,
+                                char* name,
+                                int nameSize);
+
+/**
+ * @brief      Returning Myriad device suitable for the parameters
+ * @param[in]  index Device index in list of suitable (matches pid argument) devices
+ * @param[out] name device name, which would be found
+ * @param[in] pid  0x2485 for MX, 0x2150 for M2, 0 for any, -1 for any not booted
+ */
+int XLinkPlatformGetDeviceNameExtended(int index,
+                                char* name,
+                                int nameSize,
+                                int pid);
+
+int XLinkPlatformBootRemote(const char* deviceName,
+                                                       const char* binaryPath);
+int XLinkPlatformCloseRemote(void *fd);
+
+void* allocateData(uint32_t size, uint32_t alignment);
+void deallocateData(void* ptr,uint32_t size, uint32_t alignment);
+
+typedef enum xLinkPlatformErrorCode {
+    X_LINK_PLATFORM_SUCCESS = 0,
+    X_LINK_PLATFORM_DEVICE_NOT_FOUND = -1,
+    X_LINK_PLATFORM_ERROR = -2,
+    X_LINK_PLATFORM_TIMEOUT = -3,
+    X_LINK_PLATFORM_DRIVER_NOT_LOADED = -4
+} xLinkPlatformErrorCode_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+/* end of include file */
diff --git a/inference-engine/thirdparty/movidius/XLink/shared/XLinkPrivateDefines.h b/inference-engine/thirdparty/movidius/XLink/shared/XLinkPrivateDefines.h
new file mode 100644 (file)
index 0000000..594cbf3
--- /dev/null
@@ -0,0 +1,224 @@
+/*
+* Copyright 2018-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+///
+/// @file
+///
+/// @brief     XLink private definitions
+///
+#ifndef _XLINKPRIVATEDEFINES_H
+#define _XLINKPRIVATEDEFINES_H
+
+#ifdef _XLINK_ENABLE_PRIVATE_INCLUDE_
+
+#include <stdint.h>
+#if (defined(_WIN32) || defined(_WIN64))
+#include "win_semaphore.h"
+#else
+#include <semaphore.h>
+#endif
+#include <XLinkPublicDefines.h>
+#include "XLinkPlatform.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#define MAX_NAME_LENGTH 16
+
+// both the USB_VSC and non-USB_VSC configurations use the same header size
+#define HEADER_SIZE (64 - 12 - 8)
+
+#define MAXIMUM_SEMAPHORES 32
+#define __CACHE_LINE_SIZE 64
+
+#ifdef NDEBUG  // Release configuration
+    #ifndef __PC__
+        #define ASSERT_X_LINK(x)   if(!(x)) { exit(EXIT_FAILURE); }
+        #define ASSERT_X_LINK_R(x, r) ASSERT_X_LINK(x)
+    #else
+        #define ASSERT_X_LINK(x)   if(!(x)) { return X_LINK_ERROR; }
+        #define ASSERT_X_LINK_R(x, r)   if(!(x)) { return r; }
+    #endif
+#else   // Debug configuration
+    #ifndef __PC__
+        #define ASSERT_X_LINK(x)   if(!(x)) { fprintf(stderr, "%s:%d:\n Assertion Failed: %s\n", __FILE__, __LINE__, #x); exit(EXIT_FAILURE); }
+        #define ASSERT_X_LINK_R(x, r) ASSERT_X_LINK(x)
+    #else
+        #define ASSERT_X_LINK(x)   if(!(x)) { fprintf(stderr, "%s:%d:\n Assertion Failed: %s\n", __FILE__, __LINE__, #x); return X_LINK_ERROR; }
+        #define ASSERT_X_LINK_R(x, r)   if(!(x)) { fprintf(stderr, "%s:%d:\n Assertion Failed: %s\n", __FILE__, __LINE__, #x); return r; }
+    #endif
+#endif //  NDEBUG
+
+#ifndef CHECK_MUTEX_SUCCESS
+#define CHECK_MUTEX_SUCCESS(call)  {                                \
+    int error;                                                      \
+    if ((error = (call))) {                                         \
+      mvLog(MVLOG_ERROR, "%s failed with error: %d", #call, error); \
+    }                                                               \
+}
+#endif  // CHECK_MUTEX_SUCCESS
+
+#ifndef CHECK_MUTEX_SUCCESS_RC
+#define CHECK_MUTEX_SUCCESS_RC(call, rc)  {                         \
+    int error;                                                      \
+    if ((error = (call))) {                                         \
+      mvLog(MVLOG_ERROR, "%s failed with error: %d", #call, error); \
+      return rc;                                                    \
+    }                                                               \
+}
+#endif  // CHECK_MUTEX_SUCCESS_RC
+
+typedef int32_t eventId_t;
+
+/**
+ * @brief State for xLinkDesc_t
+ */
+typedef enum {
+    XLINK_NOT_INIT,
+    XLINK_UP,
+    XLINK_DOWN,
+} xLinkState_t;
+
+/**
+ * @brief Streams opened to device
+ */
+typedef struct{
+    char name[MAX_NAME_LENGTH];
+    streamId_t id;
+    void* fd;
+    uint32_t writeSize;
+    uint32_t readSize;  /* No read buffer is needed: the data lives on the
+                           remote side and is read directly into the requested buffer */
+    streamPacketDesc_t packets[XLINK_MAX_PACKETS_PER_STREAM];
+    uint32_t availablePackets;
+    uint32_t blockedPackets;
+
+    uint32_t firstPacket;
+    uint32_t firstPacketUnused;
+    uint32_t firstPacketFree;
+    uint32_t remoteFillLevel;
+    uint32_t localFillLevel;
+    uint32_t remoteFillPacketLevel;
+
+    uint32_t closeStreamInitiated;
+
+    sem_t sem;
+} streamDesc_t;
+
+/**
+ * @brief XLink primitive for each device
+ */
+typedef struct xLinkDesc_t {
+    // Monotonically increasing; never decremented.
+    int nextUniqueStreamId;
+    streamDesc_t availableStreams[XLINK_MAX_STREAMS];
+    xLinkState_t peerState;
+    void* fd;
+    linkId_t id;
+} xLinkDesc_t;
+
+
+// Events that come from the remote side
+typedef enum
+{
+    /*USB-PCIE related events*/
+    XLINK_WRITE_REQ,
+    XLINK_READ_REQ,
+    XLINK_READ_REL_REQ,
+    XLINK_CREATE_STREAM_REQ,
+    XLINK_CLOSE_STREAM_REQ,
+    XLINK_PING_REQ,
+    XLINK_RESET_REQ,
+    XLINK_REQUEST_LAST,
+    // Note: it is important to keep requests and responses separated
+    XLINK_WRITE_RESP,
+    XLINK_READ_RESP,
+    XLINK_READ_REL_RESP,
+    XLINK_CREATE_STREAM_RESP,
+    XLINK_CLOSE_STREAM_RESP,
+    XLINK_PING_RESP,
+    XLINK_RESET_RESP,
+    XLINK_RESP_LAST,
+
+    /*IPC related events*/
+    IPC_WRITE_REQ,
+    IPC_READ_REQ,
+    IPC_CREATE_STREAM_REQ,
+    IPC_CLOSE_STREAM_REQ,
+    //
+    IPC_WRITE_RESP,
+    IPC_READ_RESP,
+    IPC_CREATE_STREAM_RESP,
+    IPC_CLOSE_STREAM_RESP,
+} xLinkEventType_t;
+
+typedef enum
+{
+    EVENT_LOCAL,
+    EVENT_REMOTE,
+} xLinkEventOrigin_t;
+
+#define MAX_EVENTS 64
+
+#define MAX_SCHEDULERS MAX_LINKS
+
+typedef struct xLinkEventHeader_t{
+    eventId_t           id;
+    xLinkEventType_t    type;
+    char                streamName[MAX_NAME_LENGTH];
+    streamId_t          streamId;
+    uint32_t            size;
+    union{
+        uint32_t raw;
+        struct{
+            uint32_t ack : 1;
+            uint32_t nack : 1;
+            uint32_t block : 1;
+            uint32_t localServe : 1;
+            uint32_t terminate : 1;
+            uint32_t bufferFull : 1;
+            uint32_t sizeTooBig : 1;
+            uint32_t noSuchStream : 1;
+        }bitField;
+    }flags;
+}xLinkEventHeader_t;
+
+typedef struct xLinkEvent_t {
+    xLinkEventHeader_t header;
+    void* xLinkFD;
+    void* data;
+}xLinkEvent_t;
+
+int XLinkWaitSem(sem_t* sem);
+
+int XLinkWaitSemUserMode(sem_t* sem, unsigned int timeout);
+
+const char* XLinkErrorToStr(XLinkError_t rc);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /*_XLINK_ENABLE_PRIVATE_INCLUDE_ end*/
+#endif
+
+/* end of include file */
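
As a usage sketch for the mutex-check macros above (assuming mvLog() from mvLog.h is available in the translation unit, as the macros require, and using the error codes from XLinkPublicDefines.h):

    #include <pthread.h>

    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

    int guardedIncrement(int* counter) {
        /* Log and propagate X_LINK_ERROR if the lock cannot be taken */
        CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&m), X_LINK_ERROR);
        (*counter)++;
        /* Log only; unlock failures are not propagated here */
        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&m));
        return X_LINK_SUCCESS;
    }
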
diff --git a/inference-engine/thirdparty/movidius/XLink/shared/XLinkPublicDefines.h b/inference-engine/thirdparty/movidius/XLink/shared/XLinkPublicDefines.h
new file mode 100644 (file)
index 0000000..3f330fc
--- /dev/null
@@ -0,0 +1,103 @@
+/*
+* Copyright 2018-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+///
+/// @file
+///
+/// @brief     XLink public definitions (error codes, protocols, handles)
+///
+#ifndef _XLINKPUBLICDEFINES_H
+#define _XLINKPUBLICDEFINES_H
+#include <stdint.h>
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#define XLINK_MAX_STREAMS 32
+#define XLINK_MAX_PACKETS_PER_STREAM 64
+
+typedef enum{
+    X_LINK_SUCCESS = 0,
+    X_LINK_ALREADY_OPEN,
+    X_LINK_COMMUNICATION_NOT_OPEN,
+    X_LINK_COMMUNICATION_FAIL,
+    X_LINK_COMMUNICATION_UNKNOWN_ERROR,
+    X_LINK_DEVICE_NOT_FOUND,
+    X_LINK_TIMEOUT,
+    X_LINK_ERROR,
+    X_LINK_OUT_OF_MEMORY
+} XLinkError_t;
+
+typedef enum{
+    USB_VSC = 0,
+    USB_CDC,
+    PCIE,
+    IPC,
+    NMB_OF_PROTOCOLS
+} XLinkProtocol_t;
+
+#define USB_LINK_INVALID_FD  (-314)
+
+#define INVALID_STREAM_ID 0xDEADDEAD
+#define INVALID_STREAM_ID_OUT_OF_MEMORY 0xDEADFFFF
+#define INVALID_LINK_ID   0xFF
+
+typedef uint32_t streamId_t;
+typedef uint8_t linkId_t;
+
+
+typedef struct streamPacketDesc_t
+{
+    uint8_t* data;
+    uint32_t length;
+
+} streamPacketDesc_t;
+
+typedef struct XLinkProf_t
+{
+    float totalReadTime;
+    float totalWriteTime;
+    unsigned long totalReadBytes;
+    unsigned long totalWriteBytes;
+    unsigned long totalBootCount;
+    float totalBootTime;
+} XLinkProf_t;
+
+typedef struct XLinkGlobalHandler_t
+{
+    int loglevel;
+    int profEnable;
+    XLinkProtocol_t protocol;
+    XLinkProf_t profilingData;
+} XLinkGlobalHandler_t;
+
+typedef struct
+{
+    char* devicePath;
+    char* devicePath2;
+    linkId_t linkId;
+} XLinkHandler_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+/* end of include file */
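
A minimal sketch of preparing the global handler declared above; XLinkInitialize() itself is declared in XLink.h, so its mention here is an assumption based on the mvnc sources later in this patch:

    #include <string.h>
    #include "XLinkPublicDefines.h"

    static XLinkGlobalHandler_t ghandler;

    void setupLinkHandler(void) {
        memset(&ghandler, 0, sizeof(ghandler));
        ghandler.protocol = USB_VSC;  /* or PCIE, per XLinkProtocol_t above */
        ghandler.profEnable = 0;
        /* XLinkInitialize(&ghandler) is then called, as mvnc_api.c below does. */
    }
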
diff --git a/inference-engine/thirdparty/movidius/XLink/shared/XLinkVersion.h b/inference-engine/thirdparty/movidius/XLink/shared/XLinkVersion.h
new file mode 100644 (file)
index 0000000..cf87d97
--- /dev/null
@@ -0,0 +1,27 @@
+/*
+* Copyright 2018-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+///
+/// @file
+///
+/// @brief     XLink version macros
+///
+
+#define X_LINK_VERSION_MAJOR 1
+#define X_LINK_VERSION_MINOR 0
+#define X_LINK_VERSION_PATCH 0
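
The version macros have no string form; a small sketch of deriving one with standard preprocessor stringification (the helper macro names are illustrative):

    #define XLINK_STR_(x) #x
    #define XLINK_STR(x) XLINK_STR_(x)
    /* Expands to "1.0.0" with the values above */
    #define X_LINK_VERSION_STR XLINK_STR(X_LINK_VERSION_MAJOR) "." \
                               XLINK_STR(X_LINK_VERSION_MINOR) "." \
                               XLINK_STR(X_LINK_VERSION_PATCH)
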
diff --git a/inference-engine/thirdparty/movidius/mvnc/CMakeLists.txt b/inference-engine/thirdparty/movidius/mvnc/CMakeLists.txt
new file mode 100644 (file)
index 0000000..df0e0e0
--- /dev/null
@@ -0,0 +1,110 @@
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set(TARGET_NAME "mvnc")
+
+if(NOT WIN32)
+    find_package(Threads REQUIRED)
+
+    find_path(LIBUSB_INCLUDE_DIR NAMES libusb.h PATH_SUFFIXES "include" "libusb" "libusb-1.0")
+    find_library(LIBUSB_LIBRARY NAMES usb-1.0 PATH_SUFFIXES "lib")
+
+    if(NOT LIBUSB_INCLUDE_DIR OR NOT LIBUSB_LIBRARY)
+        message(FATAL_ERROR "libusb is required for the Myriad plugin build")
+    endif()
+endif()
+
+file(GLOB MVNC_SOURCES "include/*" "src/*")
+
+# The watchdog is disabled for the PCIe configuration
+if (NOT ENABLE_MYX_PCIE)
+    file(GLOB WATCHDOG_SOURCES "../watchdog/*")
+endif()
+
+# FIXME: WIN_PTHREAD also should be built as a library
+if(WIN32)
+    file(GLOB USB_WIN_SOURCES "../USB_WIN/*")
+    file(GLOB WIN_PTHREAD_SOURCES "../WinPthread/*")
+    # list(APPEND) takes the variable name, not its dereferenced value
+    list(APPEND MVNC_SOURCES ${USB_WIN_SOURCES} ${WIN_PTHREAD_SOURCES})
+endif()
+
+add_library(${TARGET_NAME} STATIC ${MVNC_SOURCES} ${WATCHDOG_SOURCES})
+
+target_include_directories(${TARGET_NAME}
+    PUBLIC
+        "include"
+    PRIVATE
+        "../watchdog")
+
+if(WIN32)
+    target_include_directories(${TARGET_NAME}
+            PRIVATE
+            "../USB_WIN"
+            "../WinPthread")
+endif()
+
+if(UNIX)
+    target_include_directories(${TARGET_NAME}
+        PRIVATE
+            "${LIBUSB_INCLUDE_DIR}")
+endif()
+
+target_compile_definitions(${TARGET_NAME}
+    PRIVATE
+        __PC__
+        HAVE_STRUCT_TIMESPEC
+        _CRT_SECURE_NO_WARNINGS)
+
+if (ENABLE_MYX_PCIE)
+    target_compile_definitions(${TARGET_NAME} PRIVATE USE_PCIE)
+else ()
+    target_compile_definitions(${TARGET_NAME} PRIVATE USE_USB_VSC)
+endif()
+
+if (ENABLE_MYRIAD_NO_BOOT)
+    target_compile_definitions(${TARGET_NAME} PRIVATE NO_BOOT)
+endif()
+
+if(NOT WIN32)
+    target_compile_options(${TARGET_NAME}
+        PRIVATE
+            -MMD
+            -MP
+            -Wformat
+            -Wformat-security
+            -Wall)
+    if(CMAKE_C_COMPILER_VERSION VERSION_GREATER 4.8)
+        target_compile_options(${TARGET_NAME}
+            PRIVATE
+                -fstack-protector-strong)
+    else()
+        target_compile_options(${TARGET_NAME}
+            PRIVATE
+                -fstack-protector)
+    endif()
+
+    # LINK_FLAGS is a single string, so pass the flags as one quoted value
+    set_property(TARGET ${TARGET_NAME}
+        PROPERTY LINK_FLAGS
+            "-z noexecstack -z relro -z now")
+endif()
+
+set_target_properties(${TARGET_NAME} PROPERTIES
+    POSITION_INDEPENDENT_CODE TRUE
+    COMPILE_PDB_NAME ${TARGET_NAME})
+
+
+target_link_libraries(${TARGET_NAME}
+        PRIVATE
+            XLink)
+
+if(NOT WIN32)
+    target_link_libraries(${TARGET_NAME}
+        PUBLIC
+            Threads::Threads
+            ${CMAKE_DL_LIBS}
+            ${LIBUSB_LIBRARY})
+endif()
+
diff --git a/inference-engine/thirdparty/movidius/mvnc/include/mvnc.h b/inference-engine/thirdparty/movidius/mvnc/include/mvnc.h
new file mode 100644 (file)
index 0000000..5d4622c
--- /dev/null
@@ -0,0 +1,308 @@
+#ifndef __NC_H_INCLUDED__
+#define __NC_H_INCLUDED__
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#define NC_MAX_NAME_SIZE 28
+#define NC_THERMAL_BUFFER_SIZE 100
+#define NC_DEBUG_BUFFER_SIZE   120
+
+#define NOMINMAX
+#define MVNC_EXPORT_API
+
+typedef enum {
+    NC_OK = 0,
+    NC_BUSY = -1,                     // Device is busy, retry later
+    NC_ERROR = -2,                    // Error communicating with the device
+    NC_OUT_OF_MEMORY = -3,            // Out of memory
+    NC_DEVICE_NOT_FOUND = -4,         // No device at the given index or name
+    NC_INVALID_PARAMETERS = -5,       // At least one of the given parameters is wrong
+    NC_TIMEOUT = -6,                  // Timeout in the communication with the device
+    NC_MVCMD_NOT_FOUND = -7,          // The file to boot Myriad was not found
+    NC_NOT_ALLOCATED = -8,            // The graph or device has been closed during the operation
+    NC_UNAUTHORIZED = -9,             // Unauthorized operation
+    NC_UNSUPPORTED_GRAPH_FILE = -10,  // The graph file version is not supported
+    NC_UNSUPPORTED_CONFIGURATION_FILE = -11, // The configuration file version is not supported
+    NC_UNSUPPORTED_FEATURE = -12,     // Not supported by this FW version
+    NC_MYRIAD_ERROR = -13,            // An error has been reported by the device
+                                      // use  NC_DEVICE_DEBUG_INFO or NC_GRAPH_DEBUG_INFO
+    NC_INVALID_DATA_LENGTH = -14,     // Invalid data length passed to a get/set option call
+    NC_INVALID_HANDLE = -15           // Invalid object handle
+} ncStatus_t;
+
+typedef enum {
+    NC_LOG_DEBUG = 0,   // debug and above (full verbosity)
+    NC_LOG_INFO,        // info and above
+    NC_LOG_WARN,        // warning and above
+    NC_LOG_ERROR,       // errors and above
+    NC_LOG_FATAL,       // fatal only
+} ncLogLevel_t;
+
+typedef enum {
+    NC_RW_LOG_LEVEL = 0,    // Log level, int, default NC_LOG_WARN
+    NC_RO_API_VERSION = 1,  // returns API version, array of unsigned int of size 4:
+                            // major.minor.hotfix.rc
+    NC_RW_COMMON_TIMEOUT_MSEC = 2,
+    NC_RW_DEVICE_OPEN_TIMEOUT_MSEC = 3,
+    NC_RW_ALLOC_GRAPH_TIMEOUT_MSEC = 4,
+    NC_RW_RESET_ALL = 9000,     // resetAll on initialize
+} ncGlobalOption_t;
+
+typedef enum {
+    NC_RO_GRAPH_STATE = 1000,           // Returns graph state, see ncGraphState_t
+    NC_RO_GRAPH_TIME_TAKEN = 1001,      // Return time taken for last inference (float *)
+    NC_RO_GRAPH_INPUT_COUNT = 1002,     // Returns number of inputs, size of array returned
+                                        // by NC_RO_GRAPH_INPUT_TENSOR_DESCRIPTORS, int
+    NC_RO_GRAPH_OUTPUT_COUNT = 1003,    // Returns number of outputs, size of array returned
+                                        // by NC_RO_GRAPH_OUTPUT_TENSOR_DESCRIPTORS, int
+    NC_RO_GRAPH_INPUT_TENSOR_DESCRIPTORS = 1004,  // Return a tensorDescriptor pointer array
+                                            // which describes the graph inputs in order.
+                                            // Can be used for fifo creation.
+                                            // The length of the array can be retrieved
+                                            // using the NC_RO_GRAPH_INPUT_COUNT option
+
+    NC_RO_GRAPH_OUTPUT_TENSOR_DESCRIPTORS = 1005, // Return a tensorDescriptor pointer
+                                            // array which describes the graph
+                                            // outputs in order. Can be used for
+                                            // fifo creation. The length of the
+                                            // array can be retrieved using the
+                                            // NC_RO_GRAPH_OUTPUT_COUNT option
+
+    NC_RO_GRAPH_DEBUG_INFO = 1006,          // Return debug info, string
+    NC_RO_GRAPH_NAME = 1007,                // Returns name of the graph, string
+    NC_RO_GRAPH_OPTION_CLASS_LIMIT = 1008,  // return the highest option class supported
+    NC_RO_GRAPH_VERSION = 1009,             // returns graph version, string
+    NC_RO_GRAPH_TIME_TAKEN_ARRAY_SIZE = 1011, // Return size of array for time taken option, int
+    NC_RO_GRAPH_BATCH_SIZE = 1012,           // returns batch size of loaded graph
+    NC_RW_GRAPH_EXECUTORS_NUM = 1110,
+} ncGraphOption_t;
+
+typedef enum {
+    NC_DEVICE_OPENED = 0,
+    NC_DEVICE_CLOSED = 1,
+    NC_DEVICE_FAILED = 2,
+    NC_DEVICE_RESETED = 3,
+} ncDeviceState_t;
+
+typedef enum {
+    NC_GRAPH_CREATED = 0,
+    NC_GRAPH_ALLOCATED = 1,
+    NC_GRAPH_WAITING_FOR_BUFFERS = 2,
+    NC_GRAPH_RUNNING = 3,
+    NC_GRAPH_DEALLOCATED = 4,
+} ncGraphState_t;
+
+typedef enum {
+    NC_FIFO_CREATED = 0,
+    NC_FIFO_ALLOCATED = 1,
+    NC_FIFO_DESTROYED = 2,
+    NC_FIFO_FAILED = 3,
+    NC_FIFO_DEALLOCATED = 4
+} ncFifoState_t;
+
+typedef enum {
+    NC_MA2450 = 0,
+    NC_MA2480 = 1,
+} ncDeviceHwVersion_t;
+
+typedef enum {
+    NC_RO_DEVICE_THERMAL_STATS = 2000,          // Return temperatures, float *, not for general use
+    NC_RO_DEVICE_THERMAL_THROTTLING_LEVEL = 2001,   // 1=TEMP_LIM_LOWER reached, 2=TEMP_LIM_HIGHER reached
+    NC_RO_DEVICE_STATE = 2002,                  // Returns device state, see ncDeviceState_t
+    NC_RO_DEVICE_CURRENT_MEMORY_USED = 2003,    // Returns current device memory usage
+    NC_RO_DEVICE_MEMORY_SIZE = 2004,            // Returns device memory size
+    NC_RO_DEVICE_MAX_FIFO_NUM = 2005,           // return the maximum number of fifos supported
+    NC_RO_DEVICE_ALLOCATED_FIFO_NUM = 2006,     // return the number of currently allocated fifos
+    NC_RO_DEVICE_MAX_GRAPH_NUM = 2007,          // return the maximum number of graphs supported
+    NC_RO_DEVICE_ALLOCATED_GRAPH_NUM = 2008,    //  return the number of currently allocated graphs
+    NC_RO_DEVICE_OPTION_CLASS_LIMIT = 2009,     //  return the highest option class supported
+    NC_RO_DEVICE_FW_VERSION = 2010,             // return device firmware version, array of unsigned int of size 4
+                                                //major.minor.hwtype.buildnumber
+    NC_RO_DEVICE_DEBUG_INFO = 2011,             // Return debug info, string, not supported yet
+    NC_RO_DEVICE_MVTENSOR_VERSION = 2012,       // returns mv tensor version, array of unsigned int of size 2
+                                                //major.minor
+    NC_RO_DEVICE_NAME = 2013,                   // returns device name as generated internally
+    NC_RO_DEVICE_MAX_EXECUTORS_NUM = 2014,      // Maximum number of executors per graph
+    NC_RO_DEVICE_HW_VERSION = 2015,             // returns HW version, enum
+    NC_RO_DEVICE_ID = 2016,                     // returns device id
+    NC_RO_DEVICE_PLATFORM = 2017,               // returns device platform (MyriadX, Myriad2)
+} ncDeviceOption_t;
+
+typedef enum {
+    UNKNOWN_PLATFORM = 0,
+    MYRIAD_2 = 2450,
+    MYRIAD_X = 2480,
+} ncDevicePlatform_t;
+
+
+typedef struct _devicePrivate_t devicePrivate_t;
+typedef struct _graphPrivate_t graphPrivate_t;
+typedef struct _fifoPrivate_t fifoPrivate_t;
+typedef struct _ncTensorDescriptorPrivate_t ncTensorDescriptorPrivate_t;
+
+struct ncFifoHandle_t {
+    // keep place for public data here
+    fifoPrivate_t* private_data;
+};
+
+struct ncGraphHandle_t {
+    // keep place for public data here
+    graphPrivate_t* private_data;
+};
+
+struct ncDeviceHandle_t {
+    // keep place for public data here
+    devicePrivate_t* private_data;
+};
+
+typedef enum {
+    NC_FIFO_HOST_RO = 0, // fifo can be read through the API but cannot be
+                         // written (graphs can read and write data)
+    NC_FIFO_HOST_WO = 1, // fifo can be written through the API but cannot be
+                         // read (graphs can read but cannot write)
+} ncFifoType_t;
+
+typedef enum {
+    NC_FIFO_FP16 = 0,
+    NC_FIFO_FP32 = 1,
+} ncFifoDataType_t;
+
+struct ncTensorDescriptor_t {
+    unsigned int n;         // batch size, currently only 1 is supported
+    unsigned int c;         // number of channels
+    unsigned int w;         // width
+    unsigned int h;         // height
+    unsigned int totalSize; // Total size of the data in the tensor = largest stride * corresponding dimension size
+    unsigned int cStride;   // Stride in the channels' dimension
+    unsigned int wStride;   // Stride in the horizontal dimension
+    unsigned int hStride;   // Stride in the vertical dimension
+    ncFifoDataType_t dataType;  // data type of the tensor, FP32 or FP16
+};
+
+typedef enum {
+    NC_RW_FIFO_TYPE = 0,            // configure the fifo type to one type from ncFifoType_t
+    NC_RW_FIFO_CONSUMER_COUNT = 1,  // The number of consumers of each element
+                                    // (the number of times data must be read by
+                                    // a graph or host before the element is removed).
+                                    // Defaults to 1. The host always reads only once.
+    NC_RW_FIFO_DATA_TYPE = 2,       // 0 for fp16, 1 for fp32. If configured to fp32,
+                                    // the API will convert the data to the internal
+                                    // fp16 format automatically
+    NC_RW_FIFO_DONT_BLOCK = 3,      // WriteTensor will return NC_OUT_OF_MEMORY instead
+                                    // of blocking, GetResult will return NO_DATA, not supported yet
+    NC_RO_FIFO_CAPACITY = 4,        // return number of maximum elements in the buffer
+    NC_RO_FIFO_READ_FILL_LEVEL = 5,     // return number of tensors in the read buffer
+    NC_RO_FIFO_WRITE_FILL_LEVEL = 6,    // return number of tensors in a write buffer
+    NC_RO_FIFO_GRAPH_TENSOR_DESCRIPTOR = 7,   // return the tensor descriptor of the FIFO
+    NC_RO_FIFO_TENSOR_DESCRIPTOR = NC_RO_FIFO_GRAPH_TENSOR_DESCRIPTOR,   // deprecated
+    NC_RO_FIFO_STATE = 8,               // return the fifo state: CREATED, ALLOCATED, DESTROYED
+    NC_RO_FIFO_NAME = 9,                // return fifo name
+    NC_RO_FIFO_ELEMENT_DATA_SIZE = 10,  //element data size in bytes, int
+    NC_RW_FIFO_HOST_TENSOR_DESCRIPTOR = 11,  // App's tensor descriptor, defaults to non-strided, channel-minor
+} ncFifoOption_t;
+
+typedef enum {
+    NC_DEBUG_INFO_SIZE = 0,
+    NC_TIMETAKEN_SIZE = 1,
+    NC_THERMAL_SIZE = 2,
+    NC_NINTPUT_SIZE = 3,
+    NC_NOUTPUT_SIZE = 4,
+    NC_BATCH_SIZE = 5,
+} ncUserGetInfo_t;
+
+
+
+// Global
+MVNC_EXPORT_API ncStatus_t ncGlobalSetOption(ncGlobalOption_t option, const void *data,
+                             unsigned int dataLength);
+MVNC_EXPORT_API ncStatus_t ncGlobalGetOption(ncGlobalOption_t option, void *data, unsigned int *dataLength);
+
+// Device
+MVNC_EXPORT_API ncStatus_t ncDeviceSetOption(struct ncDeviceHandle_t *deviceHandle,
+                             ncDeviceOption_t option, const void *data,
+                             unsigned int dataLength);
+MVNC_EXPORT_API ncStatus_t ncDeviceGetOption(struct ncDeviceHandle_t *deviceHandle,
+                             ncDeviceOption_t option, void *data, unsigned int *dataLength);
+
+/**
+ * @brief Create handle and open any free device
+ * @param platform MYRIAD_2, MYRIAD_X or UNKNOWN_PLATFORM for any device
+ * @param watchdogInterval Time interval to ping device in milliseconds. 0 to disable watchdog.
+ * @param customFirmwareDirectory Custom path to directory with firmware.
+ *          If NULL or empty, default path searching behavior will be used.
+ */
+MVNC_EXPORT_API ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr,
+        ncDevicePlatform_t platform, int watchdogInterval,
+        const char* customFirmwareDirectory);
+/**
+ * @brief Close the device and destroy the handle
+ */
+MVNC_EXPORT_API ncStatus_t ncDeviceClose(struct ncDeviceHandle_t **deviceHandle);
+
+// Graph
+MVNC_EXPORT_API ncStatus_t ncGraphCreate(const char* name, struct ncGraphHandle_t **graphHandle);
+MVNC_EXPORT_API ncStatus_t ncGraphAllocate(struct ncDeviceHandle_t *deviceHandle,
+                           struct ncGraphHandle_t *graphHandle,
+                           const void *graphBuffer, unsigned int graphBufferLength,
+                           const void *graphHeader, unsigned int graphHeaderLength);
+MVNC_EXPORT_API ncStatus_t ncGraphDestroy(struct ncGraphHandle_t **graphHandle);
+MVNC_EXPORT_API ncStatus_t ncGraphSetOption(struct ncGraphHandle_t *graphHandle,
+                            int option, const void *data, unsigned int dataLength);
+MVNC_EXPORT_API ncStatus_t ncGraphGetOption(struct ncGraphHandle_t *graphHandle,
+                            int option, void *data,
+                            unsigned int *dataLength);
+MVNC_EXPORT_API ncStatus_t ncGraphQueueInference(struct ncGraphHandle_t *graphHandle,
+                            struct ncFifoHandle_t** fifoIn, unsigned int inFifoCount,
+                            struct ncFifoHandle_t** fifoOut, unsigned int outFifoCount);
+
+//Helper functions
+MVNC_EXPORT_API ncStatus_t ncGraphQueueInferenceWithFifoElem(struct ncGraphHandle_t *graphHandle,
+                        struct ncFifoHandle_t* fifoIn,
+                        struct ncFifoHandle_t* fifoOut, const void *inputTensor,
+                        unsigned int * inputTensorLength, void *userParam);
+MVNC_EXPORT_API ncStatus_t ncGraphAllocateWithFifos(struct ncDeviceHandle_t* deviceHandle,
+                        struct ncGraphHandle_t* graphHandle,
+                        const void *graphBuffer, unsigned int graphBufferLength,
+                        const void *graphHeader, unsigned int graphHeaderLength,
+                        struct ncFifoHandle_t ** inFifoHandle,
+                        struct ncFifoHandle_t ** outFifoHandle);
+
+/*
+ * @param outNumElem Unused; the output size is taken from the graph
+ */
+MVNC_EXPORT_API ncStatus_t ncGraphAllocateWithFifosEx(struct ncDeviceHandle_t* deviceHandle,
+    struct ncGraphHandle_t* graphHandle,
+    const void *graphBuffer, unsigned int graphBufferLength,
+    const void *graphHeader, unsigned int graphHeaderLength,
+    struct ncFifoHandle_t ** inFifoHandle, ncFifoType_t inFifoType,
+    unsigned int inNumElem, ncFifoDataType_t inDataType,
+    struct ncFifoHandle_t ** outFifoHandle,  ncFifoType_t outFifoType,
+    unsigned int outNumElem, ncFifoDataType_t outDataType);
+// Fifo
+MVNC_EXPORT_API ncStatus_t ncFifoCreate(const char *name, ncFifoType_t type,
+                        struct ncFifoHandle_t **fifoHandle);
+MVNC_EXPORT_API ncStatus_t ncFifoAllocate(struct ncFifoHandle_t* fifoHandle,
+                        struct ncDeviceHandle_t* device,
+                        struct ncTensorDescriptor_t* tensorDesc,
+                        unsigned int numElem);
+MVNC_EXPORT_API ncStatus_t ncFifoSetOption(struct ncFifoHandle_t* fifoHandle, int option,
+                        const void *data, unsigned int dataLength);
+MVNC_EXPORT_API ncStatus_t ncFifoGetOption(struct ncFifoHandle_t* fifoHandle, int option,
+                           void *data, unsigned int *dataLength);
+
+
+MVNC_EXPORT_API ncStatus_t ncFifoDestroy(struct ncFifoHandle_t** fifoHandle);
+MVNC_EXPORT_API ncStatus_t ncFifoWriteElem(struct ncFifoHandle_t* fifoHandle, const void *inputTensor,
+                        unsigned int * inputTensorLength, void *userParam);
+MVNC_EXPORT_API ncStatus_t ncFifoReadElem(struct ncFifoHandle_t* fifoHandle, void *outputData,
+                        unsigned int* outputDataLen, void **userParam);
+MVNC_EXPORT_API ncStatus_t ncFifoRemoveElem(struct ncFifoHandle_t* fifoHandle); //not supported yet
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
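
To show how the declarations above fit together, a minimal end-to-end sketch using only functions from this header; error handling is abbreviated, and the split between graph header and graph buffer is an illustrative assumption (real callers take both regions from the compiled blob):

    #include "mvnc.h"

    /* Sketch: open any free device, run one inference, release everything. */
    int runOnce(const void* hdr, unsigned int hdrLen,
                const void* blob, unsigned int blobLen,
                const void* input, unsigned int inputLen,
                void* output, unsigned int outputLen) {
        struct ncDeviceHandle_t* dev = NULL;
        struct ncGraphHandle_t* graph = NULL;
        struct ncFifoHandle_t *fifoIn = NULL, *fifoOut = NULL;
        void* userParam = NULL;

        /* Any platform, 1000 ms watchdog ping, default firmware search path */
        if (ncDeviceOpen(&dev, UNKNOWN_PLATFORM, 1000, NULL) != NC_OK)
            return -1;
        if (ncGraphCreate("example", &graph) != NC_OK ||
            ncGraphAllocateWithFifos(dev, graph, blob, blobLen, hdr, hdrLen,
                                     &fifoIn, &fifoOut) != NC_OK) {
            ncDeviceClose(&dev);
            return -1;
        }
        ncGraphQueueInferenceWithFifoElem(graph, fifoIn, fifoOut,
                                          input, &inputLen, NULL);
        ncFifoReadElem(fifoOut, output, &outputLen, &userParam);

        ncFifoDestroy(&fifoIn);
        ncFifoDestroy(&fifoOut);
        ncGraphDestroy(&graph);
        ncDeviceClose(&dev);
        return 0;
    }
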
diff --git a/inference-engine/thirdparty/movidius/mvnc/include/mvnc_ext.h b/inference-engine/thirdparty/movidius/mvnc/include/mvnc_ext.h
new file mode 100644 (file)
index 0000000..64f0f42
--- /dev/null
@@ -0,0 +1,47 @@
+#ifndef __NC_EXT_H_INCLUDED__
+#define __NC_EXT_H_INCLUDED__
+#include <mvnc.h>
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+typedef enum {
+    NC_NORESET = 0,
+    NC_RESET = 1
+} ncInitReset_t;
+
+/*
+ * @brief Boot a device with firmware without creating a handle for it
+ * @param devicePlatform Platform to boot
+ * @param customFirmwareDir Path to the directory with the firmware to load. If NULL, the default is used
+ */
+MVNC_EXPORT_API ncStatus_t ncDeviceLoadFirmware(const ncDevicePlatform_t devicePlatform, const char* customFirmwareDir);
+MVNC_EXPORT_API ncStatus_t ncDeviceLoadFirmwareWithPath(const char* unbooted_device_name, const char* fw_path);
+MVNC_EXPORT_API ncStatus_t ncPlatformInit(ncInitReset_t reset);
+MVNC_EXPORT_API ncStatus_t ncDeviceOpenBooted(struct ncDeviceHandle_t **deviceHandle, const char* deviceID);
+
+MVNC_EXPORT_API ncStatus_t ncFifoWriteIonElem(struct ncFifoHandle_t* fifo, const void *inputTensor,
+                          unsigned int *inputTensorLength, void *userParam);
+
+MVNC_EXPORT_API ncStatus_t ncFifoReadIonElem(struct ncFifoHandle_t* fifo, int output_shared_fd,
+                          unsigned int *outputDataLen, void **userParam);
+
+/*
+ * @brief Reset all devices
+ */
+MVNC_EXPORT_API ncStatus_t ncDeviceResetAll();
+
+MVNC_EXPORT_API ncStatus_t ncGraphGetInfoSize(const void* graphFile, size_t graphFileLength, ncUserGetInfo_t option, void* data, unsigned int* dataLength);
+
+MVNC_EXPORT_API ncStatus_t ncDeviceGetUnbootedName(struct ncDeviceHandle_t* deviceHandle, void* devAddr);
+
+MVNC_EXPORT_API ncStatus_t ncDeviceGetId(struct ncDeviceHandle_t* deviceHandle, void* deviceId);
+
+MVNC_EXPORT_API ncStatus_t ncDeviceHWReset(struct ncDeviceHandle_t* deviceHandle);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // __NC_EXT_H_INCLUDED__
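
A short sketch of the extension entry points above; the firmware directory is a hypothetical path used only for illustration:

    #include "mvnc_ext.h"

    /* Reset any stalled devices, then pre-boot a Myriad X without
     * creating a handle for it ("/opt/fw" is hypothetical). */
    void prepareDevices(void) {
        ncDeviceResetAll();
        ncDeviceLoadFirmware(MYRIAD_X, "/opt/fw");
    }
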
diff --git a/inference-engine/thirdparty/movidius/mvnc/include/ncCommPrivate.h b/inference-engine/thirdparty/movidius/mvnc/include/ncCommPrivate.h
new file mode 100644 (file)
index 0000000..94bd369
--- /dev/null
@@ -0,0 +1,186 @@
+/*
+* Copyright 2018-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+// Includes
+// ----------------------------------------------------------------------------
+#ifndef _MVNC_COMM_H_
+#define _MVNC_COMM_H_
+#define NC_MAX_NAME_SIZE        28
+
+// TODO: #-17902
+struct tensorDescriptor_t {
+    uint32_t n;
+    uint32_t c;
+    uint32_t w;
+    uint32_t h;
+    uint32_t totalSize;
+    uint32_t widthStride;
+    uint32_t heightStride;
+    uint32_t channelsStride;
+};
+typedef enum {
+    NC_GRAPH_OK,
+    NC_GRAPH_WRONG_INPUT_FORMAT,
+    NC_GRAPH_MYRIAD_ERROR
+} ncGraphError_t;
+
+typedef enum {
+    GRAPH_WAITING_FOR_BUFFERS,
+    GRAPH_RUNNING
+} graphState_t;
+
+typedef enum {
+    GRAPH_MON_CLASS_GRAPH_CMD = 0,
+    GRAPH_MON_CLASS_BUFFER_CMD = 1,
+    GRAPH_MON_CLASS_GET_CLASS0 = 2,
+    GRAPH_MON_CLASS_GET_CLASS1 = 3,
+    GRAPH_MON_CLASS_GET_CLASS2 = 4,
+    GRAPH_MON_CLASS_GET_CLASS3 = 5,
+    GRAPH_MON_CLASS_SET_CLASS0 = 6,
+    GRAPH_MON_CLASS_SET_CLASS1 = 7,
+    GRAPH_MON_CLASS_SET_CLASS2 = 8,
+    GRAPH_MON_CLASS_SET_CLASS3 = 9,
+} graphMonClass_t;
+
+typedef enum {
+    GRAPH_VERIFY_CMD = 3,
+    GRAPH_ALLOCATE_CMD = 0,
+    GRAPH_DEALLOCATE_CMD = 1,
+    GRAPH_TRIGGER_CMD = 2,
+} graphCommandType_t;
+
+typedef enum {
+    CLASS0_TIMING_DATA = 0,
+    CLASS0_DEBUG_DATA = 1,
+    CLASS0_STATE = 2,
+} graphOptionClass0_t;
+typedef enum {
+    CLASS1_GR_NI = 0,
+} graphOptionClass1_t;
+typedef enum {
+    CLASS2_GR_NI = 0,
+} graphOptionClass2_t;
+typedef enum {
+    CLASS3_GR_NI = 0,
+} graphOptionClass3_t;
+
+typedef enum {
+    BUFFER_ALLOCATE_CMD = 0,
+    BUFFER_DEALLOCATE_CMD = 1,
+} bufferCommandType_t;
+
+typedef struct {
+    graphCommandType_t type;
+    uint32_t id;
+    char streamName[16];
+    uint32_t buffId1;
+    uint32_t buffId2;
+    uint32_t executors_number;
+    uint8_t laterUse[24];
+} graphCommand_t;
+
+typedef struct {
+    bufferCommandType_t type;
+    char name[NC_MAX_NAME_SIZE];
+    uint32_t id;
+    uint32_t elemCnt;
+    struct tensorDescriptor_t desc;
+    uint8_t readChannel;
+    uint8_t writeChannel;
+    uint8_t laterUse[10];
+} bufferCommand_t;
+
+typedef struct {
+    union {
+        graphOptionClass0_t c0;
+        graphOptionClass1_t c1;
+        graphOptionClass2_t c2;
+        graphOptionClass3_t c3;
+    } type;
+    uint32_t id;
+} graphOptionSet_t;
+
+typedef struct {
+    graphMonClass_t cmdClass;
+    union {
+        graphCommand_t graphCmd;
+        bufferCommand_t buffCmd;
+        graphOptionSet_t optionCmd;
+    } cmd;
+} graphMonCommand_t;
+
+typedef enum {
+    CLASS0_THERMAL_STATS = 1,
+    CLASS0_DEVICE_CAPABILITIES = 2,
+    CLASS0_DEVICE_USED_MEMORY = 3,
+    CLASS0_DEVICE_ID = 4,
+    /* constants for internal profiling below */
+    CLASS0_DEVICE_PROFILING_DATA = 10001,
+    CLASS0_DEVICE_QUERY_CLOCKS,
+} deviceOptionClass0;
+typedef enum {
+    CLASS1_WATCHDOG_PING = 0,
+} deviceOptionClass1;
+typedef enum {
+    CLASS2_GET_TEMP_LIM_LOWER = 0,
+    CLASS2_SET_TEMP_LIM_LOWER,
+    CLASS2_GET_TEMP_LIM_HIGHER,
+    CLASS2_SET_TEMP_LIM_HIGHER,
+    CLASS2_GET_BACKOFF_TIME_NORMAL,
+    CLASS2_SET_BACKOFF_TIME_NORMAL,
+    CLASS2_GET_BACKOFF_TIME_HIGH,
+    CLASS2_SET_BACKOFF_TIME_HIGH,
+    CLASS2_GET_BACKOFF_TIME_CRITICAL,
+    CLASS2_SET_BACKOFF_TIME_CRITICAL,
+    CLASS2_GET_TEMPERATURE_DEBUG,
+    CLASS2_SET_TEMPERATURE_DEBUG,
+    CLASS2_SET_STDIO_REDIRECT_XLINK,
+    CLASS2_OPT_LIST,
+} deviceOptionClass2;
+
+typedef enum {
+    CLASS3_START_SHELL = 0,
+    CLASS3_SET_LOG_LEVEL_GLOBAL,
+    CLASS3_SET_LOG_LEVEL_FATHOM,
+    CLASS3_SET_LOG_LEVEL_XLINK,
+} deviceOptionClass3;
+
+typedef struct {
+    union {
+        deviceOptionClass0 c0;
+        deviceOptionClass1 c1;
+        deviceOptionClass2 c2;
+        deviceOptionClass3 c3;
+    } type;
+    uint32_t optionClass;
+    uint32_t data;
+} deviceCommand_t;
+
+typedef struct {
+    uint32_t max_graphs;
+    uint32_t max_fifos;
+    uint32_t max_memory;
+    uint32_t max_device_opt_class;
+    uint32_t max_graph_opt_class;
+    uint32_t max_executors;
+    uint32_t fw_version[4];
+    uint32_t mv_tensor_version[2];
+} deviceCapabilities_t;
+
+
+#endif
diff --git a/inference-engine/thirdparty/movidius/mvnc/include/ncPrivateTypes.h b/inference-engine/thirdparty/movidius/mvnc/include/ncPrivateTypes.h
new file mode 100644 (file)
index 0000000..a8ea882
--- /dev/null
@@ -0,0 +1,201 @@
+/*
+* Copyright 2018-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+// Includes
+// ----------------------------------------------------------------------------
+
+#ifndef _NC_PRIVATE_TYPES_H_
+#define _NC_PRIVATE_TYPES_H_
+
+#if (defined(_WIN32) || defined(_WIN64))
+#include "win_pthread.h"
+#else
+#include <pthread.h>
+#endif
+#include <mvnc.h>
+#include "ncCommPrivate.h"
+#include "XLinkPublicDefines.h"
+#include "watchdog.h"
+
+typedef enum {
+    NC_OPTION_CLASS0 = 0,
+    NC_OPTION_CLASS1 = 1,
+    NC_OPTION_CLASS2 = 2,
+    NC_OPTION_CLASS3 = 3,
+} ncOptionClass_t;
+
+typedef enum {
+    NC_FIFO_HWC = 0, // row major - channel minor, for RGB image: RGB00, RGB01, RGB02,...
+                     // all RGB pixels by row
+    NC_FIFO_CHW = 1, // channel major - column minor (planar), for RGB image:
+                     // R01R02R03...G01G02G03...B01B02B03...
+                     // all Red rows..all Green rows..all Blue rows
+    NC_FIFO_HCW = 2, // row major - column minor (interleaved), for RGB image:
+                     // R00R01..R0k.., G00G01..G0k.., B00B01..B0k.., R10R11..R1k..
+                     // 1st Red row, 1st Green row, 1st Blue row, 2nd Red row..
+    NC_FIFO_CWH = 3, // channel major - row minor, for RGB image:
+                     // R00R10R20... G00G10G20...B00B10B20...
+                     // all Red columns, all Green columns, all blue columns
+    NC_FIFO_WCH = 4, // column major - row minor; for RGB image:
+                     // R00R10..Rk0.., G00G10..Gk0.., B00B10..Bk0.., R01R11..Rk1..
+                     // 1st Red col, 1st Green col, 1st blue col, 2nd Red col...
+    NC_FIFO_WHC = 5, // column major - channel minor, for RGB image: RGB00, RGB10, RGB20...
+                     // all RGB pixels by col...
+} ncFifoLayout_t;
+
+struct _devicePrivate_t {
+    int throttle_happened;
+    float *thermal_stats;
+    char *dev_addr;     // Device USB address as returned by usb_
+    char *dev_addr_booted;
+    char *dev_file;     // Device filename in /dev directory
+    char *optimisation_list;
+    XLinkHandler_t *xlink;
+    struct _devicePrivate_t *next;  // Next device in chain
+    struct _graphPrivate_t *graphs; // List of associated graphs
+    struct _fifoPrivate_t *fifos;   // List of associated fifos
+    streamId_t device_mon_stream_id;
+    streamId_t graph_monitor_stream_id;
+    streamId_t printf_over_xlink_stream_id;
+    int        printf_over_xlink_conn_fd;
+    pthread_t  printf_over_xlink_thr;
+    int        printf_over_xlink_thr_valid;
+    pthread_mutex_t dev_data_m;
+    pthread_mutex_t dev_stream_m;
+    pthread_mutex_t graph_stream_m;
+    deviceCapabilities_t dev_attr;
+    ncDeviceState_t state;
+    uint32_t device_id;
+    uint32_t deviceFreq;
+    uint8_t* profilingBuffer;
+    size_t   receivedData;
+    wd_context watchdog_ctx;
+    int wd_interval;
+};
+
+extern devicePrivate_t *devices;
+
+struct _userParamPrivate_t {
+    void *data;
+    struct _userParamPrivate_t *next;
+};
+struct _graphPrivate_t {
+    uint32_t id;
+    uint32_t blob_version[2];
+    int started;
+    int batch_size;
+    int executors_number;
+    int input_count;
+    int output_count;
+    struct ncTensorDescriptor_t input_tensor_desc;
+    struct ncTensorDescriptor_t output_tensor_desc;
+    unsigned nstages;
+    int timingsCount;
+    struct _devicePrivate_t *dev;
+    struct _graphPrivate_t *next;
+    size_t aux_buffer_size;
+    char *aux_buffer;
+    char *debug_buffer;
+    char name[NC_MAX_NAME_SIZE];
+    float *time_taken;
+    streamId_t graph_stream_id;
+    ncGraphState_t state;
+};
+
+struct _fifoPrivate_t {
+    ncFifoType_t type;
+    ncFifoLayout_t graphLayout;
+    int consumer_cnt;
+    uint32_t id;
+    streamId_t streamId;
+    struct ncTensorDescriptor_t graph_tensor_desc;
+    struct ncTensorDescriptor_t host_tensor_desc;
+    struct _devicePrivate_t *dev;
+    struct _fifoPrivate_t *next;
+    char name[NC_MAX_NAME_SIZE];
+    struct _userParamPrivate_t *user_param_in;  //used for write fifo
+    struct _userParamPrivate_t *user_param_out; //used for read fifo
+    int host_tensor_desc_set;
+    int write_count;
+    int consumed_by_graph;
+    int num_elements;
+    int api_read_element;
+    int consumers_remaining;
+    int datasize;
+    int timeout_msec;
+    pthread_mutex_t fifo_mutex;
+    ncFifoState_t state;
+    void* output_data;
+};
+
+
+#if (!defined(_WIN32) && !defined(_WIN64))
+#define PACKED(name) struct __attribute__((packed)) name
+#else
+#define PACKED( __Declaration__ ) __pragma( pack(push, 1) ) struct __Declaration__ __pragma( pack(pop) )
+#endif
+
+
+#define EI_NIDENT 16
+
+PACKED(ElfN_Ehdr
+{
+    uint8_t  e_ident[EI_NIDENT];
+    uint16_t e_type;
+    uint16_t e_machine;
+    uint32_t e_version;
+    uint32_t e_entry;
+    uint32_t e_phoff;
+    uint32_t e_shoff;
+    uint32_t e_flags;
+    uint16_t e_ehsize;
+    uint16_t e_phentsize;
+    uint16_t e_phnum;
+    uint16_t e_shentsize;
+    uint16_t e_shnum;
+    uint16_t e_shstrndx;
+};)
+
+PACKED(blob_header_v2
+{
+    uint32_t magic_number;              // =???, not used
+    uint32_t file_size;                 // size of blob? not used
+    uint32_t blob_ver_major;            // =???, not used
+    uint32_t blob_ver_minor;            // =???, not used
+    uint32_t bss_mem_size;
+    uint32_t mode;
+    uint32_t stage_section_offset;
+    uint32_t buffer_section_offset;     // must be aligned by 16 bytes
+    uint32_t relocation_section_offset;
+};)
+
+PACKED(stage_section_header_v2
+{
+    uint32_t stage_count;
+    uint32_t stage_section_size;    // not used
+    uint32_t input_size;
+    uint32_t output_size;
+    uint32_t batch_size;
+};)
+
+#endif
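
As a worked example of the stride-based layout classification above, for an illustrative 224x224x3 FP16 tensor stored channel-minor (HWC): adjacent channels are 2 bytes apart, adjacent columns 3*2 = 6 bytes, adjacent rows 224*3*2 = 1344 bytes, so hStride is the largest stride and wStride > cStride, which is exactly the NC_FIFO_HWC case:

    #include "mvnc.h"

    /* totalSize = largest stride * its dimension = 1344 * 224 = 301056 */
    struct ncTensorDescriptor_t desc = {
        .n = 1, .c = 3, .w = 224, .h = 224,
        .totalSize = 301056,
        .cStride = 2, .wStride = 6, .hStride = 1344,
        .dataType = NC_FIFO_FP16,
    };
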
diff --git a/inference-engine/thirdparty/movidius/mvnc/src/97-myriad-usbboot.rules b/inference-engine/thirdparty/movidius/mvnc/src/97-myriad-usbboot.rules
new file mode 100644 (file)
index 0000000..c4e06d3
--- /dev/null
@@ -0,0 +1,3 @@
+SUBSYSTEM=="usb", ATTRS{idProduct}=="2150", ATTRS{idVendor}=="03e7", GROUP="users", MODE="0660", ENV{ID_MM_DEVICE_IGNORE}="1"
+SUBSYSTEM=="usb", ATTRS{idProduct}=="2485", ATTRS{idVendor}=="03e7", GROUP="users", MODE="0660", ENV{ID_MM_DEVICE_IGNORE}="1"
+SUBSYSTEM=="usb", ATTRS{idProduct}=="f63b", ATTRS{idVendor}=="03e7", GROUP="users", MODE="0660", ENV{ID_MM_DEVICE_IGNORE}="1"
diff --git a/inference-engine/thirdparty/movidius/mvnc/src/mvnc_api.c b/inference-engine/thirdparty/movidius/mvnc/src/mvnc_api.c
new file mode 100644 (file)
index 0000000..2c301f6
--- /dev/null
@@ -0,0 +1,3847 @@
+/*
+* Copyright 2017-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <time.h>
+#include <sys/types.h>
+#if (defined(_WIN32) || defined(_WIN64))
+#include "gettime.h"
+#include <windows.h>    // for Sleep()
+#else
+#include <dlfcn.h>      // For dladdr
+#include <unistd.h>
+#include <dirent.h>
+#include <sys/file.h>
+#include <errno.h>
+#endif
+#include <usb_boot.h>
+
+#include "mvnc.h"
+
+#include "XLink.h"
+#include "ncCommPrivate.h"
+#include "ncPrivateTypes.h"
+
+#define MVLOG_UNIT_NAME ncAPI
+#include "mvLog.h"
+#include "mvMacros.h"
+#include "watchdog.h"
+
+#define THERMAL_BUFFER_SIZE 100
+#define THERMAL_THROTTLING_BUFFER_SIZE (THERMAL_BUFFER_SIZE + sizeof(int))
+#define DEBUG_BUFFER_SIZE     120
+
+#define MAX_TENSORS_TO_LOAD (2)
+#define BLOB_STREAM_SIZE 4096
+#define TENSOR_STREAM_SIZE (320 * 1024 * MAX_TENSORS_TO_LOAD)
+#define OUTPUT_STREAM_SIZE 8 //read only from PC
+
+#define CONFIG_STREAM_SIZE 2000
+
+#define MAX_PATH_LENGTH         255
+#define MAX_DEVICES             (32)
+
+//      Timeouts
+#define STATUS_WAIT_TIMEOUT     15
+#define DEVICE_APPEAR_TIMEOUT_ON_OPEN   (2)
+#define DEVICE_APPEAR_TIMEOUT_ON_CLOSE   (10)
+
+#define SLEEP_MS        250
+#define MAX_ITERATIONS  20
+
+#define GRAPH_CLASS0_BASE   1000
+#define DEVICE_CLASS0_BASE  2000
+#define OPTION_CLASS_SIZE   100
+
+#define FP16_DATA_SIZE 2
+
+static int initialized = 0;
+static int reset_all = 1;
+
+pthread_mutex_t deviceOpenMutex = PTHREAD_MUTEX_INITIALIZER;
+
+#if (defined(_WIN32) || defined(_WIN64))
+static HANDLE global_lock_fd = NULL;
+static OVERLAPPED global_lock_overlap = { 0 };
+#define GLOBAL_LOCK() LockFileEx(global_lock_fd, LOCKFILE_EXCLUSIVE_LOCK, 0, MAXDWORD, MAXDWORD, &global_lock_overlap)
+#define GLOBAL_UNLOCK() UnlockFileEx(global_lock_fd, 0, MAXDWORD, MAXDWORD, &global_lock_overlap)
+#else
+static int global_lock_fd = -1;
+#define GLOBAL_LOCK() flock(global_lock_fd, LOCK_EX)
+#define GLOBAL_UNLOCK() flock(global_lock_fd, LOCK_UN)
+#endif
+
+#ifndef CHECK_MUTEX_SUCCESS
+#define CHECK_MUTEX_SUCCESS(call)  {                                \
+    int error;                                                      \
+    if ((error = (call))) {                                         \
+      mvLog(MVLOG_ERROR, "%s failed with error: %d", #call, error); \
+    }                                                               \
+}
+#endif  // CHECK_MUTEX_SUCCESS
+
+#ifndef CHECK_MUTEX_SUCCESS_RC
+#define CHECK_MUTEX_SUCCESS_RC(call, rc)  {                         \
+    int error;                                                      \
+    if ((error = (call))) {                                         \
+      mvLog(MVLOG_ERROR, "%s failed with error: %d", #call, error); \
+      return rc;                                                    \
+    }                                                               \
+}
+#endif  // CHECK_MUTEX_SUCCESS_RC
+
+#ifndef CHECK_HANDLE_CORRECT
+#define CHECK_HANDLE_CORRECT(handle)  {                             \
+    if (!handle) {                                                  \
+      mvLog(MVLOG_ERROR, "%s is NULL", #handle);                    \
+      return NC_INVALID_HANDLE;                                     \
+    }                                                               \
+}
+#endif  // CHECK_HANDLE_CORRECT
+
+#ifndef CHECK_HANDLE_CORRECT_RC
+#define CHECK_HANDLE_CORRECT_RC(handle, rc)  {                      \
+    if (!handle) {                                                  \
+      mvLog(MVLOG_ERROR, "%s is NULL", #handle);                    \
+      return rc;                                                    \
+    }                                                               \
+}
+#endif  // CHECK_HANDLE_CORRECT_RC
+
+#ifndef CHECK_HANDLE_CORRECT_WINFO
+#define CHECK_HANDLE_CORRECT_WINFO(handle, logLevel, printMessage) {\
+    if (!handle) {                                                  \
+      mvLog(logLevel, "%s", printMessage);                          \
+      return NC_INVALID_HANDLE;                                     \
+    }                                                               \
+}
+#endif  // CHECK_HANDLE_CORRECT_WINFO
+
+// To suppress warning in the macro below
+#pragma GCC diagnostic ignored "-Wformat-extra-args"
+
+/**
+ * @brief The macro checks a stream id passed to it
+ * @param id Stream id to check
+ * @param callReleasingResources Code that releases resources in case of error; put the releasing
+ *        code in braces: { your code here }. If nothing needs to be released, pass {}
+ * @param errorMsg Message to be written in case of error. It is a format string
+ */
+#ifndef CHECK_STREAM_ID
+#define CHECK_STREAM_ID(id, callReleasingResources, errorMsg) {                                     \
+    char errorMsgWithReason[255];                                                                   \
+    if (id == INVALID_STREAM_ID_OUT_OF_MEMORY) {                                                    \
+        snprintf(errorMsgWithReason, 255, "%s %s", errorMsg, "due to not enough memory on device"); \
+        mvLog(MVLOG_ERROR, errorMsgWithReason);                                                     \
+        callReleasingResources;                                                                     \
+        return NC_OUT_OF_MEMORY;                                                                    \
+    } else if (id == INVALID_STREAM_ID) {                                                           \
+        snprintf(errorMsgWithReason, 255, "%s %s", errorMsg, "due to unknown error");               \
+        callReleasingResources;                                                                     \
+        return NC_ERROR;                                                                            \
+    }                                                                                               \
+    mvLog(MVLOG_DEBUG, "Stream opened");                                                            \
+}
+#endif // CHECK_STREAM_ID
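+
+// Usage sketch for the macro above; the stream call and the cleanup body are
+// illustrative (XLinkOpenStream is declared in XLink.h, included above):
+//     streamId_t id = XLinkOpenStream(linkId, streamName, writeSize);
+//     CHECK_STREAM_ID(id,
+//         { free(scratchBuffer); },        // released only on failure
+//         "Failed to open graph stream");  // message prefix for the log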
+
+static XLinkGlobalHandler_t ghandler;
+
+#define TRACE_SIZE (24)
+
+static const int profUpperBound = 64 * 4 * 1024 * TRACE_SIZE;
+
+devicePrivate_t *devices;
+
+/////////////////////////// Structs /////////////////////////////
+
+static ncStatus_t parseXLinkError(XLinkError_t rc) {
+    switch (rc) {
+    case X_LINK_SUCCESS:
+        return NC_OK;
+    case X_LINK_DEVICE_NOT_FOUND:
+        return NC_DEVICE_NOT_FOUND;
+    case X_LINK_TIMEOUT:
+        return NC_TIMEOUT;
+    default:
+        return NC_ERROR;
+    }
+}
+
+static char* platformToStr(const ncDevicePlatform_t platform) {
+    switch(platform) {
+        case MYRIAD_2:              return "MYRIAD_2";
+        case MYRIAD_X:              return "MYRIAD_X";
+        default:                    return "UNKNOWN_PLATFORM";
+    }
+}
+
+static char* ncStatusToStr(const ncStatus_t status) {
+    switch(status) {
+    case NC_OK:                 return "NC_OK";
+    case NC_BUSY:               return "NC_BUSY";
+    case NC_OUT_OF_MEMORY:      return "NC_OUT_OF_MEMORY";
+    case NC_DEVICE_NOT_FOUND:   return "NC_DEVICE_NOT_FOUND";
+    case NC_INVALID_PARAMETERS: return "NC_INVALID_PARAMETERS";
+    case NC_TIMEOUT:            return "NC_TIMEOUT";
+    case NC_MVCMD_NOT_FOUND:    return "NC_MVCMD_NOT_FOUND";
+    case NC_NOT_ALLOCATED:      return "NC_NOT_ALLOCATED";
+    case NC_UNAUTHORIZED:       return "NC_UNAUTHORIZED";
+    case NC_UNSUPPORTED_GRAPH_FILE: return "NC_UNSUPPORTED_GRAPH_FILE";
+    case NC_UNSUPPORTED_CONFIGURATION_FILE: return "NC_UNSUPPORTED_CONFIGURATION_FILE";
+    case NC_UNSUPPORTED_FEATURE: return "NC_UNSUPPORTED_FEATURE";
+    case NC_MYRIAD_ERROR:       return "NC_MYRIAD_ERROR";
+    case NC_INVALID_DATA_LENGTH: return "NC_INVALID_DATA_LENGTH";
+    case NC_INVALID_HANDLE:     return "NC_INVALID_HANDLE";
+    default: return "NC_ERROR";
+    }
+}
+
+static char* XLinkErrorToStr(XLinkError_t rc) {
+    switch (rc) {
+    case X_LINK_SUCCESS:        return "X_LINK_SUCCESS";
+    case X_LINK_ALREADY_OPEN:   return "X_LINK_ALREADY_OPEN";
+    case X_LINK_COMMUNICATION_NOT_OPEN: return "X_LINK_COMMUNICATION_NOT_OPEN";
+    case X_LINK_COMMUNICATION_FAIL: return "X_LINK_COMMUNICATION_FAIL";
+    case X_LINK_COMMUNICATION_UNKNOWN_ERROR: return "X_LINK_COMMUNICATION_UNKNOWN_ERROR";
+    case X_LINK_DEVICE_NOT_FOUND: return "X_LINK_DEVICE_NOT_FOUND";
+    case X_LINK_TIMEOUT:         return "X_LINK_TIMEOUT";
+    case X_LINK_ERROR:
+    default:
+        return "X_LINK_ERROR";
+    }
+}
+
+static double timeInSeconds()
+{
+    static double s;
+    struct timespec ts;
+
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    if (!s)
+        s = ts.tv_sec + ts.tv_nsec * 1e-9;
+    return ts.tv_sec + ts.tv_nsec * 1e-9 - s;
+}
+
+static void sleepForSeconds(const unsigned int seconds) {
+#if (!defined(_WIN32) && !defined(_WIN64))
+    sleep(seconds);
+#else
+    Sleep(seconds * 1000); // Sleep takes milliseconds as input
+#endif
+}
+
+static char* getProductName(const char* name)
+{
+    char* p = strchr(name, '-');
+    if (p == NULL)
+        return "";
+    return p;
+}
+
+static ncOptionClass_t getOptionClass(int option, int base)
+{
+    return (int) ((option - base) / OPTION_CLASS_SIZE);
+}
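+
+// Worked example: NC_RO_DEVICE_MEMORY_SIZE (2004) with DEVICE_CLASS0_BASE (2000)
+// gives (2004 - 2000) / OPTION_CLASS_SIZE == 0, i.e. NC_OPTION_CLASS0, while
+// NC_RW_GRAPH_EXECUTORS_NUM (1110) with GRAPH_CLASS0_BASE (1000) maps to class 1.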
+
+#if (defined(_WIN32) || defined(_WIN64) )
+#define MAX_2(a,b)             ((a) > (b) ? (a) : (b))
+#define MAX_3(a,b,c)   ((a) > (b) ? MAX_2((a), (c)) : MAX_2((b), (c)))
+#ifdef MAX
+#undef MAX
+#define MAX MAX_2
+#endif
+#else
+#define MAX_3(a,b,c)                            \
+    ({ __typeof__ (a) _a = (a);                 \
+        __typeof__ (b) _b = (b);                \
+        __typeof__ (c) _c = (c);                \
+        (_a > _b && _a > _c) ? _a : ((_b > _c && _b > _a) ? _b : _c); })
+#endif
+
+static ncFifoLayout_t getLayout(struct ncTensorDescriptor_t* td) {
+    unsigned int max = MAX_3(td->hStride, td->wStride, td->cStride);
+    if (max == td->hStride) {
+        if (MAX(td->wStride, td->cStride) == td->wStride)
+            return NC_FIFO_HWC;
+        else
+            return NC_FIFO_HCW;
+    } else if (max == td->cStride) {
+        if (MAX(td->wStride, td->hStride) == td->hStride)
+            return NC_FIFO_CHW;
+        else
+            return NC_FIFO_CWH;
+    } else { //W is major
+        if (MAX(td->hStride, td->cStride) == td->hStride)
+            return NC_FIFO_WHC;
+        else
+            return NC_FIFO_WCH;
+    }
+}
+
+void printImg(unsigned char* inputTensor, struct ncTensorDescriptor_t* inputDesc) {
+    int c = 0;
+    for (; c < inputDesc->c; c++) {
+        int row = 0;
+        for (; row < inputDesc->h; row++) { //row
+            int col = 0;
+            for (; col < inputDesc->w; col++) {
+                printf("%x ", inputTensor[col + row * inputDesc->hStride +
+                        c * inputDesc->cStride]);
+            }
+            printf(" ===== ROW %d (channel %d) Done === \n", row, c);
+        }
+        printf("\n");
+    }
+}
+
+static void resetAll()
+{
+#if defined(USE_PCIE) || defined(NO_BOOT)
+    mvLog(MVLOG_INFO, "Devices will not be restarted for this configuration (PCIE or NO_BOOT)");
+#else
+    int index = 0;
+    int stalled_count = 0;
+    int iters = 0;
+    int bootrom_count = 0;
+    int after_reset_count = 0;
+    char name[NC_MAX_NAME_SIZE] = "";
+    XLinkError_t rc;
+
+    double waittm = timeInSeconds() + STATUS_WAIT_TIMEOUT;
+    while (timeInSeconds() < waittm) {
+        rc = XLinkGetDeviceName(index, name, NC_MAX_NAME_SIZE, AUTO_PID);
+        if (rc != X_LINK_SUCCESS)
+            break; //no more devices found
+
+        if (strlen(getProductName(name)) == 1) { //name doesn't have product number
+            //device is already booted, need to reset
+            mvLog(MVLOG_DEBUG,"Found stalled device %s\n", name);
+            XLinkHandler_t* handler = calloc(1, sizeof(XLinkHandler_t));
+
+            if (!handler){
+                mvLog(MVLOG_ERROR, "Memory allocation failed");
+                break;
+            }
+            handler->devicePath = (char*)name;
+            rc = XLinkConnect(handler);
+            if (rc) {
+                mvLog(MVLOG_ERROR," Failed to connect to stalled device, rc: %s", XLinkErrorToStr(rc));
+            }
+            stalled_count++;
+            free(handler);
+
+        } else {
+            bootrom_count++;
+        }
+        index++;
+    }
+
+    if (stalled_count) {
+        mvLog(MVLOG_INFO, "Stalled devices found, resetting...");
+        rc = XLinkResetAll();
+        if (rc) {
+            mvLog(MVLOG_WARN, "Failed to reset all devices, rc: %s", XLinkErrorToStr(rc));
+        }
+
+        iters = 0;
+
+        while ((after_reset_count < bootrom_count + stalled_count) &&
+                iters < MAX_ITERATIONS) {
+            usleep(SLEEP_MS*1000);
+            after_reset_count = 0;
+            index = 0;
+            waittm = timeInSeconds() + STATUS_WAIT_TIMEOUT;
+            while (timeInSeconds() < waittm) {
+                XLinkError_t rc = XLinkGetDeviceName(index, name, NC_MAX_NAME_SIZE, AUTO_PID);
+                if (rc != X_LINK_SUCCESS)
+                    break; //no more devices found
+
+                if (strlen(getProductName(name)) > 1) { //name has product number
+                    after_reset_count++;
+                }
+                index++;
+            }
+            iters++;
+            mvLog(MVLOG_INFO,"...");
+        }
+        usleep(SLEEP_MS*1000);
+    }
+#endif
+}
+
+static ncStatus_t initializeXLink();
+
+ncStatus_t ncDeviceResetAll() {
+#if defined(USE_PCIE) || defined(NO_BOOT)
+    mvLog(MVLOG_INFO, "Devices will not be restarted for this configuration (PCIE or NO_BOOT)");
+#else
+    if (!initialized) {
+        ncStatus_t sc;
+        if ((sc = initializeXLink()) != 0) {
+            return sc;
+        }
+    }
+    resetAll();
+#endif
+    return NC_OK;
+}
+
+static ncStatus_t initializeXLink()
+{
+    XLinkSetCommonTimeOutMsec(3 * 60 * 10000);
+    // We sanitize the situation by trying to reset the devices that have been left open
+    initialized = 1;
+    devices = NULL;
+#ifdef USE_PCIE
+    ghandler.protocol = PCIE;
+#else
+    ghandler.protocol = USB_VSC;
+#endif  // USE_PCIE
+
+    int sc = XLinkInitialize(&ghandler);
+    if (sc != X_LINK_SUCCESS) {
+        mvLog(MVLOG_ERROR," Initialization failed, rc = %s\n", XLinkErrorToStr(sc));
+        return NC_ERROR;
+    }
+
+#if !(defined(USE_PCIE) || defined(NO_BOOT))
+    if (reset_all) {
+        resetAll();
+    }
+#endif  // USE_PCIE
+    return NC_OK;
+}
+
+/**
+ * @brief Get the USB PID to search for an unbooted device of the given platform
+ */
+static int getPidBy(const ncDevicePlatform_t platform) {
+    switch (platform) {
+        case MYRIAD_2: return DEFAULT_UNBOOTPID_2150;
+        case MYRIAD_X: return DEFAULT_UNBOOTPID_2485;
+        default:       return AUTO_UNBOOTED_PID;
+    }
+}
+
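+/**
+ * @brief Check whether a device is already in the opened-devices list
+ * @return 0 if the device is in the list, -1 otherwise
+ */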
+static int isDeviceOpened(const char *name)
+{
+    struct _devicePrivate_t *d = devices;
+    while (d) {
+        if (strcmp(d->dev_addr, name) == 0)
+            return 0;
+        d = d->next;
+    }
+    return -1;
+}
+
+/**
+ * @brief Check whether a path exists (directory or file)
+ */
+static int isPathExists(const char* filePath) {
+    return ( access( filePath, 0 ) != -1 ) ? 1 : 0;
+}
+
+static char getPathSeparator() {
+#ifdef _WIN32
+    return '\\';
+#else
+    return '/';
+#endif
+}
+
+/**
+ * @brief Append a path separator (/ or \\) to the path if it does not already end with one
+ */
+static void addEndPathSeparator(char* filePath) {
+    const int filePathLen = strnlen(filePath, MAX_PATH_LENGTH);
+    if (filePathLen > 1 && filePathLen != MAX_PATH_LENGTH && filePath[filePathLen - 1] != getPathSeparator()) {
+        filePath[filePathLen] = getPathSeparator();
+        filePath[filePathLen + 1] = 0;
+    }
+}
+
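+/**
+ * @brief Resolve the full path to the firmware (.mvcmd) file for a device.
+ *
+ * Search order: if mv_cmd_file_path already holds a directory (a custom
+ * firmware path), look there; otherwise look next to this library; as a last
+ * resort, fall back to an mvnc/ subdirectory.
+ */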
+ncStatus_t getFirmwarePath(char* mv_cmd_file_path, const char* dev_addr) {
+
+    if (!mv_cmd_file_path || !dev_addr) {
+        return NC_INVALID_PARAMETERS;
+    }
+
+    char *p;
+    char mv_cmd_file_name[40] = "MvNCAPI-maXXXX.mvcmd";
+
+    // Search for the firmware file in the same directory as this library.
+    // Ideally there will eventually be one firmware file for all devices; for now they are separate.
+    const char* pr = getProductName(dev_addr);
+    if (pr == NULL || strlen(pr) <= 1) {
+        mvLog(MVLOG_WARN, "Can't get product name");
+        GLOBAL_UNLOCK();
+        return NC_ERROR;
+    }
+
+    // Get firmware name
+    snprintf(mv_cmd_file_name, 40, "MvNCAPI%s.mvcmd", pr);
+    mvLog(MVLOG_DEBUG, "Firmware name %s\n", mv_cmd_file_name);
+
+    // If mv_cmd_file_path already contains a path, use it.
+    // This is the case when mv_cmd_file_path was set via the ncDeviceOpen custom path argument.
+    if (strlen(mv_cmd_file_path) > 1) {
+        addEndPathSeparator(mv_cmd_file_path);
+    } else {
+        // Get dll full path
+#if (defined(_WIN32) || defined(_WIN64))
+        HMODULE hm = NULL;
+        if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+                                  GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
+                              (LPCSTR) "ncDeviceOpen", &hm)) {
+            int ret = GetLastError();
+            fprintf(stderr, "GetModuleHandle returned %d", ret);
+        }
+        GetModuleFileNameA(hm, mv_cmd_file_path, MAX_PATH_LENGTH - 1);
+#else
+        Dl_info info;
+        dladdr(ncDeviceOpen, &info);
+        strncpy(mv_cmd_file_path, info.dli_fname, MAX_PATH_LENGTH - 40);
+        mv_cmd_file_path[MAX_PATH_LENGTH - 1] = 0;
+#endif
+    }
+
+    p = strrchr(mv_cmd_file_path, getPathSeparator());
+
+    if (p)
+        strcpy(p + 1, mv_cmd_file_name);
+    else
+        strcpy(mv_cmd_file_path, mv_cmd_file_name);
+    mv_cmd_file_path[MAX_PATH_LENGTH - 1] = 0;
+
+    if (!isPathExists(mv_cmd_file_path)) {
+        mvLog(MVLOG_ERROR, "Firmware not found in: %s", mv_cmd_file_path);
+
+        // Firmware also could be in "mvnc" subdirectory
+        char mv_cmd_file_with_subdirectory[100] = "mvnc/";
+        char *p_sub = strrchr(mv_cmd_file_with_subdirectory, '/');
+
+        if (!p_sub)
+            return NC_MVCMD_NOT_FOUND;
+
+        strcpy(p_sub + 1, mv_cmd_file_name);
+        if (p)
+            strcpy(p + 1, mv_cmd_file_with_subdirectory);
+        else
+            strcpy(mv_cmd_file_path, mv_cmd_file_with_subdirectory);
+
+        // Check whether the firmware was found in the mvnc/ subdirectory
+        if (!isPathExists(mv_cmd_file_path)) {
+            return NC_MVCMD_NOT_FOUND;
+        } else {
+            mvLog(MVLOG_WARN, "Firmware was found in: %s", mv_cmd_file_path);
+        }
+    }
+
+    mvLog(MVLOG_DEBUG, "File path %s\n", mv_cmd_file_path);
+    return 0;
+}
+
+static ncStatus_t getDevAttributes(struct _devicePrivate_t *d);
+static void printfOverXLinkOpen(struct _devicePrivate_t *d);
+static void printfOverXLinkClose(struct _devicePrivate_t *d);
+static ncStatus_t destroyDeviceHandle(struct ncDeviceHandle_t **deviceHandlePtr);
+
+ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr,
+        ncDevicePlatform_t platform, int watchdogInterval,
+        const char* customFirmwareDirectory) {
+
+    CHECK_HANDLE_CORRECT_RC(deviceHandlePtr, NC_INVALID_PARAMETERS);
+    if (watchdogInterval < 0) {
+        mvLog(MVLOG_ERROR, "Invalid watchdogInterval");
+        return NC_INVALID_PARAMETERS;
+    }
+
+#ifdef NO_BOOT
+    int noBoot = 1;
+
+    if (watchdogInterval > 0) {
+        mvLog(MVLOG_INFO, "Watchdog for already booted device would be disabled");
+        watchdogInterval = 0;
+    }
+
+#else
+    int noBoot = 0;
+#endif
+
+    if (*deviceHandlePtr && (*deviceHandlePtr)->private_data->state == NC_DEVICE_OPENED) {
+        mvLog(MVLOG_WARN, "Device was already opened");
+        return NC_OK;
+    }
+
+    // Initialize handler
+
+    if (!initialized) {
+#if (defined(_WIN32) || defined(_WIN64))
+        char* tempPath = getenv("TEMP");
+        if (tempPath) {
+            char *path = malloc(strlen(tempPath) + 15);
+            if (!path) {
+                return NC_OUT_OF_MEMORY;
+            }
+            strcpy(path, tempPath);
+            strcat(path, "\\mvnc.mutex");
+            global_lock_fd = CreateFile(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
+            free(path);
+        }
+
+        if (!global_lock_fd) {
+            mvLog(MVLOG_ERROR, "global mutex initialization failed");
+            exit(1);
+        }
+#else
+        global_lock_fd = open("/tmp/mvnc.mutex", O_CREAT, 0660);
+        if (global_lock_fd == -1) {
+            mvLog(MVLOG_ERROR, "global mutex initialization failed");
+            exit(1);
+        }
+#endif
+    }
+
+    char name[NC_MAX_NAME_SIZE] = "";
+
+    // When opening an already booted device, we should not reset all devices
+    if (noBoot) {
+        mvLog(MVLOG_INFO, "Connect to already booted device");
+        reset_all = 0;
+    }
+
+    GLOBAL_LOCK();
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&deviceOpenMutex), NC_ERROR);
+    if (!initialized) {
+        ncStatus_t sc;
+        if ((sc = initializeXLink()) != 0) {
+            CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex));
+            GLOBAL_UNLOCK();
+            return sc;
+        }
+    }
+    XLinkError_t rc = X_LINK_ERROR;
+
+    // Behavior for PCIe is the same as for a connection to an already booted device
+    if (ghandler.protocol == PCIE) {
+        noBoot = 1;
+    }
+
+    int pid = getPidBy(platform);
+
+    if (noBoot)
+        pid = DEFAULT_OPENPID;
+
+    // Find any unbooted device or booted device and create deviceHandle
+
+    // A PCIe device can be found immediately; polling for it in a loop would only produce repeated file-open errors.
+    if (ghandler.protocol != PCIE) {
+        double waittm = timeInSeconds() + DEVICE_APPEAR_TIMEOUT_ON_OPEN;
+        while ((rc != X_LINK_SUCCESS) && (timeInSeconds() < waittm)) {
+            rc = XLinkGetDeviceName(0, name, NC_MAX_NAME_SIZE, pid);
+        }
+    } else {
+        // PCIe ignores the pid value
+        rc = XLinkGetDeviceName(0, name, NC_MAX_NAME_SIZE, DEFAULT_OPENPID);
+    }
+
+    if (rc != X_LINK_SUCCESS) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex));
+        GLOBAL_UNLOCK();
+        mvLog(MVLOG_ERROR, "Failed to find suitable device, rc: %s", XLinkErrorToStr(rc));
+        return parseXLinkError(rc);
+    }
+
+    // Allocate handler
+
+    struct ncDeviceHandle_t *dH = calloc(1, sizeof(*dH));
+    struct _devicePrivate_t *d = calloc(1, sizeof(*d));
+
+    if (dH && d) {
+        dH->private_data = d;
+        d->dev_addr = strdup(name);
+        d->device_mon_stream_id = INVALID_LINK_ID;
+        d->graph_monitor_stream_id = INVALID_LINK_ID;
+        d->wd_interval = watchdogInterval;
+        *deviceHandlePtr = dH;
+    } else {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex));
+        GLOBAL_UNLOCK();
+        mvLog(MVLOG_ERROR, "Memory allocation failed");
+        free(d);
+        free(dH);
+        return NC_OUT_OF_MEMORY;
+    }
+
+    if (d->dev_addr == NULL) {
+        destroyDeviceHandle(deviceHandlePtr);
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex));
+        GLOBAL_UNLOCK();
+        return NC_OUT_OF_MEMORY;
+    }
+
+    // Boot device
+    XLinkHandler_t* handler = calloc(1, sizeof(XLinkHandler_t));
+    if (!handler) {
+        mvLog(MVLOG_ERROR, "Memory allocation failed");
+        destroyDeviceHandle(deviceHandlePtr);
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex));
+        GLOBAL_UNLOCK();
+        return NC_OUT_OF_MEMORY;
+    }
+
+    handler->devicePath = (char*)d->dev_addr;
+
+    if (!noBoot) {
+        // Find firmware and boot device with it
+        char mv_cmd_file_path[MAX_PATH_LENGTH] = "\0";
+
+        // If a firmware directory path was passed in, use it
+        if (customFirmwareDirectory && strnlen(customFirmwareDirectory, MAX_PATH_LENGTH) > 1) {
+            strncpy(mv_cmd_file_path, customFirmwareDirectory, MAX_PATH_LENGTH - 1);
+            addEndPathSeparator(mv_cmd_file_path);
+            mv_cmd_file_path[MAX_PATH_LENGTH - 1] = '\0';
+        }
+
+        ncStatus_t sc;
+
+        if ((sc = getFirmwarePath(mv_cmd_file_path, d->dev_addr)) != 0) {
+            mvLog(MVLOG_ERROR, "Can't get firmware, error: %s", ncStatusToStr(sc));
+            free(handler);
+            destroyDeviceHandle(deviceHandlePtr);
+            CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex));
+            GLOBAL_UNLOCK();
+            return NC_MVCMD_NOT_FOUND;
+        }
+
+        mvLog(MVLOG_INFO, "%s() XLinkBootRemote is running for %s...\n", __func__, d->dev_addr);
+
+        // remember all currently available devices
+        char beforeBootDevices[MAX_DEVICES][NC_MAX_NAME_SIZE] = {{0}};
+        int n = 0;
+        for (; n < MAX_DEVICES; ++n) {
+            if (XLinkGetDeviceName(n, beforeBootDevices[n], NC_MAX_NAME_SIZE - 1, AUTO_PID))
+                break;
+        }
+
+        rc = XLinkBootRemote(d->dev_addr, mv_cmd_file_path);
+        if (rc) {
+            mvLog(MVLOG_WARN, "%s() XLinkBootRemote returned error %s for %s",
+                  __func__, XLinkErrorToStr(rc), d->dev_addr);
+        } else {
+            mvLog(MVLOG_INFO, "%s() XLinkBootRemote returned success %s for %s",
+                  __func__, XLinkErrorToStr(rc), d->dev_addr);
+        }
+
+        char booted_name[NC_MAX_NAME_SIZE] = "";
+
+        // After boot the device name should change
+        double waittm = timeInSeconds() + STATUS_WAIT_TIMEOUT;
+        int deviceBooted = 0;
+        while ((timeInSeconds() < waittm) && !deviceBooted) {
+            int dev_indx = 0;
+            for (; dev_indx < MAX_DEVICES; ++dev_indx) {
+                rc = XLinkGetDeviceName(dev_indx, booted_name, NC_MAX_NAME_SIZE, AUTO_PID);
+                booted_name[NC_MAX_NAME_SIZE - 1] = '\0';
+                if (rc != X_LINK_SUCCESS)
+                    break;
+
+                // If beforeBootDevices contains booted_name, this is not the device we are looking for
+                int seen_before_boot = 0;
+                n = 0;
+                for (; n < MAX_DEVICES; ++n) {
+                    if (strcmp(booted_name, beforeBootDevices[n]) == 0) {
+                        seen_before_boot = 1;
+                        break;
+                    }
+                }
+
+                if (seen_before_boot)
+                    continue;
+                handler->devicePath = (char *) booted_name;
+
+                rc = XLinkConnect(handler);
+                // The device must not already be in the devices pool
+                if (isDeviceOpened(booted_name) < 0 && rc == X_LINK_SUCCESS) {
+                    deviceBooted = 1;
+                    d->dev_addr_booted = strdup(booted_name);
+                    break;
+                }
+            }
+        }
+    } else {    // noBoot: connect to an already booted device
+        d->dev_addr_booted = strdup(d->dev_addr);
+        handler->devicePath = d->dev_addr_booted;
+        rc = XLinkConnect(handler);
+    }
+
+    if (rc != X_LINK_SUCCESS) {
+        mvLog(MVLOG_ERROR, "Failed connection to device (%s) with error %d", d->dev_addr, rc);
+        free(handler);
+        destroyDeviceHandle(deviceHandlePtr);
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex));
+        GLOBAL_UNLOCK();
+        return parseXLinkError(rc);
+    }
+    mvLog(MVLOG_INFO, "XLinkConnect done - link Id %d\n", handler->linkId);
+
+    int error = 0;
+    if ((error = pthread_mutex_init(&d->dev_data_m, NULL)) != 0) {
+        mvLog(MVLOG_ERROR, "pthread_mutex_init (dev_data_m) failed with error: %d", error);
+        free(handler);
+        destroyDeviceHandle(deviceHandlePtr);
+        return NC_ERROR;
+    }
+    // If a later mutex initialization fails, destroy the previously initialized mutexes
+    if ((error = pthread_mutex_init(&d->dev_stream_m, NULL)) != 0) {
+        mvLog(MVLOG_ERROR, "pthread_mutex_init (dev_stream_m) failed with error: %d", error);
+        CHECK_MUTEX_SUCCESS(pthread_mutex_destroy(&d->dev_data_m));
+        free(handler);
+        destroyDeviceHandle(deviceHandlePtr);
+        return NC_ERROR;
+    }
+    if ((error = pthread_mutex_init(&d->graph_stream_m, NULL)) != 0) {
+        mvLog(MVLOG_ERROR, "pthread_mutex_init (graph_stream_m) failed with error: %d", error);
+        CHECK_MUTEX_SUCCESS(pthread_mutex_destroy(&d->dev_data_m));
+        CHECK_MUTEX_SUCCESS(pthread_mutex_destroy(&d->dev_stream_m));
+        free(handler);
+        destroyDeviceHandle(deviceHandlePtr);
+        return NC_ERROR;
+    }
+
+    d->xlink = handler;
+    d->next = devices;
+    devices = d;
+
+    if (ghandler.protocol != PCIE) {
+        mvLog(MVLOG_INFO, "Booted %s (%s) -> %s\n",
+              d->dev_addr, d->dev_addr_booted,
+              d->dev_file ? d->dev_file : "VSC");
+    } else {
+        mvLog(MVLOG_INFO, "Booted %s -> %s\n",
+              d->dev_addr, d->dev_file ? d->dev_file : "PCIE");
+    }
+
+    sleepForSeconds(1);
+
+    CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex));
+    GLOBAL_UNLOCK();
+
+    streamId_t streamId = XLinkOpenStream(d->xlink->linkId, "deviceMonitor", CONFIG_STREAM_SIZE);
+    CHECK_STREAM_ID(streamId, {}, "can't open deviceMonitor stream");
+
+    d->device_mon_stream_id = streamId;
+
+#if !(defined(USE_PCIE) || defined(NO_BOOT))
+    watchdog_init_context(&d->watchdog_ctx);
+    watchdog_register_device(&d->watchdog_ctx, d);
+#endif
+
+    getDevAttributes(d);
+
+#if (!defined(_WIN32) && !defined(_WIN64))
+    printfOverXLinkOpen(d);
+#endif
+
+    streamId = XLinkOpenStream(d->xlink->linkId, "graphMonitor",
+                                BLOB_STREAM_SIZE);
+
+#if (!defined(_WIN32) && !defined(_WIN64))
+    CHECK_STREAM_ID(streamId, {
+           printfOverXLinkClose(d);
+    }, "can't open graphMonitor stream");
+#else
+    CHECK_STREAM_ID(streamId, {}, "can't open graphMonitor stream");
+#endif
+
+    d->graph_monitor_stream_id = streamId;
+    d->state = NC_DEVICE_OPENED;
+
+    return NC_OK;
+}
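+
+/*
+ * Illustrative usage sketch (not part of the library): a typical open/close
+ * sequence as implied by the API above. MYRIAD_X is one of the platform
+ * values handled by getPidBy(); 1000 is an arbitrary example watchdog
+ * interval, and NULL means "no custom firmware directory".
+ *
+ *     struct ncDeviceHandle_t *deviceHandle = NULL;
+ *     if (ncDeviceOpen(&deviceHandle, MYRIAD_X, 1000, NULL) == NC_OK) {
+ *         // ... allocate graphs and fifos, run inferences ...
+ *         ncDeviceClose(&deviceHandle);
+ *     }
+ */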
+
+ncStatus_t ncDeviceLoadFirmware(const ncDevicePlatform_t devicePlatform, const char* customFirmwareDir) {
+#if defined(USE_PCIE)
+    mvLog(MVLOG_WARN, "Firmware for PCIe can't be loaded with this application");
+    return NC_ERROR;
+#else
+    mvLog(MVLOG_WARN, "Boot (%s) without connecting to it", platformToStr(devicePlatform));
+    XLinkError_t rc;
+    ncStatus_t sc;
+
+    // Find device with specific platform
+    char deviceName[NC_MAX_NAME_SIZE];
+    rc = XLinkGetDeviceName(0, deviceName, NC_MAX_NAME_SIZE, getPidBy(devicePlatform));
+    if (rc) {
+        mvLog(MVLOG_WARN, "Failed to find (%s) platform device", platformToStr(devicePlatform));
+        return NC_DEVICE_NOT_FOUND;
+    }
+
+    // Find firmware
+    char mv_cmd_file_path[MAX_PATH_LENGTH] = "\0";
+    if (customFirmwareDir && strnlen(customFirmwareDir, MAX_PATH_LENGTH) > 1) {
+        strncpy(mv_cmd_file_path, customFirmwareDir, MAX_PATH_LENGTH - 1);
+        addEndPathSeparator(mv_cmd_file_path);
+        if (!isPathExists(customFirmwareDir)) {
+            return NC_MVCMD_NOT_FOUND;
+        }
+    }
+
+    if ((sc = getFirmwarePath(mv_cmd_file_path, deviceName)) != 0) {
+        mvLog(MVLOG_ERROR, "Can't get firmware, error: %s", ncStatusToStr(sc));
+        return NC_MVCMD_NOT_FOUND;
+    }
+
+    mvLog(MVLOG_INFO, "Trying to boot %s device", deviceName);
+    rc = XLinkBootRemote(deviceName, mv_cmd_file_path);
+    if (rc) {
+        mvLog(MVLOG_WARN, "%s() XLinkBootRemote returned error %s\n", __func__, XLinkErrorToStr(rc));
+    } else {
+        mvLog(MVLOG_INFO, "%s() XLinkBootRemote returned success %s\n", __func__, XLinkErrorToStr(rc));
+        sleepForSeconds(DEVICE_APPEAR_TIMEOUT_ON_OPEN);
+    }
+
+    return parseXLinkError(rc);
+#endif
+}
+
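+/*
+ * The class-0 device queries below (getDevAttributes, getThermalStats,
+ * getDeviceFrequency, getDeviceProfilingData, deviceGetDeviceMemory) all share
+ * one request/response pattern over the deviceMonitor stream: lock
+ * dev_stream_m, XLinkWriteData() a deviceCommand_t, XLinkReadData() the reply
+ * packet, validate its length, copy the payload out, then XLinkReleaseData()
+ * and unlock.
+ */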
+static ncStatus_t getDevAttributes(struct _devicePrivate_t *d) {
+    XLinkError_t rc = X_LINK_SUCCESS;
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&d->dev_stream_m), NC_ERROR);
+    deviceCommand_t config;
+    config.type.c0 = CLASS0_DEVICE_CAPABILITIES;
+    config.optionClass = NC_OPTION_CLASS0;
+    rc = XLinkWriteData(d->device_mon_stream_id, (const uint8_t*)&config, sizeof(config));
+    if (rc != X_LINK_SUCCESS) {
+        mvLog(MVLOG_ERROR, "Failed to write data, rc: %s", XLinkErrorToStr(rc));
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_stream_m));
+        return parseXLinkError(rc);
+    }
+    streamPacketDesc_t* packet = 0;
+    rc = XLinkReadData(d->device_mon_stream_id, &packet);
+    if (rc != X_LINK_SUCCESS || !packet) {
+        mvLog(MVLOG_ERROR, "Failed to read data, rc: %s", XLinkErrorToStr(rc));
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_stream_m));
+        return parseXLinkError(rc);
+    }
+    if (packet->length != sizeof(d->dev_attr)) {
+        mvLog(MVLOG_ERROR, "Broken protocol. DevData can't be read\n");
+        if (XLinkReleaseData(d->device_mon_stream_id) != X_LINK_SUCCESS) {
+            mvLog(MVLOG_WARN, "Failed to release data\n");
+        }
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_stream_m));
+        return NC_ERROR;
+    }
+    d->dev_attr = *(deviceCapabilities_t*)packet->data;
+    rc = XLinkReleaseData(d->device_mon_stream_id);
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_unlock(&d->dev_stream_m), NC_ERROR);
+    if (rc != X_LINK_SUCCESS) {
+        mvLog(MVLOG_WARN, "Failed to release data, rc: %s", XLinkErrorToStr(rc));
+    }
+    mvLog(MVLOG_INFO, "Device attributes\n");
+    mvLog(MVLOG_INFO, "Device FW version: %x.%x.%x.%x\n", d->dev_attr.fw_version[0],
+          d->dev_attr.fw_version[1], d->dev_attr.fw_version[2], d->dev_attr.fw_version[3]);
+    mvLog(MVLOG_INFO, "Maximum graphs: %d\n", d->dev_attr.max_graphs);
+    mvLog(MVLOG_INFO, "Maximum fifos: %d\n", d->dev_attr.max_fifos);
+    mvLog(MVLOG_INFO, "Maximum graph option class: %d\n", d->dev_attr.max_graph_opt_class);
+    mvLog(MVLOG_INFO, "Maximum device option class: %d\n", d->dev_attr.max_device_opt_class);
+    mvLog(MVLOG_INFO, "Device memory capacity: %d\n", d->dev_attr.max_memory);
+    return NC_OK;
+}
+
+static ncStatus_t getThermalStats(struct _devicePrivate_t *d){
+    if (!d->thermal_stats){
+        d->thermal_stats = calloc(THERMAL_THROTTLING_BUFFER_SIZE, 1);
+        if (!d->thermal_stats)
+            return NC_OUT_OF_MEMORY;
+    }
+    deviceCommand_t config;
+    config.type.c0 = CLASS0_THERMAL_STATS;
+    config.optionClass = NC_OPTION_CLASS0;
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&d->dev_stream_m), NC_ERROR);
+    XLinkError_t rc = X_LINK_SUCCESS;
+    rc = XLinkWriteData(d->device_mon_stream_id, (const uint8_t*)&config, sizeof(config));
+    if (rc != X_LINK_SUCCESS) {
+        mvLog(MVLOG_ERROR, "Failed to write data, rc: %s", XLinkErrorToStr(rc));
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_stream_m));
+        return parseXLinkError(rc);
+    }
+    streamPacketDesc_t* packet = 0;
+    rc = XLinkReadData(d->device_mon_stream_id, &packet);
+    if (rc != X_LINK_SUCCESS || !packet) {
+        mvLog(MVLOG_ERROR, "Failed to read data, rc: %s", XLinkErrorToStr(rc));
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_stream_m));
+        return parseXLinkError(rc);
+    }
+    if (packet->length != THERMAL_THROTTLING_BUFFER_SIZE) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_stream_m));
+        return NC_ERROR;
+    }
+    memcpy(d->thermal_stats, packet->data, packet->length);
+    rc = XLinkReleaseData(d->device_mon_stream_id);
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_unlock(&d->dev_stream_m), NC_ERROR);
+    if (rc != X_LINK_SUCCESS) {
+        mvLog(MVLOG_WARN,"Failed to release data, rc: %s", XLinkErrorToStr(rc));
+    }
+    return NC_OK;
+}
+
+static ncStatus_t getDeviceFrequency(struct _devicePrivate_t *d){
+    deviceCommand_t config;
+    config.type.c0 = CLASS0_DEVICE_QUERY_CLOCKS;
+    config.optionClass = NC_OPTION_CLASS0;
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&d->dev_stream_m), NC_ERROR);
+    XLinkError_t rc = X_LINK_SUCCESS;
+    rc = XLinkWriteData(d->device_mon_stream_id, (const uint8_t*)&config, sizeof(config));
+    if (rc != X_LINK_SUCCESS) {
+        mvLog(MVLOG_ERROR, "Failed to write data, rc: %s", XLinkErrorToStr(rc));
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_stream_m));
+        return parseXLinkError(rc);
+    }
+    streamPacketDesc_t* packet = 0;
+
+    rc = XLinkReadData(d->device_mon_stream_id, &packet);
+    if (rc != X_LINK_SUCCESS || !packet) {
+        mvLog(MVLOG_ERROR, "Failed to read data, rc: %s", XLinkErrorToStr(rc));
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_stream_m));
+        return parseXLinkError(rc);
+    }
+
+    if (packet->length != sizeof(uint32_t)) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_stream_m));
+        return NC_ERROR;
+    }
+    memcpy(&d->deviceFreq, packet->data, packet->length);
+    rc = XLinkReleaseData(d->device_mon_stream_id);
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_unlock(&d->dev_stream_m), NC_ERROR);
+    if (rc != X_LINK_SUCCESS) {
+        mvLog(MVLOG_WARN,"Failed to release data, rc: %s", XLinkErrorToStr(rc));
+    }
+    return NC_OK;
+}
+
+static ncStatus_t getDeviceProfilingData(struct _devicePrivate_t *d){
+    deviceCommand_t config;
+    config.type.c0 = CLASS0_DEVICE_PROFILING_DATA;
+    config.optionClass = NC_OPTION_CLASS0;
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&d->dev_stream_m), NC_ERROR);
+    XLinkError_t rc = X_LINK_SUCCESS;
+    rc = XLinkWriteData(d->device_mon_stream_id, (const uint8_t*)&config, sizeof(config));
+    if (rc != X_LINK_SUCCESS) {
+        mvLog(MVLOG_ERROR, "Failed to write data, rc: %s", XLinkErrorToStr(rc));
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_stream_m));
+        return parseXLinkError(rc);
+    }
+    streamPacketDesc_t* packet = 0;
+
+    rc = XLinkReadData(d->device_mon_stream_id, &packet);
+    if (rc != X_LINK_SUCCESS || !packet) {
+        mvLog(MVLOG_ERROR, "Failed to read data, rc: %s", XLinkErrorToStr(rc));
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_stream_m));
+        return parseXLinkError(rc);
+    }
+
+    d->receivedData = packet->length;
+    if (d->profilingBuffer == 0) {
+        d->profilingBuffer = (uint8_t*) malloc(profUpperBound);
+        if (!d->profilingBuffer) {
+            if (XLinkReleaseData(d->device_mon_stream_id) != X_LINK_SUCCESS) {
+                mvLog(MVLOG_WARN, "Failed to release data\n");
+            }
+            CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_stream_m));
+            return NC_OUT_OF_MEMORY;
+        }
+    }
+
+    if (packet->length > profUpperBound) {
+        d->receivedData = profUpperBound;
+    }
+    memcpy(d->profilingBuffer, packet->data, d->receivedData);
+    rc = XLinkReleaseData(d->device_mon_stream_id);
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_unlock(&d->dev_stream_m), NC_ERROR);
+    if (rc != X_LINK_SUCCESS) {
+        mvLog(MVLOG_WARN,"Failed to release data, rc: %s", XLinkErrorToStr(rc));
+    }
+    return NC_OK;
+}
+
+static ncStatus_t deviceGetDeviceMemory(struct _devicePrivate_t *d,
+                                        uint32_t * mem)
+{
+    deviceCommand_t config;
+    config.type.c0 = CLASS0_DEVICE_USED_MEMORY;
+    config.optionClass = NC_OPTION_CLASS0;
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&d->dev_stream_m), NC_ERROR);
+    if (XLinkWriteData(d->device_mon_stream_id, (const uint8_t *) &config,
+                       sizeof(config)) != 0) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_stream_m));
+        return NC_ERROR;
+    }
+    streamPacketDesc_t *packet = 0;
+
+    if (XLinkReadData(d->device_mon_stream_id, &packet) != 0 || !packet) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_stream_m));
+        return NC_ERROR;
+    }
+
+    if (packet->length != (sizeof(uint32_t))) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_stream_m));
+        return NC_ERROR;
+    }
+    memcpy(mem, packet->data, packet->length);
+    XLinkReleaseData(d->device_mon_stream_id);
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_unlock(&d->dev_stream_m), NC_ERROR);
+    return NC_OK;
+}
+
+static ncStatus_t deviceSetStdIO2XLink(struct _devicePrivate_t *d, uint32_t data)
+{
+    deviceCommand_t config;
+    config.type.c2 = CLASS2_SET_STDIO_REDIRECT_XLINK;
+    config.optionClass = NC_OPTION_CLASS2;
+    config.data = data;
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&d->dev_stream_m), NC_ERROR);
+    if (XLinkWriteData(d->device_mon_stream_id, (const uint8_t *) &config,
+                       sizeof(config)) != 0) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_stream_m));
+        return NC_ERROR;
+    }
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_unlock(&d->dev_stream_m), NC_ERROR);
+    return NC_OK;
+}
+
+#if (!defined(_WIN32) && !defined(_WIN64))
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+
+static void fprintfsock( int s, const char* fmt, ... ) {
+    char* buf = NULL;
+    const char* ptext;
+    int len;
+    va_list args;
+    if(fmt == NULL) {
+        va_start( args, fmt);
+        ptext = va_arg(args,const char*);
+        len = va_arg(args, int);
+        va_end(args);
+    } else {
+        va_start( args, fmt );
+        len = vsnprintf( 0, 0, fmt, args ) + 1;
+        buf = (char*) malloc(len);
+        if (buf == NULL) {
+            va_end (args);
+            return;
+        }
+        va_start( args, fmt );
+        vsnprintf( buf, len, fmt, args );
+        va_end (args);
+        ptext = buf;
+    }
+
+    if(s < 0) {
+        (void) write( 1, ptext, len );
+    } else {
+        if(send( s, ptext, len, 0 ) < 0)
+        {
+            fprintf(stderr,"WARNING in fprintfsock: not all data has been sent\n");
+        }
+    }
+
+    if(buf)
+        free( buf );
+}
+
+static void* debugConsoleThreadReader(void* ctx) {
+    struct _devicePrivate_t *d = (struct _devicePrivate_t *) ctx;
+    streamId_t streamId = d->printf_over_xlink_stream_id;
+    int connfd = d->printf_over_xlink_conn_fd;
+    streamPacketDesc_t * packet;
+    XLinkError_t xerr;
+
+    fprintfsock(connfd, "XLinkConsole receiving loop begins\n");
+    fprintfsock(connfd, "=========================================\n");
+    while(1){
+        // use 0 as the timeout to avoid triggering a false reset
+        xerr = XLinkReadDataWithTimeOut(streamId, &packet, 0);
+        if(X_LINK_SUCCESS != xerr || packet == NULL)
+            break;
+        fprintfsock(connfd, NULL, packet->data, packet->length);
+        XLinkReleaseData(streamId);
+    }
+    fprintfsock(connfd, "=========================================\n"
+                        "Session closed (%d)\n", xerr);
+    close(connfd);
+    return NULL;
+}
+
+static void printfOverXLinkClose(struct _devicePrivate_t *d) {
+    if(d->printf_over_xlink_stream_id != INVALID_STREAM_ID) {
+        /* Tell the device to stop redirecting STDIO to the XLink console */
+        deviceSetStdIO2XLink(d, 0);
+        XLinkCloseStream(d->printf_over_xlink_stream_id);
+        d->printf_over_xlink_stream_id = INVALID_STREAM_ID;
+    }
+
+    if(d->printf_over_xlink_thr_valid) {
+        pthread_cancel(d->printf_over_xlink_thr);
+        d->printf_over_xlink_thr_valid = 0;
+    }
+
+    if(d->printf_over_xlink_conn_fd >= 0) {
+        close(d->printf_over_xlink_conn_fd);
+        d->printf_over_xlink_conn_fd = -1;
+    }
+}
+
+// FIXME: update the function below to use mvLog instead of printf for consistency: #16773
+static void printfOverXLinkOpen(struct _devicePrivate_t *d) {
+    int linkId = d->xlink->linkId;
+    const char * streamName = "console";
+    streamId_t streamId = INVALID_STREAM_ID;
+    char * cfg_use_xlink_printf = NULL;
+
+    d->printf_over_xlink_stream_id = INVALID_STREAM_ID;
+    d->printf_over_xlink_conn_fd = -1;
+    d->printf_over_xlink_thr_valid = 0;
+
+    /* export XLINK_PRINTF=1 to enable this feature */
+    cfg_use_xlink_printf = getenv("XLINK_PRINTF");
+    if(cfg_use_xlink_printf == NULL)
+        return;
+    if(strcmp(cfg_use_xlink_printf, "1") != 0)
+        return;
+
+    /* Tell the device to redirect STDIO to the XLink console */
+    deviceSetStdIO2XLink(d, 1);
+
+    streamId = XLinkOpenStream(linkId, streamName, 10*1024);
+    if(streamId == INVALID_STREAM_ID) {
+        fprintf(stderr,"ERROR in XLinkOpenStream: %s\n", streamName);
+        return;
+    }
+
+    const char * servername = "localhost";
+    struct hostent *server;
+    struct sockaddr_in serv_addr;
+
+    server = gethostbyname(servername);
+    if (server == NULL) {
+        fprintf(stderr,"ERROR in gethostbyname: %s\n", servername);
+        return;
+    }
+
+    int portNum = 7788;
+    int connfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+
+    bzero((char *) &serv_addr, sizeof(serv_addr));
+    serv_addr.sin_family = AF_INET;
+    bcopy((char *)server->h_addr, (char *)&serv_addr.sin_addr.s_addr, server->h_length);
+    serv_addr.sin_port = htons(portNum);
+
+    /* Now connect to the server */
+    if (connect(connfd, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) < 0) {
+        printf("WARNNING: Cannot connect to XlinkPrintf debug console server, will print in current console instead\n");
+        // even when no debug server, we still need drain possible debug information out of the XLink
+        // or it will hang
+        close(connfd);
+        connfd = -1;
+    }
+
+    d->printf_over_xlink_stream_id = streamId;
+    d->printf_over_xlink_conn_fd = connfd;
+    if(pthread_create(&d->printf_over_xlink_thr, NULL, debugConsoleThreadReader, (void*) d)){
+        fprintf(stderr,"ERROR in creating XlinkPrintf debug console reader thread!\n");
+        printfOverXLinkClose (d);
+    }else {
+        d->printf_over_xlink_thr_valid = 1;
+    }
+}
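+
+/*
+ * Usage note (sketch): export XLINK_PRINTF=1 before opening the device and
+ * start a TCP server listening on localhost:7788 (for example `nc -lk 7788`)
+ * to receive the device console output; if no server is reachable, the output
+ * is written to stdout instead.
+ */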
+
+#endif
+
+
+static int findDevice(struct _devicePrivate_t *deviceHandle)
+{
+    struct _devicePrivate_t *d = devices;
+
+    while (d) {
+        if (d == deviceHandle)
+            return 0;
+        d = d->next;
+    }
+
+    return -1;
+}
+
+static int deviceGetNumberOfGraphs(struct _devicePrivate_t *deviceHandle)
+{
+    if (deviceHandle == NULL)
+        return 0;
+    int num = 0;
+    struct _graphPrivate_t *g = deviceHandle->graphs;
+    while (g) {
+        num++;
+        g = g->next;
+    }
+    return num;
+}
+
+static int deviceGetNumberOfFifos(struct _devicePrivate_t *deviceHandle)
+{
+    if (deviceHandle == NULL)
+        return 0;
+    int num = 0;
+    struct _fifoPrivate_t *f = deviceHandle->fifos;
+    while (f) {
+        num++;
+        f = f->next;
+    }
+    return num;
+}
+
+static int findGraph(struct _graphPrivate_t *graphHandle)
+{
+    struct _devicePrivate_t *d = devices;
+
+    while (d) {
+        struct _graphPrivate_t *g = d->graphs;
+        while (g) {
+            if (g == graphHandle)
+                return 0;
+            g = g->next;
+        }
+        d = d->next;
+    }
+
+    return -1;
+}
+
+// Defined here as it will be used twice
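+// Both deallocateGraph() and deallocateFifo() below return 0 when the element
+// was found and unlinked, and -1 otherwise (the `-!found` idiom).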
+static int deallocateGraph(struct _graphPrivate_t *g)
+{
+    int found = 0;
+    if (!g) {
+        return -!found;
+    }
+    // Remove it from the list of the associated device
+    if (g->dev->graphs == g) {
+        g->dev->graphs = g->next;
+        found = 1;
+    } else {
+        struct _graphPrivate_t *gp = g->dev->graphs;
+        while (gp->next) {
+            if (gp->next == g) {
+                found = 1;
+                gp->next = gp->next->next;
+                break;
+            }
+            gp = gp->next;
+        }
+    }
+
+    // Free it with all its data
+    if (found) {
+        free(g->aux_buffer);
+    }
+    g->state = NC_GRAPH_DEALLOCATED;
+    return -!found;
+}
+
+static int findFifo(struct _fifoPrivate_t *f)
+{
+    if (!f || !f->dev)
+        return 0;
+
+    if (f->dev->fifos == f) {
+        return 1;
+    } else {
+        struct _fifoPrivate_t *fp = f->dev->fifos;
+        while (fp->next) {
+            if (fp->next == f) {
+                return 1;
+            }
+            fp = fp->next;
+        }
+    }
+    return 0;
+}
+
+static int deallocateFifo(struct _fifoPrivate_t *f)
+{
+    int found = 0;
+    if (!f) {
+        return -!found;
+    }
+    // Remove it from the list of the associated device
+    if (f->dev->fifos == f) {
+        f->dev->fifos = f->next;
+        found = 1;
+    } else {
+        struct _fifoPrivate_t *fp = f->dev->fifos;
+        while (fp->next) {
+            if (fp->next == f) {
+                found = 1;
+                fp->next = fp->next->next;
+                break;
+            }
+            fp = fp->next;
+        }
+    }
+
+    // Free it with all its data
+    if (found) {
+        //deallocate on device
+        XLinkCloseStream(f->streamId);
+        struct _userParamPrivate_t *temp;
+        while (f->user_param_in) {
+            temp = f->user_param_in;
+            f->user_param_in = f->user_param_in->next;
+            free(temp);
+        }
+        while (f->user_param_out) {
+            temp = f->user_param_out;
+            f->user_param_out = f->user_param_out->next;
+            free(temp);
+        }
+    }
+    f->state = NC_FIFO_DEALLOCATED;
+    return -!found;
+}
+
+static ncStatus_t destroyDeviceHandle(struct ncDeviceHandle_t **deviceHandlePtr) {
+    if (!deviceHandlePtr) {
+        mvLog(MVLOG_ERROR, "Handle is NULL");
+        return NC_INVALID_HANDLE;
+    }
+    if (!(*deviceHandlePtr)) {
+        mvLog(MVLOG_INFO, "Handle already destroyed");
+        return NC_OK;
+    }
+
+    mvLog(MVLOG_INFO, "Destroying device handler");
+
+    struct _devicePrivate_t *d = (*deviceHandlePtr)->private_data;
+
+    if(d->next) {
+        mvLog(MVLOG_WARN, "Device could be in mvnc devices list");
+    }
+
+    free(d->thermal_stats);
+    free(d->dev_addr);
+    free(d->dev_addr_booted);
+
+    free(d->dev_file);
+    free(d->optimisation_list);
+
+    free(d->xlink);
+
+    free(d->profilingBuffer);
+
+    free(d);
+    (*deviceHandlePtr)->private_data = NULL;
+    free((*deviceHandlePtr));
+    *deviceHandlePtr = NULL;
+
+    return NC_OK;
+}
+
+ncStatus_t ncDeviceClose(struct ncDeviceHandle_t **deviceHandlePtr) {
+    int found = 0;
+    XLinkError_t rc = X_LINK_SUCCESS;
+
+    if (!deviceHandlePtr) {
+        mvLog(MVLOG_ERROR, "Handle is NULL");
+        return NC_INVALID_HANDLE;
+    }
+    if (!(*deviceHandlePtr)) {
+        mvLog(MVLOG_INFO, "Handle already destroyed");
+        return NC_OK;
+    }
+
+    struct _devicePrivate_t *d = (*deviceHandlePtr)->private_data;
+    if (!d) {
+        mvLog(MVLOG_ERROR, "Device has been destroyed");
+        return NC_INVALID_HANDLE;
+    }
+
+    int wasConnectedToBooted = 0;
+    if (d->dev_addr != NULL && d->dev_addr_booted != NULL &&
+        strncmp(d->dev_addr, d->dev_addr_booted, NC_MAX_NAME_SIZE) == 0) {
+        wasConnectedToBooted = 1;       // This also holds for PCIe
+    }
+
+    GLOBAL_LOCK();
+    if (findDevice(d)) {
+        GLOBAL_UNLOCK();
+        return NC_INVALID_PARAMETERS;
+    }
+    mvLog(MVLOG_INFO, "Removing device...");
+
+    // Remove it from our list
+    if (devices == d) {
+        devices = d->next;
+        found = 1;
+    } else {
+        struct _devicePrivate_t *dp = devices;
+        while (dp->next) {
+            if (dp->next == d) {
+                found = 1;
+                dp->next = dp->next->next;
+                break;
+            }
+            dp = dp->next;
+        }
+    }
+    d->next = NULL;
+
+    if (!found) {
+        GLOBAL_UNLOCK();
+        return NC_INVALID_PARAMETERS;
+    }
+    // Deallocate all associated graphs
+    CHECK_MUTEX_SUCCESS(pthread_mutex_lock(&d->dev_data_m));
+    if (d->graphs) {
+        mvLog(MVLOG_WARN,
+              "Graphs on the device haven't been destroyed! They will be deallocated");
+        while (deallocateGraph(d->graphs) != -1) {
+            mvLog(MVLOG_INFO, "Graph was deallocated");
+        }
+    }
+    // Deallocate all associated fifos
+    if (d->fifos) {
+        mvLog(MVLOG_WARN,
+              "Fifos on the device haven't been destroyed! They will be deallocated");
+        while (deallocateFifo(d->fifos) != -1) {
+            mvLog(MVLOG_INFO, "Fifo was deallocated");
+        }
+    }
+
+#if (!defined(_WIN32) && !defined(_WIN64))
+    printfOverXLinkClose(d);
+#endif
+
+    if (d->state != NC_DEVICE_FAILED) {
+        // #17801
+#if !(defined(USE_PCIE) || defined(NO_BOOT))
+        if (d->device_mon_stream_id != INVALID_LINK_ID) {
+            rc = XLinkCloseStream(d->device_mon_stream_id);
+            if (rc)
+                mvLog(MVLOG_WARN,"Failed to close stream, rc: %s", XLinkErrorToStr(rc));
+        }
+        if (d->graph_monitor_stream_id != INVALID_LINK_ID) {
+            rc = XLinkCloseStream(d->graph_monitor_stream_id);
+            if (rc)
+                mvLog(MVLOG_WARN,"Failed to close stream, rc: %s", XLinkErrorToStr(rc));
+        }
+#endif
+        // Reset the device.
+        // If we opened an already booted device (or PCIe), just close the connection to it.
+        rc = XLinkResetRemote(d->xlink->linkId);
+        if (wasConnectedToBooted) {
+            mvLog(MVLOG_INFO, "Only device handle will be released and link to device closed");
+            if (rc)
+                mvLog(MVLOG_WARN, "Failed to close link to device, rc: %s", XLinkErrorToStr(rc));
+        } else {
+            if (rc)
+                mvLog(MVLOG_WARN, "Failed to reset, rc: %s", XLinkErrorToStr(rc));
+        }
+    }
+
+#if !(defined(USE_PCIE) || defined(NO_BOOT))
+    watchdog_unregister_device(&d->watchdog_ctx);
+#endif
+
+    d->state = NC_DEVICE_CLOSED;
+
+    CHECK_MUTEX_SUCCESS(pthread_mutex_destroy(&d->graph_stream_m));
+    CHECK_MUTEX_SUCCESS(pthread_mutex_destroy(&d->dev_stream_m));
+
+    CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_data_m));
+    CHECK_MUTEX_SUCCESS(pthread_mutex_destroy(&d->dev_data_m));
+
+    if (!wasConnectedToBooted) {
+        int device_appear_after_reboot = 0;
+
+        // Wait for the unbooted device to appear in the USB list
+        double waittm = timeInSeconds() + DEVICE_APPEAR_TIMEOUT_ON_CLOSE;
+        while (timeInSeconds() < waittm) {
+            // check current devices
+            // wait for booted name to disappear
+            // wait for unbooted name to appear
+            // sometimes both names can be present in the list of usb devices
+            char device_name[NC_MAX_NAME_SIZE] = "";
+            int booted_disappeared = 1;
+            int unbooted_appeared = 0;
+
+            int n = 0;
+            while (XLinkGetDeviceName(n++, device_name, NC_MAX_NAME_SIZE - 1, AUTO_PID) == X_LINK_SUCCESS) {
+                if (d->dev_addr_booted != NULL &&
+                    strcmp(device_name, d->dev_addr_booted) == 0) {
+                    booted_disappeared = 0;
+                    break;
+                }
+
+                if (d->dev_addr != NULL &&
+                    strcmp(device_name, d->dev_addr) == 0) {
+                    unbooted_appeared = 1;
+                }
+            }
+
+            if (!(booted_disappeared && unbooted_appeared)) {
+                continue;
+            } else {
+                device_appear_after_reboot = 1;
+                break;
+            }
+        }
+
+        if (device_appear_after_reboot == 0) {
+            mvLog(MVLOG_ERROR, "Device didn't appear after reboot");
+        }
+    } else {
+        // #16971
+        sleepForSeconds(2);
+    }
+
+    ncStatus_t status = destroyDeviceHandle(deviceHandlePtr);
+    GLOBAL_UNLOCK();
+    if (status != NC_OK)
+        mvLog(MVLOG_WARN, "Destroying device handle failed with error %s", ncStatusToStr(status));
+    return status;
+}
+
+ncStatus_t ncGraphCreate(const char *name,
+                         struct ncGraphHandle_t ** graphHandle)
+{
+    if ((!name) || (!graphHandle)) {
+        mvLog(MVLOG_ERROR, "Some of the parameters are NULL");
+        return NC_INVALID_PARAMETERS;
+    }
+
+    struct ncGraphHandle_t *gH = calloc(1, sizeof(*gH));
+    struct _graphPrivate_t *g = calloc(1, sizeof(*g));
+
+    if (!gH || !g) {
+        free(g);
+        free(gH);
+        mvLog(MVLOG_ERROR, "Memory allocation failed");
+        return NC_OUT_OF_MEMORY;
+    }
+
+    gH->private_data = g;
+    strncpy(g->name, name, NC_MAX_NAME_SIZE - 1);
+    g->name[NC_MAX_NAME_SIZE - 1] = '\0';
+    g->batch_size = 1;
+    g->dev = NULL;
+    g->executors_number = 1;
+    g->started = 0;
+    g->state = NC_GRAPH_CREATED;
+    *graphHandle = gH;
+    return NC_OK;
+}
+
+ncStatus_t sendGraphMonitorRequest(streamId_t graphMonStream, graphMonCommand_t *cmd) {
+    XLinkError_t rc = XLinkWriteData(graphMonStream, (uint8_t*)cmd, sizeof(*cmd));
+    if (rc)
+        return parseXLinkError(rc);
+    return NC_OK;
+}
+
+ncStatus_t checkGraphMonitorResponse(streamId_t graphMonStream) {
+    streamPacketDesc_t *ack = NULL;
+    XLinkError_t rc = X_LINK_SUCCESS;
+    rc = XLinkReadData(graphMonStream, &ack);
+    if (rc) {
+        mvLog(MVLOG_ERROR, "XLink error, rc: %s", XLinkErrorToStr(rc));
+        return parseXLinkError(rc);
+    }
+
+    int value = 0;
+    if (ack) {
+        value = *((int*)ack->data);
+    } else {
+        mvLog(MVLOG_ERROR, "Error with stream packet");
+        return NC_ERROR;
+    }
+
+    rc = XLinkReleaseData(graphMonStream);
+    if (rc) {
+        mvLog(MVLOG_ERROR, "XLink error, rc: %s", XLinkErrorToStr(rc));
+    }
+    if (value != 0){
+        mvLog(MVLOG_ERROR, "Graph monitor request returned error %d", value);
+        return NC_MYRIAD_ERROR;
+    }
+
+    return NC_OK;
+}
+
+static void lockAllInferences() {
+    struct _devicePrivate_t *d = devices;
+    while (d) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_lock(&d->graph_stream_m));
+        d = d->next;
+    }
+    return;
+}
+
+static void unlockAllInferences() {
+    struct _devicePrivate_t *d = devices;
+    while (d) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->graph_stream_m));
+        d = d->next;
+    }
+    return;
+}
+
+ncStatus_t ncGraphAllocate(struct ncDeviceHandle_t * deviceHandle,
+                           struct ncGraphHandle_t * graphHandle,
+                           const void *graphBuffer,
+                           unsigned int graphBufferLength,
+                           const void *graphHeader,
+                           unsigned int graphHeaderLength)
+{
+    CHECK_HANDLE_CORRECT(deviceHandle);
+    CHECK_HANDLE_CORRECT(graphHandle);
+    CHECK_HANDLE_CORRECT(graphHeader);
+    CHECK_HANDLE_CORRECT(graphBuffer);
+
+    ncStatus_t rc = NC_OK;
+    XLinkError_t xl_error = X_LINK_SUCCESS;
+    mvLog(MVLOG_INFO, "Starting Graph allocation sequence\n");
+
+    if (graphHeaderLength > graphBufferLength) {
+        mvLog(MVLOG_ERROR, "graphHeaderLength > graphBufferLength");
+        return NC_INVALID_PARAMETERS;
+    }
+
+    static int graphIdCount = 0;
+    struct _graphPrivate_t *g = graphHandle->private_data;
+
+    struct _devicePrivate_t *d = devices;
+
+    GLOBAL_LOCK();
+    while (d) {
+        if (d == deviceHandle->private_data)
+            break;
+        d = d->next;
+    }
+    //TODO: review lists of devices and graphs internally.
+    //TODO: check if the graph is not already on the device
+    if (!d) {
+        GLOBAL_UNLOCK();
+        mvLog(MVLOG_ERROR, "Device not found!");
+        return NC_INVALID_PARAMETERS;
+    }
+    GLOBAL_UNLOCK();
+
+    // Check the graph size against the attributes of the target device
+    // (not the head of the devices list, which may be a different device)
+    if (graphBufferLength > d->dev_attr.max_memory) {
+        mvLog(MVLOG_ERROR, "The graph file is bigger than the device memory");
+        return NC_OUT_OF_MEMORY;
+    }
+
+    lockAllInferences();
+    g->id = graphIdCount++;
+    streamId_t streamId;
+
+    if (g->executors_number > d->dev_attr.max_executors) {
+        mvLog(MVLOG_ERROR, "Executors number is greater than max allowed!");
+        unlockAllInferences();
+        return NC_INVALID_PARAMETERS;
+    }
+
+    graphMonCommand_t cmd;
+    cmd.cmdClass = GRAPH_MON_CLASS_GRAPH_CMD;
+    cmd.cmd.graphCmd.type = GRAPH_VERIFY_CMD;
+    snprintf(cmd.cmd.graphCmd.streamName, 16, "graphBuffer%d", g->id);
+    streamId = XLinkOpenStream(d->xlink->linkId, cmd.cmd.graphCmd.streamName, graphBufferLength);
+    CHECK_STREAM_ID(streamId, unlockAllInferences(), "can't open stream for graphBuffer transmission");
+
+    cmd.cmd.graphCmd.id = g->id;
+    cmd.cmd.graphCmd.executors_number = g->executors_number;
+
+    if((rc = sendGraphMonitorRequest(d->graph_monitor_stream_id, &cmd)) != 0){
+        mvLog(MVLOG_ERROR, "can't send graph allocation command");
+        unlockAllInferences();
+        return rc;
+    }
+    xl_error = XLinkWriteData(streamId, graphHeader, graphHeaderLength);
+    if (xl_error) {
+        mvLog(MVLOG_ERROR, "can't send graph header data to device, rc: %s", XLinkErrorToStr(xl_error));
+        unlockAllInferences();
+        return parseXLinkError(xl_error);
+    }
+    // For now a simple status code is used for the graph header verification result
+    if ((rc = checkGraphMonitorResponse(d->graph_monitor_stream_id)) != 0) {
+        mvLog(MVLOG_ERROR, "can't receive graph header verification response");
+        unlockAllInferences();
+        return rc;
+    }
+
+    // Now send the whole graph with the same header
+    cmd.cmd.graphCmd.type = GRAPH_ALLOCATE_CMD;
+
+    if(sendGraphMonitorRequest(d->graph_monitor_stream_id, &cmd)){
+        mvLog(MVLOG_ERROR, "can't send graph allocation command");
+        unlockAllInferences();
+        return NC_ERROR;
+    }
+    xl_error = XLinkWriteData(streamId, graphBuffer, graphBufferLength);
+    if (xl_error) {
+        mvLog(MVLOG_ERROR, "can't send graph data to device, rc: %s", XLinkErrorToStr(xl_error));
+        unlockAllInferences();
+        return parseXLinkError(xl_error);
+    }
+    mvLog(MVLOG_INFO, "Sent graph");
+    streamPacketDesc_t * tensorDescIn = 0;
+    streamPacketDesc_t * tensorDescOut = 0;
+    streamPacketDesc_t * nstages = 0;
+
+    xl_error = XLinkReadData(streamId, &tensorDescIn);
+    if (xl_error) {
+        mvLog(MVLOG_ERROR, "Can't read input tensor descriptors of the graph, rc: %s", XLinkErrorToStr(xl_error));
+        unlockAllInferences();
+        return parseXLinkError(xl_error);
+    }
+    xl_error = XLinkReadData(streamId, &tensorDescOut);
+    if (xl_error) {
+        mvLog(MVLOG_ERROR, "Can't read output tensor descriptors of the graph, rc: %s", XLinkErrorToStr(xl_error));
+        unlockAllInferences();
+        return parseXLinkError(xl_error);
+    }
+    xl_error = XLinkReadData(streamId, &nstages);
+    if (xl_error || nstages == NULL) {
+        mvLog(MVLOG_WARN, "Can't read nstages, rc: %s", XLinkErrorToStr(xl_error));
+        unlockAllInferences();
+        return parseXLinkError(xl_error);
+    }
+    // for now, support only count 1
+    if(!tensorDescIn ||
+        tensorDescIn->length % sizeof(struct tensorDescriptor_t) ||
+        tensorDescIn->length / sizeof(struct tensorDescriptor_t) > 1) {
+        mvLog(MVLOG_ERROR, "Input tensor descriptors of the graph are invalid\n");
+        if (tensorDescIn)
+            mvLog(MVLOG_ERROR, "Received data from graph %d\n", *(int*)tensorDescIn->data);
+        rc = NC_MYRIAD_ERROR;
+    }
+    // for now, support only count 1
+    if(!tensorDescOut ||
+        tensorDescOut->length % sizeof(struct tensorDescriptor_t) ||
+        tensorDescOut->length / sizeof(struct tensorDescriptor_t) > 1) {
+        mvLog(MVLOG_ERROR, "Output tensor descriptors of the graph are invalid\n");
+        rc = NC_MYRIAD_ERROR;
+    }
+    if (rc == NC_OK){
+        g->input_count = tensorDescIn->length / sizeof(struct tensorDescriptor_t);
+        memcpy(&g->input_tensor_desc, tensorDescIn->data,
+               sizeof(struct tensorDescriptor_t));
+        g->output_count = tensorDescOut->length / sizeof(struct tensorDescriptor_t);
+        memcpy(&g->output_tensor_desc, tensorDescOut->data,
+               sizeof(struct tensorDescriptor_t));
+        g->nstages = *(uint32_t*)nstages->data;
+        g->batch_size = g->input_tensor_desc.n;
+        g->timingsCount = g->nstages + 2;       // For time_receive timing and thread execution
+    }
+
+    xl_error = XLinkReleaseData(streamId);
+    if (xl_error)
+        mvLog(MVLOG_WARN, "Can't release data, rc: %s", XLinkErrorToStr(xl_error));
+
+    xl_error = XLinkReleaseData(streamId);
+    if (xl_error)
+        mvLog(MVLOG_WARN, "Can't release data, rc: %s", XLinkErrorToStr(xl_error));
+
+    xl_error = XLinkReleaseData(streamId);
+    if (xl_error)
+        mvLog(MVLOG_WARN, "Can't release data, rc: %s", XLinkErrorToStr(xl_error));
+
+    g->graph_stream_id = streamId;
+    if(checkGraphMonitorResponse(d->graph_monitor_stream_id)) {
+        mvLog(MVLOG_ERROR, "The device didn't accept the graph\n");
+        unlockAllInferences();
+        return NC_ERROR;
+    }
+    if (rc) {
+        unlockAllInferences();
+        return rc;
+    }
+
+    // aux_buffer holds the debug buffer at offset 0 and the timing floats at offset 120
+    g->aux_buffer = calloc(1, 224 + g->timingsCount * sizeof(*g->time_taken));
+    if (!g->aux_buffer) {
+        unlockAllInferences();
+        return NC_OUT_OF_MEMORY;
+    }
+
+    g->debug_buffer = g->aux_buffer;
+    g->time_taken = (float *) (g->aux_buffer + 120);
+    unlockAllInferences();
+
+    GLOBAL_LOCK();
+    g->dev = d;
+
+    g->next = d->graphs;
+    d->graphs = g;
+    g->state = NC_GRAPH_ALLOCATED;
+    GLOBAL_UNLOCK();
+    mvLog(MVLOG_INFO, "Graph allocation completed successfully\n");
+
+    return NC_OK;
+}
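+
+/*
+ * Illustrative usage sketch (not part of the library): the create/allocate/
+ * destroy sequence for a graph. `blob` and `blobLength` are hypothetical
+ * variables holding a compiled graph file; passing the blob itself as the
+ * header with a smaller headerLength is an assumption consistent with the
+ * graphHeaderLength <= graphBufferLength check above.
+ *
+ *     struct ncGraphHandle_t *graphHandle = NULL;
+ *     ncGraphCreate("example-graph", &graphHandle);
+ *     ncGraphAllocate(deviceHandle, graphHandle,
+ *                     blob, blobLength,      // full graph buffer
+ *                     blob, headerLength);   // header portion of the same buffer
+ *     // ... create fifos, queue inferences ...
+ *     ncGraphDestroy(&graphHandle);
+ */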
+
+ncStatus_t ncGraphDestroy(struct ncGraphHandle_t ** graphHandle)
+{
+    CHECK_HANDLE_CORRECT(graphHandle);
+
+    struct ncGraphHandle_t *gh = *graphHandle;
+    if (!gh) {
+        mvLog(MVLOG_INFO, "handle is already destroyed");
+        return NC_OK;
+    }
+    struct _graphPrivate_t *g = gh->private_data;
+    CHECK_HANDLE_CORRECT_WINFO(g, MVLOG_ERROR, "Graph handle is corrupt or has been destroyed");
+
+    if (g->state == NC_GRAPH_CREATED || g->state == NC_GRAPH_DEALLOCATED) {
+        free(g);
+        gh->private_data = NULL;
+        free(gh);
+        *graphHandle = NULL;
+        return NC_OK;
+    }
+    GLOBAL_LOCK();
+    if (findGraph(g)) {
+        GLOBAL_UNLOCK();
+        mvLog(MVLOG_ERROR, "This graph is corrupt or has been destroyed");
+        return NC_INVALID_HANDLE;
+    }
+
+    GLOBAL_UNLOCK();
+    struct _devicePrivate_t *d = (gh->private_data)->dev;
+
+    graphMonCommand_t cmd;
+    cmd.cmdClass = GRAPH_MON_CLASS_GRAPH_CMD;
+    cmd.cmd.graphCmd.type = GRAPH_DEALLOCATE_CMD;
+    cmd.cmd.graphCmd.id = g->id;
+    CHECK_MUTEX_SUCCESS(pthread_mutex_lock(&d->graph_stream_m));
+    if (sendGraphMonitorRequest(d->graph_monitor_stream_id, &cmd)) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->graph_stream_m));
+        return NC_ERROR;
+    }
+    if (checkGraphMonitorResponse(d->graph_monitor_stream_id)) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->graph_stream_m));
+        return NC_ERROR;
+    }
+    XLinkCloseStream(g->graph_stream_id);
+    CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->graph_stream_m));
+    CHECK_MUTEX_SUCCESS(pthread_mutex_lock(&d->dev_data_m));
+    if (deallocateGraph(gh->private_data)) {
+        mvLog(MVLOG_ERROR, "This graph has already been destroyed");
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_data_m));
+        return NC_INVALID_PARAMETERS;
+    }
+    CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_data_m));
+    free(g);
+    gh->private_data = NULL;
+    free(gh);
+    *graphHandle = NULL;
+    return NC_OK;
+}
+
+static ncStatus_t setGraphOptionClass1(struct _graphPrivate_t *g,
+                                       ncGraphOption_t option,
+                                       const void *data,
+                                       unsigned int dataLength)
+{
+    if (dataLength < sizeof(int)) {
+        mvLog(MVLOG_ERROR, "The dataLength is smaller that required %zu",
+              sizeof(int));
+        return NC_INVALID_DATA_LENGTH;
+    }
+    switch (option) {
+    case NC_RW_GRAPH_EXECUTORS_NUM:
+        if (g->state != NC_GRAPH_CREATED) {
+            mvLog(MVLOG_ERROR, "Can't set NCE number after graph allocation");
+            return NC_UNAUTHORIZED;
+        }
+        g->executors_number = *(int *) data;
+        break;
+    default:
+        mvLog(MVLOG_ERROR, "There is no such option in class 1");
+        return NC_INVALID_PARAMETERS;
+    }
+    return NC_OK;
+}
+
+static int isGraphPreAllocateOption(int option)
+{
+    switch (option) {
+    case NC_RO_GRAPH_NAME:
+    case NC_RO_GRAPH_STATE:
+    case NC_RW_GRAPH_EXECUTORS_NUM:
+        return 1;
+    default:
+        return 0;
+    }
+}
+
+ncStatus_t ncGraphSetOption(struct ncGraphHandle_t * graphHandle,
+                            int option, const void *data,
+                            unsigned int dataLength)
+{
+    CHECK_HANDLE_CORRECT(graphHandle);
+    CHECK_HANDLE_CORRECT_WINFO(graphHandle->private_data, MVLOG_ERROR, "graphHandle has been destroyed");
+    if (!data) {
+        mvLog(MVLOG_ERROR, "Some of the parameters are NULL");
+        return NC_INVALID_PARAMETERS;
+    }
+    if (option < GRAPH_CLASS0_BASE ||
+        option > (GRAPH_CLASS0_BASE + OPTION_CLASS_SIZE * NC_OPTION_CLASS3)) {
+        mvLog(MVLOG_ERROR, "Option %d is invalid", option);
+        return NC_INVALID_PARAMETERS;
+    }
+    if (option >= GRAPH_CLASS0_BASE &&
+        option <= (GRAPH_CLASS0_BASE + OPTION_CLASS_SIZE)) {
+        mvLog(MVLOG_ERROR, "Option %d is read only", option);
+        return NC_UNAUTHORIZED;
+    }
+    struct _graphPrivate_t *g = graphHandle->private_data;
+    GLOBAL_LOCK();
+    if (isGraphPreAllocateOption(option) && g->state != NC_GRAPH_CREATED) {
+        mvLog(MVLOG_ERROR,
+              "This graph has already been alocated - cannot set option");
+        GLOBAL_UNLOCK();
+        return NC_UNAUTHORIZED;
+    }
+    if (!isGraphPreAllocateOption(option) && g->state == NC_GRAPH_CREATED) {
+        mvLog(MVLOG_ERROR,
+              "This graph hasn't been allocated - cannot set option");
+        GLOBAL_UNLOCK();
+        return NC_UNAUTHORIZED;
+    }
+    if (!isGraphPreAllocateOption(option) && findGraph(g)) {
+        mvLog(MVLOG_ERROR, "This graph is corrupt or has been destroyed");
+        GLOBAL_UNLOCK();
+        return NC_INVALID_HANDLE;
+    }
+    GLOBAL_UNLOCK();
+    // We check what we can at this point; later we might fail if the
+    // user set a class that is not permitted.
+    ncOptionClass_t opClass = getOptionClass(option, GRAPH_CLASS0_BASE);
+    if (g->dev != NULL && opClass > g->dev->dev_attr.max_graph_opt_class) {
+        mvLog(MVLOG_ERROR, "This device FW does not support NC_OPTION_CLASS%d",
+              opClass);
+        return NC_UNAUTHORIZED;
+    }
+    ncStatus_t rc;
+    switch (opClass) {
+    case NC_OPTION_CLASS0:
+        mvLog(MVLOG_ERROR, "Class 0 options are read-only");
+        rc = NC_UNAUTHORIZED; // option class 0 consists of read-only values
+        break;
+    case NC_OPTION_CLASS1:
+        rc = setGraphOptionClass1(g, option, data, dataLength);
+        break;
+    default:
+        mvLog(MVLOG_ERROR, "There is no such option class");
+        rc = NC_INVALID_PARAMETERS;
+        break;
+    }
+    return rc;
+}
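+
+/*
+ * Usage sketch (illustrative; the handle and value names are assumptions):
+ * NC_RW_GRAPH_EXECUTORS_NUM is a class-1, pre-allocation option, so it must
+ * be set after ncGraphCreate() and before ncGraphAllocate():
+ *
+ *     int executors = 2;
+ *     ncStatus_t rc = ncGraphSetOption(graphHandle, NC_RW_GRAPH_EXECUTORS_NUM,
+ *                                      &executors, sizeof(executors));
+ */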
+
+static ncStatus_t getGraphOptionClass0(struct _graphPrivate_t *g,
+                                       ncGraphOption_t option,
+                                       void *data, unsigned int *dataLength)
+{
+    if ((option == NC_RO_GRAPH_STATE ||
+         option == NC_RO_GRAPH_INPUT_COUNT ||
+         option == NC_RO_GRAPH_OUTPUT_COUNT ||
+         option == NC_RO_GRAPH_OPTION_CLASS_LIMIT ||
+         option == NC_RW_GRAPH_EXECUTORS_NUM) && *dataLength < sizeof(int)) {
+        mvLog(MVLOG_ERROR,
+              "data length of data (%d) is smaller that required (%zu)!\n",
+              *dataLength, sizeof(int));
+        *dataLength = sizeof(int);
+        return NC_INVALID_DATA_LENGTH;
+    }
+
+    graphMonCommand_t cmd;
+    streamPacketDesc_t* pack = 0;
+    cmd.cmdClass = GRAPH_MON_CLASS_GET_CLASS0;
+
+    switch (option) {
+    case NC_RO_GRAPH_STATE:
+        if (g->state == NC_GRAPH_CREATED ||
+            (g->state == NC_GRAPH_ALLOCATED && !g->started)) {
+            *(int *) data = g->state;
+        } else {
+            CHECK_HANDLE_CORRECT(g->dev);
+            // the graph has been started; we must read the state from the device
+            cmd.cmd.optionCmd.type.c0 = CLASS0_STATE;
+            cmd.cmd.optionCmd.id = g->id;
+            CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&g->dev->graph_stream_m), NC_ERROR);
+            if (XLinkWriteData(g->dev->graph_monitor_stream_id,
+                               (const uint8_t *) &cmd, sizeof(cmd)) != 0) {
+                CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+                return NC_ERROR;
+            }
+
+            if (XLinkReadData(g->dev->graph_monitor_stream_id, &pack) || !pack) {
+                CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+                return NC_ERROR;
+            }
+
+            if (pack->length != sizeof(graphState_t)) {
+                CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+                XLinkReleaseData(g->dev->graph_monitor_stream_id);
+                return NC_ERROR;
+            }
+            int state = *(int *) pack->data;
+            XLinkReleaseData(g->dev->graph_monitor_stream_id);
+            if (checkGraphMonitorResponse(g->dev->graph_monitor_stream_id)) {
+                CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+                return NC_ERROR;
+            }
+            CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+            if (state == GRAPH_RUNNING)
+                g->state = NC_GRAPH_RUNNING;
+            else
+                g->state = NC_GRAPH_WAITING_FOR_BUFFERS;
+            *(int *) data = g->state;
+        }
+        *dataLength = sizeof(ncGraphState_t);
+        break;
+    case NC_RO_GRAPH_INPUT_COUNT:
+        *(int *) data = g->input_count;
+        *dataLength = sizeof(int);
+        break;
+    case NC_RO_GRAPH_OUTPUT_COUNT:
+        *(int *) data = g->output_count;
+        *dataLength = sizeof(int);
+        break;
+    case NC_RO_GRAPH_TIME_TAKEN_ARRAY_SIZE:
+        *(int *) data = sizeof(float) * g->timingsCount;
+        *dataLength = sizeof(int);
+        break;
+    case NC_RO_GRAPH_TIME_TAKEN:
+        CHECK_HANDLE_CORRECT(g->dev);
+        if (*dataLength < sizeof(float) * g->timingsCount) {
+            mvLog(MVLOG_ERROR,
+                  "data length of output buffer (%d) is smaller that required (%zu)!\n",
+                  *dataLength, sizeof(float) * g->timingsCount);
+            *dataLength = sizeof(float) * g->timingsCount;
+            return NC_INVALID_DATA_LENGTH;
+        }
+        cmd.cmd.optionCmd.id = g->id;
+        cmd.cmd.optionCmd.type.c0 = CLASS0_TIMING_DATA;
+        CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&g->dev->graph_stream_m), NC_ERROR);
+        if (sendGraphMonitorRequest(g->dev->graph_monitor_stream_id, &cmd)) {
+            CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+            return NC_ERROR;
+        }
+        if (XLinkReadData(g->dev->graph_monitor_stream_id, &pack) || !pack) {
+            CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+            return NC_ERROR;
+        }
+        if (pack->length != sizeof(float) * g->timingsCount) {
+            CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+            XLinkReleaseData(g->dev->graph_monitor_stream_id);
+            return NC_ERROR;
+        }
+        // Need to copy the data before we check the response, since
+        // checkGraphMonitorResponse calls releaseData.
+        memcpy((float *) data, pack->data, pack->length);
+        XLinkReleaseData(g->dev->graph_monitor_stream_id);
+
+        if (checkGraphMonitorResponse(g->dev->graph_monitor_stream_id)) {
+            CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+            return NC_ERROR;
+        }
+
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+        *dataLength = sizeof(float) * g->timingsCount;
+        break;
+    case NC_RO_GRAPH_DEBUG_INFO:
+        CHECK_HANDLE_CORRECT(g->dev);
+        if (*dataLength < NC_DEBUG_BUFFER_SIZE) {
+            mvLog(MVLOG_ERROR,
+                  "data length of output buffer (%d) is smaller that required (%d)!\n",
+                  *dataLength, NC_DEBUG_BUFFER_SIZE);
+            *dataLength = NC_DEBUG_BUFFER_SIZE;
+            return NC_INVALID_DATA_LENGTH;
+        }
+
+        cmd.cmd.optionCmd.type.c0 = CLASS0_DEBUG_DATA;
+        cmd.cmd.optionCmd.id = g->id;
+        CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&g->dev->graph_stream_m), NC_ERROR);
+        if (XLinkWriteData(g->dev->graph_monitor_stream_id, (const uint8_t *) &cmd,
+             sizeof(cmd)) != 0) {
+            CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+            return NC_ERROR;
+        }
+
+        if (XLinkReadData(g->dev->graph_monitor_stream_id, &pack) || !pack) {
+            CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+            return NC_ERROR;
+        }
+
+        if (pack->length != NC_DEBUG_BUFFER_SIZE) {
+            CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+            XLinkReleaseData(g->dev->graph_monitor_stream_id);
+            return NC_ERROR;
+        }
+
+        memcpy((char *) data, pack->data, pack->length);
+        XLinkReleaseData(g->dev->graph_monitor_stream_id);
+        if (checkGraphMonitorResponse(g->dev->graph_monitor_stream_id)) {
+            CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+            return NC_ERROR;
+        }
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+
+        *dataLength = NC_DEBUG_BUFFER_SIZE;
+        break;
+    case NC_RO_GRAPH_INPUT_TENSOR_DESCRIPTORS:{
+            unsigned int size =
+                sizeof(struct ncTensorDescriptor_t) * g->input_count;
+            if (*dataLength < size) {
+                mvLog(MVLOG_ERROR,
+                      "data length of output buffer (%d) is smaller that required (%d)!\n",
+                      *dataLength, size);
+                *dataLength = size;
+                return NC_INVALID_DATA_LENGTH;
+            }
+            memcpy((struct ncTensorDescriptor_t *) data, &g->input_tensor_desc,
+                   size);
+            *dataLength = size;
+            break;
+        }
+    case NC_RO_GRAPH_OUTPUT_TENSOR_DESCRIPTORS:{
+            unsigned int size =
+                sizeof(struct ncTensorDescriptor_t) * g->output_count;
+            if (*dataLength < size) {
+                mvLog(MVLOG_ERROR,
+                      "data length of output buffer (%d) is smaller that required (%d)!\n",
+                      *dataLength, size);
+                *dataLength = size;
+                return NC_INVALID_DATA_LENGTH;
+            }
+            memcpy((struct ncTensorDescriptor_t *) data, &g->output_tensor_desc,
+                   size);
+            *dataLength = size;
+            break;
+        }
+    case NC_RO_GRAPH_NAME:
+        if (*dataLength < strlen(g->name) + 1) {
+            mvLog(MVLOG_ERROR,
+                  "data length of output buffer (%d) is smaller that required (%zu)!\n",
+                  *dataLength, strlen(g->name) + 1);
+            *dataLength = strlen(g->name) + 1;
+            return NC_INVALID_DATA_LENGTH;
+        }
+        *dataLength = strlen(g->name) + 1;
+        strncpy((char *) data, g->name, *dataLength);
+        break;
+    case NC_RO_GRAPH_OPTION_CLASS_LIMIT:
+        CHECK_HANDLE_CORRECT(g->dev);
+        *(int *) data = g->dev->dev_attr.max_graph_opt_class;
+        *dataLength = sizeof(int);
+        break;
+    case NC_RO_GRAPH_VERSION:{
+            unsigned int size = sizeof(g->blob_version);
+            if (*dataLength < size) {
+                mvLog(MVLOG_ERROR,
+                      "data length of output buffer (%d) is smaller that required (%d)!\n",
+                      *dataLength, size);
+                *dataLength = size;
+                return NC_INVALID_DATA_LENGTH;
+            }
+            memcpy((int *) data, g->blob_version, size);
+            *dataLength = size;
+            break;
+        }
+    default:
+        mvLog(MVLOG_ERROR, "There is no such option in class 0");
+        return NC_INVALID_PARAMETERS;
+    }
+    return NC_OK;
+}
+
+static ncStatus_t getGraphOptionClass1(struct _graphPrivate_t *g,
+                                       ncGraphOption_t option,
+                                       void *data, unsigned int *dataLength)
+{
+    switch (option) {
+    case NC_RW_GRAPH_EXECUTORS_NUM:{
+            unsigned int size = sizeof(int);
+            if (*dataLength < size) {
+                mvLog(MVLOG_ERROR,
+                      "data length of data (%d) is smaller than required (%d)!\n",
+                      *dataLength, size);
+                *dataLength = size;
+                return NC_INVALID_DATA_LENGTH;
+            }
+            *(int *) data = g->executors_number;
+            *dataLength = size;
+            break;
+        }
+    default:
+        mvLog(MVLOG_ERROR, "There is no such option in class 1");
+        return NC_INVALID_PARAMETERS;
+    }
+    return NC_OK;
+}
+
+ncStatus_t ncGraphGetOption(struct ncGraphHandle_t * graphHandle,
+                            int option, void *data, unsigned int *dataLength)
+{
+    CHECK_HANDLE_CORRECT(graphHandle);
+    CHECK_HANDLE_CORRECT_WINFO(graphHandle->private_data, MVLOG_ERROR, "graphHandle has been destroyed");
+
+    if (!dataLength || (*dataLength != 0 && !data)) {
+        mvLog(MVLOG_ERROR, "Some of the parameters are NULL");
+        return NC_INVALID_PARAMETERS;
+    }
+
+    if (option < GRAPH_CLASS0_BASE ||
+        option > (GRAPH_CLASS0_BASE + OPTION_CLASS_SIZE * NC_OPTION_CLASS3)) {
+        mvLog(MVLOG_ERROR, "Option %d is invalid", option);
+        return NC_INVALID_PARAMETERS;
+    }
+
+    struct _graphPrivate_t *g = graphHandle->private_data;
+    CHECK_HANDLE_CORRECT(g);
+
+    GLOBAL_LOCK();
+    if (!isGraphPreAllocateOption(option) && g->state == NC_GRAPH_CREATED) {
+        mvLog(MVLOG_ERROR, "This graph hasn't been allocated");
+        GLOBAL_UNLOCK();
+        return NC_NOT_ALLOCATED;
+    }
+    ncOptionClass_t opClass = getOptionClass(option, GRAPH_CLASS0_BASE);
+    if (g->dev != NULL && opClass > g->dev->dev_attr.max_graph_opt_class) {
+        mvLog(MVLOG_ERROR, "This device FW does not support NC_OPTION_CLASS%d",
+              opClass);
+        GLOBAL_UNLOCK();
+        return NC_UNAUTHORIZED;
+    }
+    GLOBAL_UNLOCK();
+    ncStatus_t rc;
+    switch (opClass) {
+    case NC_OPTION_CLASS0:
+        rc = getGraphOptionClass0(g, option, data, dataLength);
+        break;
+    case NC_OPTION_CLASS1:
+        rc = getGraphOptionClass1(g, option, data, dataLength);
+        break;
+    default:
+        mvLog(MVLOG_ERROR, "There is no such option class");
+        rc = NC_INVALID_PARAMETERS;
+        break;
+    }
+    return rc;
+}
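+
+/*
+ * Read-option sketch (illustrative): the getters report the required size
+ * through *dataLength, so a caller can probe with an empty buffer and retry:
+ *
+ *     unsigned int len = 0;
+ *     ncStatus_t rc = ncGraphGetOption(graphHandle, NC_RO_GRAPH_NAME, NULL, &len);
+ *     // rc == NC_INVALID_DATA_LENGTH and len now holds strlen(name) + 1
+ *     char *name = malloc(len);
+ *     rc = ncGraphGetOption(graphHandle, NC_RO_GRAPH_NAME, name, &len);
+ */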
+
+ncStatus_t ncGraphAllocateWithFifos(struct ncDeviceHandle_t * deviceHandle,
+                                    struct ncGraphHandle_t * graphHandle,
+                                    const void *graphBuffer,
+                                    unsigned int graphBufferLength,
+                                    const void *graphHeader,
+                                    unsigned int graphHeaderLength,
+                                    struct ncFifoHandle_t ** inFifoHandle,
+                                    struct ncFifoHandle_t ** outFifoHandle)
+{
+    return ncGraphAllocateWithFifosEx(deviceHandle,
+                                      graphHandle, graphBuffer,
+                                      graphBufferLength,
+                                      graphHeader,
+                                      graphHeaderLength,
+                                      inFifoHandle,
+                                      NC_FIFO_HOST_WO, 2, NC_FIFO_FP32,
+                                      outFifoHandle, NC_FIFO_HOST_RO, 2,
+                                      NC_FIFO_FP32);
+}
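+
+/*
+ * The wrapper above fixes the FIFO configuration: a host-writable input FIFO
+ * and a host-readable output FIFO, each two elements deep and carrying FP32
+ * data. Call ncGraphAllocateWithFifosEx() directly to override any of these.
+ */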
+
+ncStatus_t ncGraphAllocateWithFifosEx(struct ncDeviceHandle_t * deviceHandle,
+                                      struct ncGraphHandle_t * graphHandle,
+                                      const void *graphBuffer,
+                                      unsigned int graphBufferLength,
+                                      const void *graphHeader,
+                                      unsigned int graphHeaderLength,
+                                      struct ncFifoHandle_t ** inFifoHandle,
+                                      ncFifoType_t inFifoType, unsigned int inNumElem,
+                                      ncFifoDataType_t inDataType,
+                                      struct ncFifoHandle_t ** outFifoHandle,
+                                      ncFifoType_t outFifoType, unsigned int outNumElem,
+                                      ncFifoDataType_t outDataType)
+{
+    CHECK_HANDLE_CORRECT(deviceHandle);
+    CHECK_HANDLE_CORRECT(graphHandle);
+    CHECK_HANDLE_CORRECT(graphBuffer);
+    CHECK_HANDLE_CORRECT(graphHeader);
+    CHECK_HANDLE_CORRECT(inFifoHandle);
+    CHECK_HANDLE_CORRECT(outFifoHandle);
+    if (!inNumElem || !outNumElem) {
+        mvLog(MVLOG_ERROR, "Some of the parameters are NULL or zero!");
+        return NC_INVALID_PARAMETERS;
+    }
+    // Validate the fifo types before allocating the graph so that an invalid
+    // combination does not leave an allocated graph behind.
+    if (inFifoType == NC_FIFO_HOST_RO) {
+        mvLog(MVLOG_ERROR, "input fifo cannot be read-only");
+        return NC_INVALID_PARAMETERS;
+    }
+    if (outFifoType == NC_FIFO_HOST_WO) {
+        mvLog(MVLOG_ERROR, "output fifo cannot be write-only");
+        return NC_INVALID_PARAMETERS;
+    }
+    ncStatus_t rc = ncGraphAllocate(deviceHandle, graphHandle, graphBuffer,
+                                    graphBufferLength, graphHeader,
+                                    graphHeaderLength);
+    if (rc != NC_OK)
+        return rc;
+
+    // Read tensor descriptors
+    struct ncTensorDescriptor_t inputTensorDesc;
+    struct ncTensorDescriptor_t outputTensorDesc;
+    unsigned int length = sizeof(struct ncTensorDescriptor_t);
+    rc = ncGraphGetOption(graphHandle,
+                          NC_RO_GRAPH_INPUT_TENSOR_DESCRIPTORS,
+                          &inputTensorDesc, &length);
+    if (rc != NC_OK) {
+        return rc;
+    }
+    rc = ncGraphGetOption(graphHandle,
+                          NC_RO_GRAPH_OUTPUT_TENSOR_DESCRIPTORS,
+                          &outputTensorDesc, &length);
+    if (rc != NC_OK) {
+        return rc;
+    }
+    rc = ncFifoCreate("fifoIn0", inFifoType, inFifoHandle);
+    if (rc != NC_OK) {
+        return rc;
+    }
+    rc = ncFifoSetOption(*inFifoHandle, NC_RW_FIFO_DATA_TYPE, &inDataType,
+                         sizeof(inDataType));
+    if (rc != NC_OK) {
+        return rc;
+    }
+    rc = ncFifoAllocate(*inFifoHandle, deviceHandle, &inputTensorDesc,
+                        inNumElem);
+    if (rc != NC_OK) {
+        return rc;
+    }
+    rc = ncFifoCreate("fifoOut0", outFifoType, outFifoHandle);
+    if (rc != NC_OK) {
+        ncFifoDestroy(inFifoHandle);
+        return rc;
+    }
+    rc = ncFifoSetOption(*outFifoHandle, NC_RW_FIFO_DATA_TYPE, &outDataType,
+                         sizeof(outDataType));
+    if (rc != NC_OK) {
+        ncFifoDestroy(inFifoHandle);
+        ncFifoDestroy(outFifoHandle);
+        return rc;
+    }
+    rc = ncFifoAllocate(*outFifoHandle, deviceHandle, &outputTensorDesc,
+                        outNumElem);
+    if (rc != NC_OK) {
+        ncFifoDestroy(inFifoHandle);
+        ncFifoDestroy(outFifoHandle);
+        return rc;
+    }
+    return rc;
+}
+
+ncStatus_t ncGlobalSetOption(ncGlobalOption_t option, const void *data,
+                             unsigned int dataLength)
+{
+    if (!data) {
+        mvLog(MVLOG_ERROR, "Some of the parameters are NULL");
+        return NC_INVALID_PARAMETERS;
+    }
+    switch (option) {
+        case NC_RW_LOG_LEVEL:
+        case NC_RW_RESET_ALL:
+        case NC_RW_COMMON_TIMEOUT_MSEC:
+        case NC_RW_DEVICE_OPEN_TIMEOUT_MSEC:
+        case NC_RW_ALLOC_GRAPH_TIMEOUT_MSEC: {
+            if (dataLength < sizeof(int)) {
+                mvLog(MVLOG_ERROR, "The dataLength is smaller that required %zu",
+                      sizeof(int));
+                return NC_INVALID_PARAMETERS;
+            }
+            break;
+        }
+        default:
+            break;
+    }
+
+    switch (option) {
+    case NC_RW_LOG_LEVEL:
+        {
+            mvLog_t log_level = *(mvLog_t *) data;
+            if (log_level >= MVLOG_LAST || log_level < 0) {
+                mvLog(MVLOG_ERROR, "log_level value is invalid %d\n",
+                      log_level);
+                return NC_INVALID_PARAMETERS;
+            }
+            mvLogLevelSet(*(mvLog_t *) data);
+            mvLogDefaultLevelSet(*(mvLog_t *) data);    //Allow turning off warnings and errors
+        }
+        break;
+    case NC_RO_API_VERSION:
+        mvLog(MVLOG_ERROR, "API version is read-only");
+        return NC_UNAUTHORIZED;
+    case NC_RW_RESET_ALL:
+        if (!initialized)
+            reset_all = *(int*)data;
+        break;
+    case NC_RW_COMMON_TIMEOUT_MSEC: {
+        int gTimeout = *(int *) data;
+        XLinkError_t rc = XLinkSetCommonTimeOutMsec(gTimeout);
+        if (rc) {
+            mvLog(MVLOG_ERROR, "Set global common timeout failed, rc = %s\n", XLinkErrorToStr(rc));
+            return NC_INVALID_PARAMETERS;
+        }
+        break;
+    }
+    case NC_RW_DEVICE_OPEN_TIMEOUT_MSEC: {
+        int gTimeout = *(int *) data;
+        XLinkError_t rc = XLinkSetDeviceOpenTimeOutMsec(gTimeout);
+        if (rc) {
+            mvLog(MVLOG_ERROR, "Set global open device timeout failed, rc = %s\n", XLinkErrorToStr(rc));
+            return NC_INVALID_PARAMETERS;
+        }
+        break;
+    }
+    case NC_RW_ALLOC_GRAPH_TIMEOUT_MSEC: {
+        int gTimeout = *(int *) data;
+        XLinkError_t rc = XLinkSetAllocateGraphTimeOutMsec(gTimeout);
+        if (rc) {
+            mvLog(MVLOG_ERROR, "Set global allocate graph timeout failed, rc = %s\n", XLinkErrorToStr(rc));
+            return NC_INVALID_PARAMETERS;
+        }
+        break;
+    }
+    default:
+        mvLog(MVLOG_ERROR, "No such option");
+        return NC_INVALID_PARAMETERS;
+    }
+
+    return NC_OK;
+}
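+
+/*
+ * Example (illustrative): raising the API log verbosity before opening a
+ * device; MVLOG_DEBUG is one of the mvLog_t levels accepted by the range
+ * check above.
+ *
+ *     mvLog_t level = MVLOG_DEBUG;
+ *     ncGlobalSetOption(NC_RW_LOG_LEVEL, &level, sizeof(level));
+ */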
+
+ncStatus_t ncGlobalGetOption(ncGlobalOption_t option, void *data, unsigned int *dataLength)
+{
+    if (!data || !dataLength) {
+        mvLog(MVLOG_ERROR, "Some of the parameters are NULL");
+        return NC_INVALID_PARAMETERS;
+    }
+    switch (option) {
+    case NC_RW_LOG_LEVEL:
+        *(int *) data = mvLogLevel_ncAPI;
+        *dataLength = sizeof(mvLogLevel_ncAPI);
+        break;
+    case NC_RO_API_VERSION:
+        return NC_UNSUPPORTED_FEATURE;
+    case NC_RW_RESET_ALL:
+        *(int*)data = reset_all;
+        *dataLength = sizeof(reset_all);
+        break;
+    default:
+        mvLog(MVLOG_ERROR, "No such option");
+        return NC_INVALID_PARAMETERS;
+    }
+
+    return NC_OK;
+}
+
+static ncStatus_t getDeviceOptionClass0(struct _devicePrivate_t *d,
+                                        ncDeviceOption_t option,
+                                        void *data, unsigned int *dataLength)
+{
+    ncStatus_t rc = NC_OK;
+
+    switch (option) {
+    case NC_RO_DEVICE_THERMAL_STATS:
+        if (*dataLength < NC_THERMAL_BUFFER_SIZE) {
+            mvLog(MVLOG_ERROR,
+                  "data length of output buffer (%d) is smaller that required (%d)!\n",
+                  *dataLength, NC_THERMAL_BUFFER_SIZE);
+            *dataLength = NC_THERMAL_BUFFER_SIZE;
+            return NC_INVALID_DATA_LENGTH;
+        }
+        rc = getThermalStats(d);
+        if (rc) {
+            return rc;
+        }
+        memcpy((float *) data, &d->thermal_stats[1], NC_THERMAL_BUFFER_SIZE);
+        *dataLength = NC_THERMAL_BUFFER_SIZE;
+        break;
+    case NC_RO_DEVICE_THERMAL_THROTTLING_LEVEL:
+        rc = getThermalStats(d);
+        if (rc) {
+            return rc;
+        }
+        d->throttle_happened = d->thermal_stats[0];
+        *(int *) data = d->throttle_happened;
+        *dataLength = sizeof(int);
+        break;
+    case NC_RO_DEVICE_STATE:
+        *(int *) data = d->state;
+        *dataLength = sizeof(int);
+        break;
+    case NC_RO_DEVICE_ALLOCATED_GRAPH_NUM:
+        *(int *) data = deviceGetNumberOfGraphs(d);
+        *dataLength = sizeof(int);
+        break;
+    case NC_RO_DEVICE_ALLOCATED_FIFO_NUM:
+        *(int *) data = deviceGetNumberOfFifos(d);
+        *dataLength = sizeof(int);
+        break;
+    case NC_RO_DEVICE_MEMORY_SIZE:
+        *(int *) data = d->dev_attr.max_memory;
+        *dataLength = sizeof(int);
+        break;
+    case NC_RO_DEVICE_MAX_FIFO_NUM:
+        *(int *) data = d->dev_attr.max_fifos;
+        *dataLength = sizeof(int);
+        break;
+    case NC_RO_DEVICE_MAX_GRAPH_NUM:
+        *(int *) data = d->dev_attr.max_graphs;
+        *dataLength = sizeof(int);
+        break;
+    case NC_RO_DEVICE_OPTION_CLASS_LIMIT:
+        *(int *) data = d->dev_attr.max_device_opt_class;
+        *dataLength = sizeof(int);
+        break;
+    case NC_RO_DEVICE_NAME:
+        if (*dataLength < strlen(d->dev_addr) + 1) {
+            mvLog(MVLOG_ERROR,
+                  "data length of output buffer (%d) is smaller that required (%zu)!\n",
+                  *dataLength, strlen(d->dev_addr) + 1);
+            *dataLength = strlen(d->dev_addr) + 1;
+            return NC_INVALID_DATA_LENGTH;
+        }
+        *dataLength = strlen(d->dev_addr) + 1;
+        strncpy((char *) data, d->dev_addr, *dataLength);
+        break;
+    case NC_RO_DEVICE_PLATFORM:
+        if (d->dev_attr.fw_version[1] == 0x2480){
+            *(ncDevicePlatform_t *) data = MYRIAD_X;
+        } else if (d->dev_attr.fw_version[1] == 0x2450) {
+            *(ncDevicePlatform_t *) data = MYRIAD_2;
+        } else {
+            *(ncDevicePlatform_t *) data = UNKNOWN_PLATFORM;
+        }
+        *dataLength = sizeof(ncDevicePlatform_t);
+        break;
+    case NC_RO_DEVICE_FW_VERSION:
+        *(unsigned int **) data = d->dev_attr.fw_version;
+        *dataLength = sizeof(unsigned int*);
+        break;
+    case NC_RO_DEVICE_CURRENT_MEMORY_USED:{
+            uint32_t mem;
+            if (deviceGetDeviceMemory(d, &mem)) {
+                rc = NC_ERROR;
+                break;
+            }
+            *(int *) data = mem;
+            *dataLength = sizeof(int);
+            break;
+        }
+    case NC_RO_DEVICE_MAX_EXECUTORS_NUM:
+        *(int *) data = d->dev_attr.max_executors;
+        *dataLength = sizeof(int);
+        break;
+    case NC_RO_DEVICE_DEBUG_INFO:
+        return NC_UNSUPPORTED_FEATURE;
+    default:
+        mvLog(MVLOG_ERROR, "No such option");
+        return NC_INVALID_PARAMETERS;
+    }
+    return rc;
+}
+
+ncStatus_t ncDeviceSetOption(struct ncDeviceHandle_t *deviceHandle,
+                             ncDeviceOption_t option,
+                             const void *data, unsigned int dataLength){
+    if (!deviceHandle || !data){
+        mvLog(MVLOG_ERROR, "Some of the parameters are NULL");
+        return NC_INVALID_PARAMETERS;
+    }
+    if (dataLength != sizeof(int) && dataLength != sizeof(void*)){
+        mvLog(MVLOG_ERROR, "The dataLength must be %zu or %zu", sizeof(int), sizeof(void*));
+        return NC_INVALID_PARAMETERS;
+    }
+
+    if (option < DEVICE_CLASS0_BASE ||
+        option > (DEVICE_CLASS0_BASE + OPTION_CLASS_SIZE * NC_OPTION_CLASS3)) {
+        mvLog(MVLOG_ERROR, "Option %d is invalid", option);
+        return NC_INVALID_PARAMETERS;
+    }
+
+    ncOptionClass_t opClass = getOptionClass(option, DEVICE_CLASS0_BASE);
+    if (opClass < NC_OPTION_CLASS1) {
+        mvLog(MVLOG_ERROR, "Class 0 options are read-only");
+        return NC_UNAUTHORIZED;
+    }
+    struct _devicePrivate_t *d = deviceHandle->private_data;
+    GLOBAL_LOCK();
+
+    if (findDevice(d)) {
+        mvLog(MVLOG_ERROR,
+              "This device handle is corrupt or has been destroyed");
+        GLOBAL_UNLOCK();
+
+        return NC_INVALID_HANDLE;
+    }
+    GLOBAL_UNLOCK();
+    if (opClass > d->dev_attr.max_device_opt_class) {
+        mvLog(MVLOG_ERROR, "This device FW does not support NC_OPTION_CLASS%d",
+              opClass);
+        return NC_UNAUTHORIZED;
+    }
+
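+    // No writable device option is defined in the option classes supported
+    // here, so any request that passes the checks above is still rejected.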
+    return NC_INVALID_PARAMETERS;
+}
+
+// static options can be read before the device is opened
+static int isDeviceStaticOption(int option)
+{
+    switch (option) {
+    case NC_RO_DEVICE_NAME:
+    case NC_RO_DEVICE_STATE:
+    case NC_RO_DEVICE_HW_VERSION:
+        return 1;
+    default:
+        return 0;
+    }
+}
+
+ncStatus_t ncDeviceGetOption(struct ncDeviceHandle_t * deviceHandle,
+        ncDeviceOption_t option, void *data, unsigned int *dataLength)
+{
+    CHECK_HANDLE_CORRECT(deviceHandle);
+    ncStatus_t rc;
+
+    if (!dataLength || (*dataLength != 0 && !data)) {
+        mvLog(MVLOG_ERROR, "Some of the parameters are NULL");
+        return NC_INVALID_PARAMETERS;
+    }
+
+    if (option < DEVICE_CLASS0_BASE ||
+        option > (DEVICE_CLASS0_BASE + OPTION_CLASS_SIZE * NC_OPTION_CLASS3)) {
+        mvLog(MVLOG_ERROR, "Option %d is invalid", option);
+        return NC_INVALID_PARAMETERS;
+    }
+
+    struct _devicePrivate_t *d = deviceHandle->private_data;
+
+    GLOBAL_LOCK();
+    if (!isDeviceStaticOption(option) && d->state != NC_DEVICE_OPENED) {
+        mvLog(MVLOG_ERROR, "This device hasn't been openned");
+        GLOBAL_UNLOCK();
+        return NC_UNAUTHORIZED;
+    }
+
+    ncOptionClass_t opClass = getOptionClass(option, DEVICE_CLASS0_BASE);
+    if (!isDeviceStaticOption(option)) {
+        if (findDevice(d)) {
+            mvLog(MVLOG_ERROR,
+                  "This device handle is corrupt or has been destroyed");
+            GLOBAL_UNLOCK();
+            return NC_INVALID_HANDLE;
+        }
+
+        if (d->dev_attr.max_device_opt_class < opClass) {
+            mvLog(MVLOG_ERROR,
+                  "This device FW does not support NC_OPTION_CLASS%d", opClass);
+            GLOBAL_UNLOCK();
+            return NC_UNAUTHORIZED;
+        }
+    }
+
+    switch (opClass) {
+    case NC_OPTION_CLASS0:
+        rc = getDeviceOptionClass0(d, option, data, dataLength);
+        break;
+    default:
+        rc = NC_INVALID_PARAMETERS;
+        break;
+    }
+
+    GLOBAL_UNLOCK();
+    return rc;
+}
+
+static int fifoWriteAccess(struct _fifoPrivate_t *fifoHandle)
+{
+    if (fifoHandle->type == NC_FIFO_HOST_WO) {
+        return 1;
+    }
+    return 0;
+}
+
+static int fifoReadAccess(struct _fifoPrivate_t *fifoHandle)
+{
+    if (fifoHandle->type == NC_FIFO_HOST_RO) {
+        return 1;
+    }
+    return 0;
+}
+
+ncStatus_t ncFifoCreate(const char *name, ncFifoType_t type,
+                        struct ncFifoHandle_t ** fifoHandle)
+{
+    mvLog(MVLOG_INFO, "Init fifo");
+    CHECK_HANDLE_CORRECT(fifoHandle);
+    CHECK_HANDLE_CORRECT(name);
+
+    if (type != NC_FIFO_HOST_RO && type != NC_FIFO_HOST_WO) {
+        mvLog(MVLOG_ERROR, "Fifo typo not supported!");
+        return NC_UNSUPPORTED_FEATURE;
+    }
+
+    static int fifoIdCounter = 0;
+    *fifoHandle = (struct ncFifoHandle_t *) malloc(sizeof(struct ncFifoHandle_t));
+    if (!(*fifoHandle)) {
+        mvLog(MVLOG_ERROR, "Memory allocation failed");
+        return NC_OUT_OF_MEMORY;
+    }
+
+    struct _fifoPrivate_t *handle = (struct _fifoPrivate_t *) malloc(sizeof(struct _fifoPrivate_t));
+    if (!handle) {
+        mvLog(MVLOG_ERROR, "Memory allocation failed");
+        free(*fifoHandle);
+        *fifoHandle = NULL;
+        return NC_OUT_OF_MEMORY;
+    }
+    (*fifoHandle)->private_data = handle;
+
+    handle->type = type;
+    handle->consumer_cnt = 1;   //default consumers
+
+    handle->state = NC_FIFO_CREATED;
+    CHECK_MUTEX_SUCCESS(pthread_mutex_init(&handle->fifo_mutex, NULL));
+    handle->consumed_by_graph = 0;
+    handle->write_count = 0;
+    handle->user_param_in = NULL;
+    handle->user_param_out = NULL;
+    handle->api_read_element = 0;
+    handle->id = fifoIdCounter++;
+    handle->num_elements = 0;
+    handle->host_tensor_desc_set = 0;
+    memset(&handle->host_tensor_desc, 0, sizeof(struct ncTensorDescriptor_t));
+    handle->host_tensor_desc.dataType = NC_FIFO_FP16; //default app data type is FP16
+    strncpy(handle->name, name, NC_MAX_NAME_SIZE - 1);
+    handle->name[NC_MAX_NAME_SIZE - 1] = 0;
+
+    return NC_OK;
+}
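+
+/*
+ * Note: a freshly created fifo defaults to a single consumer and an FP16
+ * host data type; both can be changed with ncFifoSetOption() before
+ * ncFifoAllocate() is called.
+ */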
+
+int pushUserParam(struct _fifoPrivate_t *fH, void *user_param, int isIn)
+{
+    struct _userParamPrivate_t *new_user_param =
+        calloc(1, sizeof(struct _userParamPrivate_t));
+    if (!new_user_param) {
+        mvLog(MVLOG_ERROR, "Memory allocation failed");
+        return NC_OUT_OF_MEMORY;
+    }
+    new_user_param->next = NULL;
+    new_user_param->data = user_param;
+    if (isIn) {
+        new_user_param->next = fH->user_param_in;
+        fH->user_param_in = new_user_param;
+    } else {
+        new_user_param->next = fH->user_param_out;
+        fH->user_param_out = new_user_param;
+    }
+    return NC_OK;
+}
+int popUserParam(struct _fifoPrivate_t* fH, void** user_param, int isIn)
+{
+    struct _userParamPrivate_t* prev = NULL;
+    struct _userParamPrivate_t* curr = NULL;
+    if (isIn)
+        curr = fH->user_param_in;
+    else
+        curr = fH->user_param_out;
+
+    if (curr == NULL) {
+        *user_param = NULL;
+        mvLog(MVLOG_ERROR, "Trying to read user param from an empty queue!");
+        return NC_ERROR;
+    }
+
+    while (curr->next != NULL)
+    {
+        prev = curr;
+        curr = curr->next;
+    }
+
+    *user_param = curr->data;
+
+    if (prev)
+        prev->next = NULL;
+    else {
+        if (isIn)
+            fH->user_param_in = NULL;
+        else
+            fH->user_param_out = NULL;
+    }
+    free(curr);
+    curr = NULL;
+    return NC_OK;
+}
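+
+/*
+ * pushUserParam() prepends at the head of the list while popUserParam()
+ * walks to the tail, so together they form a FIFO queue: the user_param
+ * stored with the oldest write is the one returned by the oldest read.
+ */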
+
+void getStrides(ncFifoLayout_t layout, struct ncTensorDescriptor_t* desc,
+    ncFifoDataType_t dataType) {
+    int baseStride = dataType == NC_FIFO_FP16 ? FP16_DATA_SIZE : sizeof(float);
+    switch (layout) {
+        case NC_FIFO_HWC:
+            desc->cStride = baseStride;
+            desc->wStride = desc->cStride * desc->c;
+            desc->hStride = desc->wStride * desc->w;
+            break;
+        case NC_FIFO_CHW:
+            desc->wStride = baseStride;
+            desc->hStride = desc->wStride * desc->w;
+            desc->cStride = desc->hStride * desc->h;
+            break;
+        case NC_FIFO_HCW:
+            desc->wStride = baseStride;
+            desc->cStride = desc->wStride * desc->w;
+            desc->hStride = desc->cStride * desc->c;
+            break;
+        case NC_FIFO_CWH:
+            desc->hStride = baseStride;
+            desc->wStride = desc->hStride * desc->h;
+            desc->cStride = desc->wStride * desc->w;
+            break;
+        case NC_FIFO_WCH:
+            desc->hStride = baseStride;
+            desc->cStride = desc->hStride * desc->h;
+            desc->wStride = desc->cStride * desc->c;
+            break;
+        case NC_FIFO_WHC:
+            desc->cStride = baseStride;
+            desc->hStride = desc->cStride * desc->c;
+            desc->wStride = desc->hStride * desc->h;
+            break;
+        default:
+            break;
+    }
+}
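+
+/*
+ * Worked example (dimensions are assumptions): for a 1x3x224x224 (n,c,h,w)
+ * FP16 tensor in NC_FIFO_HWC layout, baseStride = 2 bytes, so cStride = 2,
+ * wStride = 2 * 3 = 6 and hStride = 6 * 224 = 1344 bytes.
+ */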
+
+static unsigned int getTotalSize(struct ncTensorDescriptor_t* desc) {
+    unsigned int maxStride;
+    unsigned int maxDim;
+
+    if (desc->wStride == desc->hStride &&
+        desc->wStride == desc->cStride) {
+        maxDim = MAX(desc->w, desc->h);
+        maxDim = MAX(maxDim, desc->c);
+        maxStride = desc->wStride;
+    } else if (desc->wStride >= desc->hStride &&
+               desc->wStride >= desc->cStride) {
+        maxStride = desc->wStride;
+        maxDim = desc->w;
+        if (desc->wStride == desc->hStride)
+            maxDim = MAX(desc->w, desc->h);
+        else if (desc->wStride == desc->cStride)
+            maxDim = MAX(desc->w, desc->c);
+    } else if (desc->hStride >= desc->wStride &&
+               desc->hStride >= desc->cStride) {
+        maxStride = desc->hStride;
+        maxDim = desc->h;
+        if (desc->hStride == desc->wStride)
+            maxDim = MAX(desc->h, desc->w);
+        else if (desc->hStride == desc->cStride)
+            maxDim = MAX(desc->h, desc->c);
+    } else {
+        maxStride = desc->cStride;
+        maxDim = desc->c;
+        if (desc->cStride == desc->wStride)
+            maxDim = MAX(desc->c, desc->w);
+        else if (desc->cStride == desc->hStride)
+            maxDim = MAX(desc->c, desc->h);
+    }
+    return desc->n * maxStride * maxDim;
+}
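+
+/*
+ * Continuing the HWC example above: hStride (1344) is the largest stride and
+ * h (224) its dimension, so totalSize = 1 * 1344 * 224 = 301056 bytes, which
+ * equals n * c * h * w * sizeof(fp16).
+ */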
+static unsigned int getElementSize(struct _fifoPrivate_t * handle) {
+    return handle->host_tensor_desc.totalSize;
+}
+
+ncStatus_t ncFifoAllocate(struct ncFifoHandle_t * fifoHandle,
+                          struct ncDeviceHandle_t * device,
+                          struct ncTensorDescriptor_t * tensor_desc,
+                          unsigned int numElem)
+{
+    mvLog(MVLOG_INFO, "Creating fifo");
+    CHECK_HANDLE_CORRECT(fifoHandle);
+    CHECK_HANDLE_CORRECT(device);
+
+    if (!tensor_desc || !numElem) {
+        mvLog(MVLOG_ERROR, "Some of the parameters are NULL");
+        return NC_INVALID_PARAMETERS;
+    }
+    if (tensor_desc->n * tensor_desc->c * tensor_desc->w * tensor_desc->h == 0
+        || !tensor_desc->totalSize) {
+        mvLog(MVLOG_ERROR,
+              "Tensor descriptor is invalid. Total size 0 or other element is zero");
+        return NC_INVALID_PARAMETERS;
+    }
+    struct _fifoPrivate_t *handle = fifoHandle->private_data;
+    if (handle->state == NC_FIFO_ALLOCATED) {
+        mvLog(MVLOG_ERROR, "Fifo has already been allocated");
+        return NC_UNAUTHORIZED;
+    }
+    if (handle->state != NC_FIFO_CREATED) {
+        mvLog(MVLOG_ERROR, "Fifo handle is corrupt or has been destroyed");
+        return NC_INVALID_HANDLE;
+    }
+    struct _devicePrivate_t *d = devices;
+    GLOBAL_LOCK();
+    while (d) {
+        if (d == device->private_data)
+            break;
+        d = d->next;
+    }
+    if (!d) {
+        GLOBAL_UNLOCK();
+        mvLog(MVLOG_ERROR, "Device not found!\n");
+        return NC_INVALID_PARAMETERS;
+    }
+    GLOBAL_UNLOCK();
+
+    handle->graph_tensor_desc = *tensor_desc;
+    handle->host_tensor_desc = *tensor_desc;
+    handle->graphLayout = getLayout(tensor_desc);
+    handle->user_param_in = NULL;
+    handle->user_param_out = NULL;
+    handle->num_elements = numElem;
+    handle->consumers_remaining = handle->consumer_cnt; //default consumers
+    handle->dev = d;
+    handle->next = NULL;
+
+    handle->datasize = getElementSize(handle);
+
+    if (d->fifos)
+        handle->next = d->fifos;
+    d->fifos = handle;
+
+    graphMonCommand_t cmd;
+    cmd.cmdClass = GRAPH_MON_CLASS_BUFFER_CMD;
+    cmd.cmd.buffCmd.type = BUFFER_ALLOCATE_CMD;
+    struct tensorDescriptor_t privateDesc;
+    privateDesc.c = tensor_desc->c;
+    privateDesc.n = tensor_desc->n;
+    privateDesc.h = tensor_desc->h;
+    privateDesc.w = tensor_desc->w;
+    // should be removed: #-17902
+    privateDesc.totalSize = tensor_desc->totalSize;
+    privateDesc.widthStride = tensor_desc->wStride;
+    privateDesc.heightStride = tensor_desc->hStride;
+    privateDesc.channelsStride = tensor_desc->cStride;
+
+    cmd.cmd.buffCmd.desc  = privateDesc;
+    cmd.cmd.buffCmd.elemCnt = numElem;
+    snprintf(cmd.cmd.buffCmd.name, 16, "FIFO%d", handle->id);
+    cmd.cmd.buffCmd.name[NC_MAX_NAME_SIZE - 1] = 0;
+    cmd.cmd.buffCmd.id = handle->id;
+
+    uint32_t writeSize;
+    if (fifoWriteAccess(handle)) {
+        writeSize = tensor_desc->totalSize * numElem;
+        cmd.cmd.buffCmd.writeChannel = 1;
+    } else {
+        cmd.cmd.buffCmd.writeChannel = 0;
+        writeSize = 8; // no write permission on this buffer, so we shouldn't bother allocating buffer on the device
+    }
+    if (fifoReadAccess(handle)) {
+        cmd.cmd.buffCmd.readChannel = 1;
+    } else {
+        cmd.cmd.buffCmd.readChannel = 0;
+    }
+    streamId_t streamId = XLinkOpenStream(d->xlink->linkId, cmd.cmd.buffCmd.name, writeSize);
+
+    char out_msg[NC_MAX_NAME_SIZE * 2];
+    snprintf(out_msg, NC_MAX_NAME_SIZE * 2, "%s %s", "can't open stream: ", cmd.cmd.buffCmd.name);
+
+    CHECK_STREAM_ID(streamId, {
+            handle->state = NC_FIFO_FAILED;
+            handle->dev->state = NC_DEVICE_FAILED;
+        }, out_msg);
+
+    handle->streamId = streamId;
+    CHECK_MUTEX_SUCCESS(pthread_mutex_lock(&d->graph_stream_m));
+
+    if (sendGraphMonitorRequest(d->graph_monitor_stream_id, &cmd)) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->graph_stream_m));
+        mvLog(MVLOG_ERROR, "can't send command\n");
+        return NC_ERROR;
+    }
+    if (checkGraphMonitorResponse(d->graph_monitor_stream_id)) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->graph_stream_m));
+        mvLog(MVLOG_ERROR, "myriad NACK\n");
+        return NC_ERROR;
+    }
+    CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->graph_stream_m));
+
+    handle->state = NC_FIFO_ALLOCATED;
+    return NC_OK;
+}
+
+ncStatus_t ncFifoDestroy(struct ncFifoHandle_t ** fifoHandle)
+{
+    CHECK_HANDLE_CORRECT(fifoHandle);
+    struct ncFifoHandle_t *fh = *fifoHandle;
+    if (!fh) {
+        mvLog(MVLOG_INFO, "handle is already destroyed");
+        return NC_OK;
+    }
+
+    struct _fifoPrivate_t *handle = fh->private_data;
+
+    if (handle->state == NC_FIFO_CREATED || handle->state == NC_FIFO_DEALLOCATED) {
+        pthread_mutex_t * fifo_mutex = &fh->private_data->fifo_mutex;
+#if !(defined(_WIN32) || defined(_WIN64))
+        /**
+         * There is no wrapper for pthread_mutex_trylock on Windows at the moment.
+         */
+        int error = pthread_mutex_trylock(fifo_mutex);
+        if (error && error != EBUSY) {
+            /**
+             * Calling pthread_mutex_unlock on a mutex that is not locked is
+             * undefined behavior, and there is no standard C API for checking
+             * whether a mutex is currently locked. After a successful
+             * pthread_mutex_trylock the mutex is locked and can safely be
+             * unlocked; EBUSY means it was already locked, which is not an
+             * error in this case.
+             */
+            mvLog(MVLOG_ERROR, "pthread_mutex_trylock(fifo_mutex) failed with error: %d", error);
+        }
+#endif
+
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(fifo_mutex));
+        CHECK_MUTEX_SUCCESS(pthread_mutex_destroy(fifo_mutex));
+
+        free(fh->private_data);
+        fh->private_data = NULL;
+
+        free(fh);
+        *fifoHandle = NULL;
+
+        return NC_OK;
+    }
+    if (!findFifo(handle)) {
+        mvLog(MVLOG_ERROR,
+              "fifo handle seems to be corrupt or has been destroyed");
+        return NC_INVALID_HANDLE;
+    }
+    //clean up fifo
+    /*if (fifoReadAccess(handle)) {
+        int fillLevel;
+        int rc = XLinkGetFillLevel(handle->streamId, 0, &fillLevel);
+        if (rc == X_LINK_SUCCESS) {
+            while (fillLevel && rc == X_LINK_SUCCESS) {
+                rc = XLinkReleaseData(handle->streamId);
+                fillLevel--;
+            }
+        }
+    }*/
+    // First write to the fifo to stop its thread
+    if (fifoWriteAccess(handle)) {
+        int msg = 0xdead;
+        if (XLinkWriteData(handle->streamId, (uint8_t *) &msg, sizeof(msg)) != 0) {
+            mvLog(MVLOG_ERROR, "Failed to write to fifo before deleting it!");
+            return NC_ERROR;
+        }
+    }
+
+    graphMonCommand_t cmd;
+    cmd.cmdClass = GRAPH_MON_CLASS_BUFFER_CMD;
+    cmd.cmd.buffCmd.type = BUFFER_DEALLOCATE_CMD;
+    cmd.cmd.buffCmd.id = handle->id;
+
+    struct _devicePrivate_t *d = handle->dev;
+    CHECK_MUTEX_SUCCESS(pthread_mutex_lock(&d->graph_stream_m));
+    if (sendGraphMonitorRequest(d->graph_monitor_stream_id, &cmd)) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->graph_stream_m));
+        mvLog(MVLOG_WARN, "can't send command\n");
+        return NC_ERROR;
+    }
+    if (checkGraphMonitorResponse(d->graph_monitor_stream_id)) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->graph_stream_m));
+        mvLog(MVLOG_WARN, "myriad NACK\n");
+        return NC_ERROR;
+    }
+    CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->graph_stream_m));
+
+    CHECK_MUTEX_SUCCESS(pthread_mutex_lock(&d->dev_data_m));
+    if (deallocateFifo(handle)) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_data_m));
+        return NC_INVALID_PARAMETERS;
+    }
+    CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->dev_data_m));
+
+    free(fh->private_data);
+    fh->private_data = NULL;
+    free(fh);
+    *fifoHandle = NULL;
+    return NC_OK;
+}
+
+ncStatus_t ncFifoWriteElem(struct ncFifoHandle_t * fifoHandle,
+                           const void *inputTensor,
+                           unsigned int * inputTensorLength,
+                           void *userParam)
+{
+    CHECK_HANDLE_CORRECT(fifoHandle);
+
+    if (inputTensorLength == NULL || *inputTensorLength == 0) {
+        mvLog(MVLOG_ERROR, "inputTensorLength is NULL or zero");
+        return NC_INVALID_PARAMETERS;
+    }
+    struct _fifoPrivate_t *handle = fifoHandle->private_data;
+    if (!findFifo(handle)) {
+        if (!handle) {
+            mvLog(MVLOG_ERROR,
+                  "fifo handle seems to be corrupt or has been destroyed");
+            return NC_INVALID_HANDLE;
+        }
+        if (handle->state == NC_FIFO_CREATED) {
+            mvLog(MVLOG_ERROR, "FIFO is not yet allocated");
+            return NC_NOT_ALLOCATED;
+        }
+        if (handle->state != NC_FIFO_ALLOCATED) {
+            mvLog(MVLOG_ERROR,
+                  "FIFO is not yet allocated or have been destroyed.");
+            return NC_UNAUTHORIZED;
+        }
+    }
+
+    CHECK_HANDLE_CORRECT_RC(inputTensor, NC_INVALID_PARAMETERS);
+
+    if (!fifoWriteAccess(handle)) {
+        mvLog(MVLOG_ERROR, "No write access to fifo");
+        return NC_UNAUTHORIZED;
+    }
+    if (*inputTensorLength != handle->datasize) {
+        mvLog(MVLOG_ERROR,
+              "input tensor length (%d) doesn't match expected value (%d)",
+              *inputTensorLength, handle->datasize);
+        *inputTensorLength = handle->datasize;
+        return NC_INVALID_DATA_LENGTH;
+    }
+    struct ncTensorDescriptor_t * inputDesc = &handle->graph_tensor_desc;
+
+    int rc;
+    // Convert fp32 to fp16 and/or input layout
+    ncFifoLayout_t layout = getLayout(inputDesc);
+    ncFifoLayout_t host_layout = getLayout(&handle->host_tensor_desc);
+    if (handle->host_tensor_desc.dataType == NC_FIFO_FP32 || layout != host_layout) {
+        mvLog(MVLOG_ERROR,
+              "This version of mvnc does not support converting layout and precision on the host\n");
+
+        return NC_UNSUPPORTED_FEATURE;
+    } else {
+        rc = XLinkWriteData(handle->streamId, inputTensor, *inputTensorLength);
+    }
+    if (rc != 0)
+        return NC_ERROR;
+
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&handle->fifo_mutex), NC_ERROR);
+    rc = pushUserParam(handle, userParam, 1);
+    if (rc != NC_OK) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&handle->fifo_mutex));
+        return rc;
+    }
+    handle->write_count++;
+    CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&handle->fifo_mutex));
+
+    mvLog(MVLOG_DEBUG, "write count %d num_elements %d userparam %p\n",
+          handle->write_count - 1, handle->num_elements, userParam);
+    return NC_OK;
+}
+
+ncStatus_t ncFifoReadElem(struct ncFifoHandle_t * fifoHandle, void *outputData,
+                          unsigned int *outputDataLen, void **userParam)
+{
+    if (!fifoHandle) {
+        mvLog(MVLOG_ERROR, "fifo handle is NULL");
+        return NC_INVALID_HANDLE;
+    }
+    if (!outputDataLen || (*outputDataLen != 0 && !outputData)) {
+        mvLog(MVLOG_ERROR, "Some of the parameters are NULL");
+        return NC_INVALID_PARAMETERS;
+    }
+
+    struct _fifoPrivate_t *handle = fifoHandle->private_data;
+    if (!findFifo(handle)) {
+        if (!handle) {
+            mvLog(MVLOG_ERROR,
+                  "fifo handle seems to be corrupt or has been destroyed");
+            return NC_INVALID_HANDLE;
+        }
+        if (handle->state == NC_FIFO_CREATED) {
+            mvLog(MVLOG_ERROR, "FIFO is not yet allocated");
+            return NC_NOT_ALLOCATED;
+        }
+    }
+
+    if (handle->state != NC_FIFO_ALLOCATED) {
+        mvLog(MVLOG_ERROR, "FIFO is not yet allocated or have been destroyed.");
+        return NC_UNAUTHORIZED;
+    }
+
+    if (*outputDataLen < handle->datasize) {
+        mvLog(MVLOG_ERROR,
+              "This datasize in tensorDesc (%d) is smaller than required (%d)!",
+              *outputDataLen, handle->datasize);
+        *outputDataLen = handle->datasize;
+        return NC_INVALID_DATA_LENGTH;
+    }
+
+    if (!fifoReadAccess(handle)) {
+        mvLog(MVLOG_ERROR, "FIFO has no read access");
+        return NC_UNAUTHORIZED;
+    }
+    if (handle->api_read_element != 0) {
+        mvLog(MVLOG_ERROR, "API already read this element");
+        return NC_UNAUTHORIZED;
+    }
+    streamPacketDesc_t *packet = 0;
+    if (!XLinkReadData(handle->streamId, &packet) && packet) {
+        // Convert fp16 to fp32 and/or layout
+        struct ncTensorDescriptor_t * fifoDesc = &handle->graph_tensor_desc;
+        ncFifoLayout_t layout = getLayout(fifoDesc);
+        ncFifoLayout_t host_layout = getLayout(&handle->host_tensor_desc);
+
+        if (handle->host_tensor_desc.dataType == NC_FIFO_FP32 ||
+            layout != host_layout) {
+            mvLog(MVLOG_ERROR,
+                  "This version of mvnc does not support converting layout and precision on the host\n");
+
+            return NC_UNSUPPORTED_FEATURE;
+        } else {
+            memcpy(outputData, packet->data, packet->length);
+        }
+        XLinkReleaseData(handle->streamId);
+    } else {
+        mvLog(MVLOG_ERROR, "Packet reading is failed.");
+        return NC_ERROR;
+    }
+
+    // An API read must look the same to the user as a graph read, so the element
+    // is tracked in two queues. Since we read it here, the element also needs to
+    // be removed on the device side; to avoid sending a message just for this
+    // purpose, that can be done at the next trigger which touches this FIFO.
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&handle->fifo_mutex), NC_ERROR);
+    handle->api_read_element = 1;
+
+    handle->consumers_remaining--;
+    if (handle->consumers_remaining == 0) {
+        handle->api_read_element = 0;
+        handle->consumers_remaining = handle->consumer_cnt;
+        //no other action required when the element is consumed
+    }
+    popUserParam(handle, userParam, 0);
+    CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&handle->fifo_mutex));
+    *outputDataLen = handle->datasize;
+    mvLog(MVLOG_DEBUG, "num_elements %d userparam %p output length %d\n",
+          handle->num_elements, userParam, handle->datasize);
+    return NC_OK;
+}
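+
+/*
+ * Round-trip sketch (illustrative; assumes fifos created by
+ * ncGraphAllocateWithFifos() above and the ncGraphQueueInference() entry
+ * point defined elsewhere in this API):
+ *
+ *     unsigned int inLen = inputSize;   // must equal the fifo element size
+ *     ncFifoWriteElem(inFifo, inputTensor, &inLen, NULL);
+ *     ncGraphQueueInference(graphHandle, &inFifo, 1, &outFifo, 1);
+ *     unsigned int outLen = outputSize;
+ *     void *userParam;
+ *     ncFifoReadElem(outFifo, outputTensor, &outLen, &userParam);
+ */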
+
+ncStatus_t ncFifoRemoveElem(struct ncFifoHandle_t* fifoHandle) {
+    CHECK_HANDLE_CORRECT(fifoHandle);
+
+    return NC_UNSUPPORTED_FEATURE;
+}
+
+ncStatus_t ncFifoSetOption(struct ncFifoHandle_t * fifoHandle, int option,
+                           const void *data, unsigned int dataLength)
+{
+    CHECK_HANDLE_CORRECT(fifoHandle);
+    CHECK_HANDLE_CORRECT_RC(data, NC_INVALID_PARAMETERS);
+    CHECK_HANDLE_CORRECT_WINFO(fifoHandle->private_data, MVLOG_ERROR,
+            "fifo handle is corrupt or has been destroyed");
+
+    struct _fifoPrivate_t *f = (struct _fifoPrivate_t *) fifoHandle->private_data;
+    if (f->state != NC_FIFO_CREATED && option != NC_RW_FIFO_HOST_TENSOR_DESCRIPTOR) {
+        mvLog(MVLOG_ERROR, "cannot set Fifo options after allocation");
+        return NC_UNAUTHORIZED;
+    }
+
+    switch (option) {
+    case NC_RW_FIFO_TYPE:{
+            unsigned int size = sizeof(ncFifoType_t);
+            if (dataLength < size) {
+                mvLog(MVLOG_ERROR,
+                      "data length of output buffer (%d) is smaller that required (%d)!\n",
+                      dataLength, size);
+                return NC_INVALID_DATA_LENGTH;
+            }
+            int tempType = *(ncFifoType_t *) data;
+            if (tempType != NC_FIFO_HOST_WO && tempType != NC_FIFO_HOST_RO) {
+                 mvLog(MVLOG_ERROR,
+                      "Type value set (%d) is invalid!\n",
+                      tempType);
+                return NC_INVALID_PARAMETERS;
+            }
+            f->type = tempType;
+            break;
+        }
+    case NC_RW_FIFO_CONSUMER_COUNT:{
+            unsigned int size = sizeof(int);
+            if (dataLength < size) {
+                mvLog(MVLOG_ERROR,
+                      "data length of output buffer (%d) is smaller that required (%d)!\n",
+                      dataLength, size);
+                return NC_INVALID_DATA_LENGTH;
+            }
+            f->consumer_cnt = *(int *) data;
+            break;
+        }
+    case NC_RW_FIFO_DATA_TYPE:{
+            unsigned int size = sizeof(ncFifoDataType_t);
+            if (dataLength < size) {
+                mvLog(MVLOG_ERROR,
+                      "data length of output buffer (%d) is smaller that required (%d)!\n",
+                      dataLength, size);
+                return NC_INVALID_DATA_LENGTH;
+            }
+            int tempDType = *(int *) data;
+            if (tempDType != NC_FIFO_FP16 && tempDType != NC_FIFO_FP32) {
+                mvLog(MVLOG_ERROR,
+                      "dataType value set (%d) is invalid!\n",
+                      tempDType);
+                return NC_INVALID_PARAMETERS;
+            }
+            f->host_tensor_desc.dataType = tempDType;
+            break;
+        }
+    case NC_RW_FIFO_HOST_TENSOR_DESCRIPTOR:{
+            unsigned int size = sizeof(struct ncTensorDescriptor_t);
+            if (dataLength < size) {
+                mvLog(MVLOG_ERROR,
+                      "data length of output buffer (%d) is smaller that required (%d)!\n",
+                      dataLength, size);
+                return NC_INVALID_DATA_LENGTH;
+            }
+
+            int expected_total_size = getTotalSize((struct ncTensorDescriptor_t *) data);
+            if (expected_total_size != ((struct ncTensorDescriptor_t *) data)->totalSize) {
+                mvLog(MVLOG_ERROR,
+                      "totalSize in host tensor descriptor (%d) doesn't match expeected totalSize (%d)!\n",
+                      ((struct ncTensorDescriptor_t *) data)->totalSize, expected_total_size);
+                return NC_INVALID_PARAMETERS;
+            }
+            if (f->state == NC_FIFO_ALLOCATED) {
+                struct ncTensorDescriptor_t* temp = (struct ncTensorDescriptor_t*) data;
+                if (temp->w != f->graph_tensor_desc.w ||
+                    temp->h != f->graph_tensor_desc.h ||
+                    temp->c != f->graph_tensor_desc.c ||
+                    temp->n != f->graph_tensor_desc.n)
+                {
+                    mvLog(MVLOG_ERROR, "trying to set host tensor decriptor to a shape that doesn't match graph tensor descriptor shape!\n");
+                    return NC_INVALID_PARAMETERS;
+                }
+            }
+
+            f->host_tensor_desc = *(struct ncTensorDescriptor_t *) data;
+            f->host_tensor_desc_set = 1;
+            f->datasize = getElementSize(f);
+
+            break;
+        }
+    case NC_RW_FIFO_DONT_BLOCK:
+        return NC_UNSUPPORTED_FEATURE;
+    case NC_RO_FIFO_CAPACITY:
+    case NC_RO_FIFO_READ_FILL_LEVEL:
+    case NC_RO_FIFO_WRITE_FILL_LEVEL:
+    case NC_RO_FIFO_GRAPH_TENSOR_DESCRIPTOR:
+    case NC_RO_FIFO_STATE:
+    case NC_RO_FIFO_ELEMENT_DATA_SIZE:
+        return NC_UNAUTHORIZED;
+    default:
+        return NC_INVALID_PARAMETERS;
+    }
+    return NC_OK;
+}
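+
+/*
+ * Sketch (illustrative): every option except NC_RW_FIFO_HOST_TENSOR_DESCRIPTOR
+ * must be set between ncFifoCreate() and ncFifoAllocate():
+ *
+ *     int consumers = 2;
+ *     ncFifoSetOption(fifo, NC_RW_FIFO_CONSUMER_COUNT,
+ *                     &consumers, sizeof(consumers));
+ */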
+
+ncStatus_t ncFifoGetOption(struct ncFifoHandle_t * fifoHandle, int option,
+                           void *data, unsigned int *dataLength)
+{
+    CHECK_HANDLE_CORRECT(fifoHandle);
+    CHECK_HANDLE_CORRECT_WINFO(fifoHandle->private_data, MVLOG_ERROR,
+            "Fifo is corrupt or has been destroyed")
+
+    if (!dataLength || (*dataLength != 0 && !data)) {
+        mvLog(MVLOG_ERROR, "Some of the parameters are NULL");
+        return NC_INVALID_PARAMETERS;
+    }
+
+    if (fifoHandle->private_data->state == NC_FIFO_CREATED &&
+        option != NC_RO_FIFO_STATE && option != NC_RW_FIFO_DATA_TYPE &&
+        option != NC_RW_FIFO_DONT_BLOCK && option != NC_RW_FIFO_CONSUMER_COUNT
+        && option != NC_RO_FIFO_NAME && option != NC_RW_FIFO_HOST_TENSOR_DESCRIPTOR) {
+        mvLog(MVLOG_ERROR,
+              "Fifo hasn't been allocated, cannot read those options");
+        return NC_NOT_ALLOCATED;
+    }
+    switch (option) {
+    case NC_RW_FIFO_CONSUMER_COUNT:
+    case NC_RO_FIFO_CAPACITY:
+    case NC_RO_FIFO_READ_FILL_LEVEL:
+    case NC_RO_FIFO_WRITE_FILL_LEVEL:
+    case NC_RO_FIFO_STATE:
+    case NC_RO_FIFO_ELEMENT_DATA_SIZE:
+        {
+            unsigned int size = sizeof(int);
+            if (*dataLength < size) {
+                mvLog(MVLOG_ERROR,
+                      "data length of output buffer (%d) is smaller that required (%d)!\n",
+                      *dataLength, size);
+                *dataLength = size;
+                return NC_INVALID_DATA_LENGTH;
+            }
+            break;
+        }
+    default:
+        break;
+    }
+
+    switch (option) {
+    case NC_RW_FIFO_TYPE:{
+            unsigned int size = sizeof(ncFifoType_t);
+            if (*dataLength < size) {
+                mvLog(MVLOG_ERROR,
+                      "data length of output buffer (%d) is smaller that required (%d)!\n",
+                      *dataLength, size);
+                *dataLength = size;
+                return NC_INVALID_DATA_LENGTH;
+            }
+            *(ncFifoType_t *) data = fifoHandle->private_data->type;
+            *dataLength = sizeof(fifoHandle->private_data->type);
+            break;
+        }
+    case NC_RW_FIFO_CONSUMER_COUNT:
+        *(int *) data = fifoHandle->private_data->consumer_cnt;
+        *dataLength = sizeof(fifoHandle->private_data->consumer_cnt);
+        break;
+    case NC_RO_FIFO_ELEMENT_DATA_SIZE:
+        *(int *) data = getElementSize(fifoHandle->private_data);
+        *dataLength = sizeof(fifoHandle->private_data->datasize);
+        break;
+    case NC_RW_FIFO_DATA_TYPE:
+        {
+            unsigned int size = sizeof(ncFifoDataType_t);
+            if (*dataLength < size) {
+                mvLog(MVLOG_ERROR,
+                      "data length of output buffer (%d) is smaller that required (%d)!\n",
+                      *dataLength, size);
+                *dataLength = size;
+                return NC_INVALID_DATA_LENGTH;
+            }
+            *(int *) data = fifoHandle->private_data->host_tensor_desc.dataType;
+            *dataLength = sizeof(fifoHandle->private_data->host_tensor_desc.dataType);
+            break;
+        }
+    case NC_RO_FIFO_CAPACITY:
+        *(int *) data = fifoHandle->private_data->num_elements;
+        *dataLength = sizeof(fifoHandle->private_data->num_elements);
+        break;
+    case NC_RO_FIFO_GRAPH_TENSOR_DESCRIPTOR:
+        {
+            unsigned int size = sizeof(struct ncTensorDescriptor_t);
+            if (*dataLength < size) {
+                mvLog(MVLOG_ERROR,
+                      "data length of output buffer (%d) is smaller that required (%d)!\n",
+                      *dataLength, size);
+                *dataLength = size;
+                return NC_INVALID_DATA_LENGTH;
+            }
+            if (fifoHandle->private_data->state != NC_FIFO_ALLOCATED)
+                return NC_UNAUTHORIZED; // before allocation, tensor_desc is NULL
+            *(struct ncTensorDescriptor_t *) data =
+                fifoHandle->private_data->graph_tensor_desc;
+            *dataLength = sizeof(fifoHandle->private_data->graph_tensor_desc);
+            break;
+        }
+    case NC_RW_FIFO_HOST_TENSOR_DESCRIPTOR:
+        {
+            unsigned int size = sizeof(struct ncTensorDescriptor_t);
+            if (*dataLength < size) {
+                mvLog(MVLOG_ERROR,
+                      "data length of output buffer (%d) is smaller that required (%d)!\n",
+                      *dataLength, size);
+                *dataLength = size;
+                return NC_INVALID_DATA_LENGTH;
+            }
+            if (fifoHandle->private_data->state != NC_FIFO_ALLOCATED &&
+                fifoHandle->private_data->host_tensor_desc_set == 0) {
+                mvLog(MVLOG_ERROR,
+                      "option NC_RW_FIFO_HOST_TENSOR_DESCRIPTOR cannot be read before it has been set or before Fifo has been allocated");
+                return NC_UNAUTHORIZED;
+            }
+            *(struct ncTensorDescriptor_t *) data =
+                fifoHandle->private_data->host_tensor_desc;
+            *dataLength = sizeof(fifoHandle->private_data->host_tensor_desc);
+            break;
+        }
+    case NC_RO_FIFO_READ_FILL_LEVEL:
+        {
+            struct _fifoPrivate_t *fi = fifoHandle->private_data;
+            if (!fifoReadAccess(fi))
+                return NC_UNAUTHORIZED;
+
+            *dataLength = sizeof(int);
+            if (fi->state != NC_FIFO_ALLOCATED) {
+                *(int *) data = 0;
+                break;
+            }
+            int fillLevel;
+            if (XLinkGetFillLevel(fi->streamId, 0, &fillLevel) == X_LINK_SUCCESS) {
+                *(int *) data = (fillLevel / fi->graph_tensor_desc.totalSize);
+            } else {
+                return NC_ERROR;
+            }
+
+            break;
+        }
+    case NC_RO_FIFO_WRITE_FILL_LEVEL:
+        {
+            struct _fifoPrivate_t *fi = fifoHandle->private_data;
+            if (!fifoWriteAccess(fi))
+                return NC_UNAUTHORIZED;
+
+            *dataLength = sizeof(int);
+            if (fi->state != NC_FIFO_ALLOCATED) {
+                *(int *) data = 0;
+                break;
+            }
+            int fillLevel;
+            if (XLinkGetFillLevel(fi->streamId, 1, &fillLevel) == X_LINK_SUCCESS) {
+                *(int *) data = (fillLevel / fi->graph_tensor_desc.totalSize);
+            } else {
+                return NC_ERROR;
+            }
+
+            break;
+        }
+    case NC_RW_FIFO_DONT_BLOCK:
+        return NC_UNSUPPORTED_FEATURE; // TODO: XLink support for this (fill level may be enough for it)
+    case NC_RO_FIFO_STATE:
+        *(int *) data = fifoHandle->private_data->state;
+        *dataLength = sizeof(int);
+        break;
+    case NC_RO_FIFO_NAME:
+        if (*dataLength < strlen(fifoHandle->private_data->name) + 1) {
+            mvLog(MVLOG_ERROR,
+                  "data length of output buffer (%d) is smaller that required (%zu)!\n",
+                  *dataLength, strlen(fifoHandle->private_data->name) + 1);
+            *dataLength = strlen(fifoHandle->private_data->name) + 1;
+            return NC_INVALID_DATA_LENGTH;
+        }
+        *dataLength = strlen(fifoHandle->private_data->name) + 1;
+        strncpy((char *) data, fifoHandle->private_data->name, *dataLength);
+        break;
+    default:
+        return NC_INVALID_PARAMETERS;
+    }
+    return NC_OK;
+}
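+
+/*
+ * Usage sketch (illustrative, not part of the original API surface): querying the
+ * capacity of an allocated FIFO, and recovering from a too-small output buffer.
+ *
+ *     int capacity = 0;
+ *     unsigned int length = sizeof(capacity);
+ *     ncStatus_t rc = ncFifoGetOption(fifoHandle, NC_RO_FIFO_CAPACITY, &capacity, &length);
+ *     if (rc == NC_INVALID_DATA_LENGTH) {
+ *         // length now holds the required buffer size
+ *     }
+ */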
+
+static ncStatus_t tensorCompatibility(struct ncTensorDescriptor_t *tens1,
+                                      struct ncTensorDescriptor_t *tens2)
+{
+    if (tens1->totalSize != tens2->totalSize ||
+        tens1->n != tens2->n || tens1->c != tens2->c ||
+        tens1->h != tens2->h || tens1->w != tens2->w)
+        return NC_ERROR;
+    return NC_OK;
+}
+
+ncStatus_t ncGraphQueueInference(struct ncGraphHandle_t * graphHandle,
+                                 struct ncFifoHandle_t ** fifoIn,
+                                 unsigned int inFifoCount,
+                                 struct ncFifoHandle_t ** fifoOut,
+                                 unsigned int outFifoCount)
+{
+    mvLog(MVLOG_DEBUG, "Trigger start");
+    CHECK_HANDLE_CORRECT(graphHandle);
+    CHECK_HANDLE_CORRECT(fifoIn);
+    CHECK_HANDLE_CORRECT(fifoOut);
+
+    if (!fifoIn[0] || !fifoOut[0]) {
+        mvLog(MVLOG_ERROR, "Fifos data are NULL");
+        return NC_INVALID_HANDLE;
+    }
+    if (!inFifoCount || !outFifoCount)
+        return NC_INVALID_PARAMETERS;
+
+    struct _graphPrivate_t *g = graphHandle->private_data;
+
+    if(g) {
+        CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&g->dev->graph_stream_m), NC_ERROR);
+    } else {
+        return NC_NOT_ALLOCATED;
+    }
+
+    if (g->state != NC_GRAPH_ALLOCATED) {
+        mvLog(MVLOG_ERROR, "Graph hasn't been allocated");
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+        return NC_NOT_ALLOCATED;
+    }
+
+    if (g->input_count != inFifoCount || g->output_count != outFifoCount) {
+        mvLog(MVLOG_ERROR,
+              "number of input or output fifos is not compatible with graph");
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+        return NC_INVALID_PARAMETERS;
+    }
+
+    if (inFifoCount != 1 || outFifoCount != 1) {
+        mvLog(MVLOG_ERROR,
+              "Currently multiple inputs and outputs are not supported");
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+        return NC_UNSUPPORTED_FEATURE;
+    }
+    struct _fifoPrivate_t *fi = fifoIn[0]->private_data;
+    struct _fifoPrivate_t *fo = fifoOut[0]->private_data;
+    ncStatus_t rc;
+    if (fi->state != NC_FIFO_ALLOCATED || fo->state != NC_FIFO_ALLOCATED) {
+        mvLog(MVLOG_ERROR, "ffos hasn't been allocated");
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+        return NC_NOT_ALLOCATED;
+    }
+    // host-write-only FIFOs give the graph no access, so they cannot be used as graph output
+    if (fo->type == NC_FIFO_HOST_WO) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+        return NC_INVALID_PARAMETERS;
+    }
+    if (tensorCompatibility(&fi->graph_tensor_desc, &g->input_tensor_desc) != NC_OK ||
+        tensorCompatibility(&fo->graph_tensor_desc,
+                            &g->output_tensor_desc) != NC_OK) {
+        mvLog(MVLOG_WARN,
+              "Input/Output tensor shape is not compatible with graph");
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+        return NC_INVALID_PARAMETERS;
+    }
+
+    graphMonCommand_t cmd;
+    cmd.cmdClass = GRAPH_MON_CLASS_GRAPH_CMD;
+    cmd.cmd.graphCmd.type = GRAPH_TRIGGER_CMD;
+    cmd.cmd.graphCmd.id = g->id;
+    cmd.cmd.graphCmd.buffId1 = fi->id;
+    cmd.cmd.graphCmd.buffId2 = fo->id;
+
+    void* user_param;
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&fi->fifo_mutex), NC_ERROR);
+    fi->consumers_remaining--;
+
+    if (fi->consumers_remaining == 0) {
+        if (!fi->api_read_element && fifoReadAccess(fi)) { // the element was entirely consumed by graphs, so free it up from XLink
+            streamPacketDesc_t* packet = 0;
+            XLinkError_t rc = XLinkReadData(fi->streamId, &packet);
+            if (rc) {
+                mvLog(MVLOG_ERROR, "Can't read packet, rc: %s", XLinkErrorToStr(rc));
+                CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&fi->fifo_mutex));
+                fi->dev->state = NC_DEVICE_FAILED;
+                CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+                return parseXLinkError(rc);
+            }
+            rc = XLinkReleaseData(fi->streamId);
+            if (rc) {
+                mvLog(MVLOG_ERROR,"Failed to release data, rc: %s", XLinkErrorToStr(rc));
+                CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&fi->fifo_mutex));
+                fi->dev->state = NC_DEVICE_FAILED;
+                CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+                return parseXLinkError(rc);
+            }
+        }
+        fi->consumers_remaining = fi->consumer_cnt;
+        fi->api_read_element = 0;
+    }
+    popUserParam(fi, &user_param, 1);
+    if (fi->write_count <= fi->consumed_by_graph) {
+        mvLog(MVLOG_WARN, "No point on triggering graph. There are no more elements in the input FIFO");
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&fi->fifo_mutex));
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+        return NC_UNAUTHORIZED;
+    }
+    fi->consumed_by_graph++;
+    CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&fi->fifo_mutex));
+
+    CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&fo->fifo_mutex), NC_ERROR);
+    rc = pushUserParam(fo, user_param , 0);
+    if(rc != NC_OK) {
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&fo->fifo_mutex));
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+        return rc;
+    }
+    fo->write_count++;
+    CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&fo->fifo_mutex));
+
+    if(sendGraphMonitorRequest(g->dev->graph_monitor_stream_id, &cmd)) {
+        mvLog(MVLOG_ERROR, "Can't send trigger request");
+        g->dev->state = NC_DEVICE_FAILED;
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+        return NC_ERROR;
+    }
+    if(checkGraphMonitorResponse(g->dev->graph_monitor_stream_id)) {
+        mvLog(MVLOG_ERROR, "Can't get trigger response");
+        g->dev->state = NC_DEVICE_FAILED;
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+        return NC_ERROR;
+    }
+    CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&g->dev->graph_stream_m));
+    g->started = 1;
+    mvLog(MVLOG_DEBUG, "Trigger end");
+    return NC_OK;
+}
+
+ncStatus_t ncGraphQueueInferenceWithFifoElem(struct ncGraphHandle_t *
+                                             graphHandle,
+                                             struct ncFifoHandle_t * fifoIn,
+                                             struct ncFifoHandle_t * fifoOut,
+                                             const void *inputTensor,
+                                             unsigned int * inputTensorLength,
+                                             void *userParam)
+{
+    ncStatus_t rc = ncFifoWriteElem(fifoIn, inputTensor, inputTensorLength,
+                                    userParam);
+    if (rc != NC_OK)
+        return rc;
+
+    return ncGraphQueueInference(graphHandle, &fifoIn, 1, &fifoOut, 1);
+}
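+
+/*
+ * Usage sketch (illustrative; assumes an allocated graph and input/output FIFOs,
+ * and an `inputTensor` buffer of `inputTensorLength` bytes):
+ *
+ *     unsigned int len = inputTensorLength;
+ *     ncStatus_t rc = ncGraphQueueInferenceWithFifoElem(graphHandle, fifoIn, fifoOut,
+ *                                                       inputTensor, &len, NULL);
+ *     // on success, the result can later be read back from fifoOut with ncFifoReadElem()
+ */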
diff --git a/inference-engine/thirdparty/movidius/shared/include/mvLog.h b/inference-engine/thirdparty/movidius/shared/include/mvLog.h
new file mode 100644 (file)
index 0000000..b597477
--- /dev/null
@@ -0,0 +1,171 @@
+/*
+* Copyright 2017-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+/*
+ * Adds logging capabilities on top of plain printf.
+ * Five logging levels are available:
+ *
+ * MVLOG_DEBUG = 0
+ * MVLOG_INFO = 1
+ * MVLOG_WARN = 2
+ * MVLOG_ERROR = 3
+ * MVLOG_FATAL = 4
+ *
+ * A unit name can be defined before including this header; otherwise it defaults to "global", e.g.:
+ *
+ * #define MVLOG_UNIT_NAME unitname
+ * #include <mvLog.h>
+ *
+ * The log level can be set through a debugger in the following way:
+ * mset mvLogLevel_unitname 2
+ * This sets the log level to warnings and above.
+ */
+#ifndef MVLOG_H__
+#define MVLOG_H__
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <inttypes.h>
+#include <time.h>
+
+#ifdef __RTEMS__
+#include <rtems.h>
+#include <rtems/bspIo.h>
+#endif
+
+ // Windows-only
+#if (defined (WINNT) || defined(_WIN32) || defined(_WIN64) )
+#define __attribute__(x)
+#define FUNCATTR_WEAK static
+#else
+#define FUNCATTR_WEAK
+#endif
+
+#ifndef MVLOG_UNIT_NAME
+#define MVLOG_UNIT_NAME global
+#endif
+
+#define _MVLOGLEVEL(UNIT_NAME)  mvLogLevel_ ## UNIT_NAME
+#define  MVLOGLEVEL(UNIT_NAME) _MVLOGLEVEL(UNIT_NAME)
+
+#define STR(x) _STR(x)
+#define _STR(x)  #x
+
+#define UNIT_NAME_STR STR(MVLOG_UNIT_NAME)
+
+#define ANSI_COLOR_RED     "\x1b[31m"
+#define ANSI_COLOR_GREEN   "\x1b[32m"
+#define ANSI_COLOR_YELLOW  "\x1b[33m"
+#define ANSI_COLOR_BLUE    "\x1b[34m"
+#define ANSI_COLOR_MAGENTA "\x1b[35m"
+#define ANSI_COLOR_CYAN    "\x1b[36m"
+#define ANSI_COLOR_WHITE   "\x1b[37m"
+#define ANSI_COLOR_RESET   "\x1b[0m"
+
+#ifndef MVLOG_DEBUG_COLOR
+#define MVLOG_DEBUG_COLOR ANSI_COLOR_WHITE
+#endif
+
+#ifndef MVLOG_INFO_COLOR
+#define MVLOG_INFO_COLOR ANSI_COLOR_CYAN
+#endif
+
+#ifndef MVLOG_WARN_COLOR
+#define MVLOG_WARN_COLOR ANSI_COLOR_YELLOW
+#endif
+
+#ifndef MVLOG_ERROR_COLOR
+#define MVLOG_ERROR_COLOR ANSI_COLOR_MAGENTA
+#endif
+
+#ifndef MVLOG_FATAL_COLOR
+#define MVLOG_FATAL_COLOR ANSI_COLOR_RED
+#endif
+
+typedef enum mvLog_t{
+    MVLOG_DEBUG = 0,
+    MVLOG_INFO,
+    MVLOG_WARN,
+    MVLOG_ERROR,
+    MVLOG_FATAL,
+    MVLOG_LAST,
+} mvLog_t;
+
+static const char mvLogHeader[MVLOG_LAST][30] =
+{
+    MVLOG_DEBUG_COLOR "D:",
+    MVLOG_INFO_COLOR  "I:",
+    MVLOG_WARN_COLOR  "W:",
+    MVLOG_ERROR_COLOR "E:",
+    MVLOG_FATAL_COLOR "F:"
+};
+
+FUNCATTR_WEAK unsigned int __attribute__ ((weak)) MVLOGLEVEL(MVLOG_UNIT_NAME) = MVLOG_LAST; // not set by default
+
+FUNCATTR_WEAK unsigned int __attribute__ ((weak)) MVLOGLEVEL(default) = MVLOG_WARN;
+
+static int __attribute__ ((unused))
+logprintf(enum mvLog_t lvl, const char * func, const int line,
+                     const char * format, ...)
+{
+    if((MVLOGLEVEL(MVLOG_UNIT_NAME) == MVLOG_LAST && lvl < MVLOGLEVEL(default)))
+        return 0;
+
+    if((MVLOGLEVEL(MVLOG_UNIT_NAME) < MVLOG_LAST && lvl < MVLOGLEVEL(MVLOG_UNIT_NAME)))
+        return 0;
+
+    const char headerFormat[] = "%s [%s] [%10" PRId64 "] %s:%d\t";
+#ifdef __RTEMS__
+    uint64_t timestamp = rtems_clock_get_uptime_nanoseconds() / 1000;
+#elif !defined(_WIN32)
+    struct timespec spec;
+    clock_gettime(CLOCK_REALTIME, &spec);
+    uint64_t timestamp = (spec.tv_sec % 1000) * 1000 + spec.tv_nsec / 1e6;
+#else
+    uint64_t timestamp = 0;
+#endif
+    va_list args;
+    va_start (args, format);
+
+#ifdef __RTEMS__
+    if(!rtems_interrupt_is_in_progress())
+    {
+#endif
+        fprintf(stdout, headerFormat, mvLogHeader[lvl], UNIT_NAME_STR, timestamp, func, line);
+        vfprintf(stdout, format, args);
+        fprintf(stdout, "%s\n", ANSI_COLOR_RESET);
+#ifdef __RTEMS__
+    }
+    else
+    {
+        printk(headerFormat, mvLogHeader[lvl], UNIT_NAME_STR, timestamp, func, line);
+        vprintk(format, args);
+        printk("%s\n", ANSI_COLOR_RESET);
+    }
+#endif
+    va_end (args);
+    return 0;
+}
+
+#define mvLog(lvl, format, ...)                                 \
+    logprintf(lvl, __func__, __LINE__, format, ##__VA_ARGS__)
+
+// Set the log level for the current unit. The level takes effect only if it is below MVLOG_LAST.
+#define mvLogLevelSet(lvl) if(lvl < MVLOG_LAST){ MVLOGLEVEL(MVLOG_UNIT_NAME) = lvl; }
+// Set the global default log level. Can be used to prevent modules from hiding messages (enable all of them with a single change).
+#define mvLogDefaultLevelSet(lvl) if(lvl < MVLOG_LAST){ MVLOGLEVEL(default) = lvl; }
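+
+// Minimal usage sketch (illustrative, not part of the original header):
+//
+//     #define MVLOG_UNIT_NAME mymodule
+//     #include <mvLog.h>
+//
+//     static void report(int rc) {
+//         mvLogLevelSet(MVLOG_INFO);
+//         if (rc != 0)
+//             mvLog(MVLOG_ERROR, "operation failed, rc = %d", rc);
+//         else
+//             mvLog(MVLOG_INFO, "operation succeeded");
+//     }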
+
+#endif
diff --git a/inference-engine/thirdparty/movidius/shared/include/mvMacros.h b/inference-engine/thirdparty/movidius/shared/include/mvMacros.h
new file mode 100644 (file)
index 0000000..8e21ba9
--- /dev/null
@@ -0,0 +1,65 @@
+/*
+* Copyright 2017-2019 Intel Corporation.
+* The source code, information and material ("Material") contained herein is
+* owned by Intel Corporation or its suppliers or licensors, and title to such
+* Material remains with Intel Corporation or its suppliers or licensors.
+* The Material contains proprietary information of Intel or its suppliers and
+* licensors. The Material is protected by worldwide copyright laws and treaty
+* provisions.
+* No part of the Material may be used, copied, reproduced, modified, published,
+* uploaded, posted, transmitted, distributed or disclosed in any way without
+* Intel's prior express written permission. No license under any patent,
+* copyright or other intellectual property rights in the Material is granted to
+* or conferred upon you, either expressly, by implication, inducement, estoppel
+* or otherwise.
+* Any license under such intellectual property rights must be express and
+* approved by Intel in writing.
+*/
+
+#ifndef MVMACROS_H__
+#define MVMACROS_H__
+
+#define COUNT_OF(x) ((sizeof(x)/sizeof(0[x])) / ((!(sizeof(x) % sizeof(0[x])))))
+#ifndef MIN
+#define MIN(a,b)                                \
+    ({ __typeof__ (a) _a = (a);                 \
+        __typeof__ (b) _b = (b);                \
+        _a < _b ? _a : _b; })
+#endif
+#ifndef MAX
+#define MAX(a,b)                                \
+    ({ __typeof__ (a) _a = (a);                 \
+        __typeof__ (b) _b = (b);                \
+        _a > _b ? _a : _b; })
+#endif
+/// @brief Aligns a pointer or number up to the given power-of-2 value
+/// @param[in] x number or pointer to be aligned
+/// @param[in] a value to align to (must be power of 2)
+/// @returns the aligned value
+#if (defined(_WIN32) || defined(_WIN64) )
+#define ALIGN_UP_UINT32(x, a)   ((uint32_t)(((uint32_t)(x) + a - 1) & (~(a-1))))
+#define ALIGN_UP_INT32(x, a)   ((int32_t)(((uint32_t)(x) + a - 1) & (~(a-1))))
+#define ALIGN_UP(x, a) ALIGN_UP_UINT32(x,a)
+#else
+#define ALIGN_UP(x, a)   ((typeof(x))(((uint32_t)(x) + a - 1) & (~(a-1))))
+#define ALIGN_DOWN(x, a) ((typeof(x))(((uint32_t)(x)) & (~(a-1))) )
+#define ALIGN_UP_UINT32(_x, _a)   ALIGN_UP(_x, _a)
+#define ALIGN_UP_INT32(_x, _a)   ALIGN_UP(_x, _a)
+#endif
+/// @brief Aligns an integer number up to any given value
+/// @param[in] x integer number to be aligned
+/// @param[in] a value to align to
+/// @returns the aligned value
+#ifndef ROUND_UP
+#define ROUND_UP(x, a)   ((__typeof__(x))((((uint32_t)(x) + a - 1) / a) * a))
+#endif
+#define ROUND_DOWN(x, a) ((__typeof__(x))(((uint32_t)(x) / a + 0) * a))
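+
+// Worked examples (illustrative): ALIGN_UP(13, 8) == 16, since ((13 + 7) & ~7) == 16;
+// ROUND_UP(13, 5) == 15, since ((13 + 4) / 5) * 5 == 15; ROUND_DOWN(13, 5) == 10.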
+
+#if defined(__GNUC__) || defined(__sparc_v8__)
+#define ATTR_UNUSED __attribute__((unused))
+#else
+#define ATTR_UNUSED
+#endif
+
+#endif
+
diff --git a/inference-engine/thirdparty/movidius/watchdog/watchdog.cpp b/inference-engine/thirdparty/movidius/watchdog/watchdog.cpp
new file mode 100644 (file)
index 0000000..13cacbe
--- /dev/null
@@ -0,0 +1,466 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <thread>
+#include <future>
+#include <vector>
+#include <ostream>
+#include <iostream>
+#include <utility>
+#include <watchdog.h>
+#include <watchdogPrivate.hpp>
+#include <algorithm>
+#include <memory>
+#include <XLinkPublicDefines.h>
+#include <ncCommPrivate.h>
+#include <XLink.h>
+#include <mvnc.h>
+#include <ncPrivateTypes.h>
+
+
+#define MVLOG_UNIT_NAME watchdog
+#include <mvLog.h>
+#include <list>
+#define _XLINK_ENABLE_PRIVATE_INCLUDE_
+#include <XLinkPrivateDefines.h>
+
+namespace {
+
+using namespace std;
+using namespace chrono;
+using namespace Watchdog;
+
+/**
+ * @brief implementation of a watchdog device using its XLink representation
+ */
+class XLinkDevice : public IDevice {
+    _devicePrivate_t privateDevice;
+    using time_point = std::chrono::high_resolution_clock::time_point;
+    time_point lastPongTime = time_point::min();
+    time_point lastPingTime = time_point::min();
+    enum : int { deviceHangTimeout = 12000};
+
+public:
+    explicit XLinkDevice(devicePrivate_t *pDevice)
+        : privateDevice(*pDevice) {
+        setInterval(milliseconds(privateDevice.wd_interval));
+    }
+
+    void setInterval(const std::chrono::milliseconds msInterval) noexcept override {
+        privateDevice.wd_interval = std::max(static_cast<int>(msInterval.count()), WATCHDOG_PING_INTERVAL_MS);
+    }
+
+    void keepAlive(const time_point &current_time) noexcept override {
+        bool bPong = sendPingMessage();
+        // treat this as the first pong time even if no pong has actually happened yet, as a boot-time baseline
+        if (lastPongTime == time_point::min()) {
+            lastPongTime = current_time;
+        }
+
+        lastPingTime = current_time;
+
+        int diff = duration_cast<milliseconds>(current_time - lastPongTime).count();
+
+        if (bPong) {
+            lastPongTime = current_time;
+            mvLog(MVLOG_INFO, "[%p] device, ping succeed after %d ms\n", privateDevice.xlink, diff);
+        } else {
+            mvLog(MVLOG_WARN, "[%p] device, no response for %d ms\n", privateDevice.xlink, diff);
+        }
+    }
+
+    milliseconds dueIn(const time_point &current_time) const noexcept override {
+        if (lastPingTime == time_point::min())
+            return milliseconds::zero();
+
+        // overdue
+        if (current_time - lastPingTime > std::chrono::milliseconds(privateDevice.wd_interval)) {
+            return milliseconds::zero();
+        }
+
+        return duration_cast<milliseconds>(lastPingTime + std::chrono::milliseconds(privateDevice.wd_interval) - current_time);
+    }
+
+    /**
+     * @brief means device is hanging
+     */
+    bool isTimeout() const noexcept override {
+        if (lastPongTime > lastPingTime) return false;
+        if (lastPingTime - lastPongTime > milliseconds(deviceHangTimeout)) {
+            // reset the XLink connection, aborting all semaphores waited on in other threads
+            XLinkResetAll();
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * @brief gets an opaque handle that distinguishes one devicePrivate_t from another
+     */
+    void *getHandle() const noexcept override {
+        return privateDevice.xlink;
+    }
+
+private:
+    bool sendPingMessage() {
+        XLinkError_t rc = X_LINK_SUCCESS;
+        CHECK_MUTEX_SUCCESS_RC(pthread_mutex_lock(&privateDevice.dev_stream_m), false);
+
+        deviceCommand_t config;
+        config.type.c1 = CLASS1_WATCHDOG_PING;
+        config.optionClass = NC_OPTION_CLASS1;
+
+        // the XLink ping acknowledgement timeout shouldn't be longer than the expected ping interval
+        rc = XLinkWriteDataWithTimeout(privateDevice.device_mon_stream_id, (const uint8_t*)&config, sizeof(config), deviceHangTimeout);
+
+        CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&privateDevice.dev_stream_m));
+
+        if (rc != X_LINK_SUCCESS) {
+            mvLog(MVLOG_ERROR, "Failed send ping message: %s", XLinkErrorToStr(rc));
+            return false;
+        }
+        return true;
+    }
+};
+
+/**
+ * @brief a device that has just been added to the watchdog should be due immediately, not after an interval
+ */
+class NoDueOnFirstCall : public IDevice {
+    std::shared_ptr<IDevice> original;
+    bool bFirstCall = false;
+ public:
+    NoDueOnFirstCall(const std::shared_ptr<IDevice> & original) : original(original) {}
+    void setInterval(const std::chrono::milliseconds msInterval) noexcept override {
+        original->setInterval(msInterval);
+    }
+    void keepAlive(const time_point &current_time) noexcept override  {
+        original->keepAlive(current_time);
+        bFirstCall = true;
+    }
+    std::chrono::milliseconds dueIn(const time_point &current_time) const noexcept override {
+        if (!bFirstCall) {
+            return milliseconds::zero();
+        }
+        return original->dueIn(current_time);
+    }
+    bool isTimeout() const noexcept override {
+        return original->isTimeout();
+    }
+    void *getHandle() const noexcept override {
+        return original->getHandle();
+    }
+};
+
+static void * WD_OPAQUE_MAGIC = reinterpret_cast<void*>(0xdeadbeaf);
+
+struct wd_context_opaque {
+    void * magic = WD_OPAQUE_MAGIC;
+    IDevice * actual = nullptr;
+    bool   destroyed = false;
+    void *handleCached = nullptr;
+};
+
+class WatchdogImpl {
+    enum : uint8_t {
+        STATE_IDLE = 0,
+        INITIATE_THREAD_STOP = 1,
+        THREAD_EXITED = 2,
+        WAKE_UP_THREAD = 3,
+    };
+
+    using wd_context_as_tuple = std::tuple<std::shared_ptr<IDevice>, bool*, void*>;
+
+    using Devices = std::list<wd_context_as_tuple>;
+    Devices watchedDevices;
+    std::mutex devicesListAcc;
+    std::atomic<int> generation = {0};
+    std::atomic_bool threadRunning;
+    volatile std::uint8_t notificationReason = STATE_IDLE;
+    std::condition_variable wakeUpPingThread;
+
+    std::thread poolThread;
+
+    WatchdogImpl() = default;
+    WatchdogImpl(const WatchdogImpl&) = delete;
+    WatchdogImpl(WatchdogImpl&&) = delete;
+    WatchdogImpl& operator = (const WatchdogImpl&) = delete;
+    WatchdogImpl& operator = (WatchdogImpl&&) = delete;
+ public:
+
+    static WatchdogImpl &instance() {
+        static WatchdogImpl watchdog;
+        return watchdog;
+    }
+
+    ~WatchdogImpl() {
+        mvLog(MVLOG_INFO, "watchdog terminated\n");
+        {
+            auto __lock = lock();
+            for (auto &item : watchedDevices) {
+                *std::get<1>(item) = true;
+                mvLog(MVLOG_WARN, "[%p] device, stop watching due to watchdog termination\n", std::get<2>(item));
+            }
+            notificationReason = THREAD_EXITED;
+        }
+
+        wakeUpPingThread.notify_one();
+
+        if (poolThread.joinable()) {
+            poolThread.join();
+        }
+    }
+
+public:
+    void *register_device(std::shared_ptr<IDevice> device) {
+        auto __locker = lock();
+        std::unique_ptr<wd_context_opaque> ctx (new wd_context_opaque);
+
+        // rare case of exact pointer address collision
+        if (ctx.get() == WD_OPAQUE_MAGIC) {
+            std::unique_ptr<wd_context_opaque> ctx2(new wd_context_opaque);
+            ctx.reset(ctx2.release());
+        }
+
+        if (!threadRunning) {
+            if (poolThread.joinable()) {
+                poolThread.join();
+            }
+            threadRunning = true;
+
+            poolThread = std::thread([this]() {
+                watchdog_routine();
+            });
+        } else {
+            // wake up thread
+            notificationReason = WAKE_UP_THREAD;
+            wakeUpPingThread.notify_one();
+        }
+
+        ctx->handleCached = device->getHandle();
+        watchedDevices.emplace_back(device, &ctx->destroyed, ctx->handleCached);
+
+        ctx->actual = std::get<0>(watchedDevices.back()).get();
+
+        return ctx.release();
+    }
+
+    void *register_device(devicePrivate_t *device) {
+        return register_device(std::make_shared<NoDueOnFirstCall>(std::make_shared<XLinkDevice>(device)));
+    }
+
+    bool remove_device(void *opaque) {
+        mvLog(MVLOG_INFO, "remove_device : %p\n", opaque);
+        auto ptr  = reinterpret_cast<wd_context_opaque *>(opaque);
+        if (ptr == nullptr) {
+            return false;
+        }
+        auto __locker = lock();
+
+        // the device was already removed by the watchdog thread
+        if (ptr->destroyed) {
+            delete ptr;
+            return true;
+        }
+
+        auto idx = std::find_if(std::begin(watchedDevices),
+                                std::end(watchedDevices),
+                                [ptr](const wd_context_as_tuple &item) {
+                                    return std::get<0>(item)->getHandle() == ptr->actual->getHandle();
+                                });
+        bool bFound = idx != std::end(watchedDevices);
+        if (bFound) {
+            watchedDevices.erase(idx);
+        }
+
+        // wake up the thread: the removed device might have been selected as the next one to ping, and no devices may be left
+        notificationReason = WAKE_UP_THREAD;
+        __locker.unlock();
+        wakeUpPingThread.notify_one();
+
+        return bFound;
+    }
+
+    void clear() {
+        {
+            mvLog(MVLOG_INFO, "clear\n");
+            auto __locker = lock();
+            watchedDevices.clear();
+            notificationReason = WAKE_UP_THREAD;
+        }
+        // wake up thread
+        wakeUpPingThread.notify_one();
+    }
+
+ private:
+    std::unique_lock<std::mutex> lock() {
+        return std::unique_lock<std::mutex>(devicesListAcc);
+    }
+
+    void watchdog_routine() noexcept {
+        try {
+            mvLog(MVLOG_INFO, "thread started\n");
+
+            milliseconds sleepInterval;
+            auto __locker = lock();
+            do {
+                for (auto deviceIt = watchedDevices.begin(); deviceIt != watchedDevices.end(); ) {
+                    auto &device = std::get<0>(*deviceIt);
+                    auto isReady = device->dueIn(high_resolution_clock::now()).count() == 0;
+                    if (isReady) {
+                        auto now = high_resolution_clock::now();
+                        device->keepAlive(now);
+                        mvLog(MVLOG_DEBUG, "ping completed in %ld ms\n", duration_cast<std::chrono::milliseconds>(high_resolution_clock::now() - now).count());
+                    }
+                    if (device->isTimeout()) {
+                        mvLog(MVLOG_ERROR, "[%p] device, not respond, removing from watchdog\n", device->getHandle());
+                        // marking device as deleted, to prevent double resource free from wd_unregister_device
+                        *std::get<1>(*deviceIt) = true;
+                        deviceIt = watchedDevices.erase(deviceIt);
+                    }
+                    else {
+                        ++deviceIt;
+                    }
+                }
+                auto currentTime = high_resolution_clock::now();
+                auto minInterval = std::min_element(watchedDevices.begin(),
+                                                    watchedDevices.end(),
+                                                    [&currentTime] (const Devices::value_type & device1, const Devices::value_type & device2) {
+                                                        return std::get<0>(device1)->dueIn(currentTime).count()
+                                                            < std::get<0>(device2)->dueIn(currentTime).count();
+                                                    });
+                // if for some reason we have empty devices list but watchdog is active
+                if (minInterval == watchedDevices.end()) {
+                    mvLog(MVLOG_INFO, "no active devices to watch, stopping  Watchdog thread\n");
+                    threadRunning = false;
+                    break;
+                }
+                // TODO: no timer coalescing yet; it could minimize thread wake-ups
+                sleepInterval = std::get<0>(*minInterval)->dueIn(currentTime);
+                mvLog(MVLOG_DEBUG, "sleep interval = %ld ms\n", sleepInterval.count());
+
+                notificationReason = STATE_IDLE;
+
+                wakeUpPingThread.wait_until(__locker, currentTime + sleepInterval, [this, currentTime]() {
+                    mvLog(MVLOG_DEBUG,
+                          "waiting for %ld ms\n",
+                          duration_cast<std::chrono::milliseconds>(high_resolution_clock::now() - currentTime).count());
+                    return notificationReason != STATE_IDLE;
+                });
+
+                mvLog(MVLOG_DEBUG, "waiting completed in  %ld ms\n",
+                      duration_cast<std::chrono::milliseconds>(high_resolution_clock ::now() - currentTime).count());
+            } while (notificationReason != THREAD_EXITED);
+
+        } catch (const std::exception & ex) {
+            mvLog(MVLOG_ERROR, "error %s\n", ex.what());
+        } catch (...) {
+            mvLog(MVLOG_ERROR, "error\n");
+        }
+        mvLog(MVLOG_INFO, "thread ended\n");
+        threadRunning = false;
+    }
+};
+
+}  // namespace
+
+WD_API wd_error_t watchdog_init_context(wd_context *ctx) {
+    try {
+        mvLogLevelSet(MVLOG_ERROR);
+        mvLogDefaultLevelSet(MVLOG_ERROR);
+        if (!ctx) {
+            return WD_NOTINITIALIZED;
+        }
+        // opaque pointer initialized
+        if (ctx->opaque == WD_OPAQUE_MAGIC) {
+            mvLog(MVLOG_INFO, "watchdog context (%p) already initialized \n", ctx);
+        } else {
+            ctx->opaque = WD_OPAQUE_MAGIC;
+        }
+        return WD_ERRNO;
+    }  catch (...) {
+        mvLog(MVLOG_ERROR, "failed initialize watchdog context: %p\n", ctx);
+    }
+    return WD_FAIL;
+}
+
+WD_API wd_error_t watchdog_register_device(wd_context * ctx, devicePrivate_t *device) {
+    try {
+        if (!ctx) {
+            mvLog(MVLOG_ERROR, "watchdog context is null\n");
+            return WD_NOTINITIALIZED;
+        }
+        // opaque pointer initialized
+        if (ctx->opaque == nullptr) {
+            mvLog(MVLOG_ERROR, "watchdog context (%p) not initialized \n", ctx);
+            return WD_NOTINITIALIZED;
+        }
+        if (device && device->wd_interval <= 0) {
+            mvLog(MVLOG_ERROR, "watchdog interval should be > 0, but was (%d)\n", device->wd_interval);
+            return WD_NOTINITIALIZED;
+        }
+        // opaque pointer initialized
+        if (ctx->opaque != WD_OPAQUE_MAGIC) {
+            auto watchee = reinterpret_cast<wd_context_opaque*>(ctx->opaque);
+            // NOTE: the magic field is used to pass a preallocated watchee; since this function is only used by the plugin, this is not a backdoor
+            if (watchee->magic == WD_OPAQUE_MAGIC) {
+                // this may be an already registered context; since devices are wrapped in NoDueOnFirstCall, check for that wrapper
+                if (nullptr != dynamic_cast<NoDueOnFirstCall*>(watchee->actual)) {
+                    mvLog(MVLOG_ERROR, "watchdog context (%p) already registered within watchdog\n", ctx);
+                    return WD_DUPLICATE;
+                }
+
+                // transferring interval from context
+                if (device) {
+                    watchee->actual->setInterval(milliseconds(device->wd_interval));
+                }
+                ctx->opaque = WatchdogImpl::instance().register_device(
+                    shared_ptr<IDevice>(new NoDueOnFirstCall(shared_ptr<IDevice>(watchee->actual, [](IDevice*){}))));
+
+                if (ctx->opaque == nullptr) {
+                    mvLog(MVLOG_ERROR, "watchdog context (%p) not initialized \n", ctx);
+                } else {
+                    return WD_ERRNO;
+                }
+            }
+            mvLog(MVLOG_ERROR, "watchdog context (%p) not initialized \n", ctx);
+            return WD_NOTINITIALIZED;
+        }
+
+        if (device && device->wd_interval > 0) {
+            ctx->opaque = WatchdogImpl::instance().register_device(device);
+        } else {
+            ctx->opaque = nullptr;
+        }
+        return WD_ERRNO;
+    } catch (const std::exception & ex) {
+        mvLog(MVLOG_ERROR, "failed to register device: %s\n", ex.what());
+    } catch (...) {
+        mvLog(MVLOG_ERROR, "failed to register device context (%p)\n", ctx);
+    }
+    return WD_FAIL;
+}
+
+WD_API wd_error_t watchdog_unregister_device(wd_context *ctx) {
+    if (ctx == nullptr || ctx->opaque == nullptr) {
+        return WD_NOTINITIALIZED;
+    } else {
+        if (ctx->opaque != WD_OPAQUE_MAGIC) {
+            auto watchee = reinterpret_cast<wd_context_opaque*>(ctx->opaque);
+            // NOTE: the magic field is used to pass a preallocated watchee; since this function is only used by the plugin, this is not a backdoor
+            if (watchee->magic == WD_OPAQUE_MAGIC) {
+                if (!WatchdogImpl::instance().remove_device(ctx->opaque)) {
+                    mvLog(MVLOG_WARN, "cannot remove device\n");
+                    return WD_FAIL;
+                }
+            }
+        }
+    }
+
+    if (ctx != nullptr) {
+        // opaque pointer deleted
+        ctx->opaque = nullptr;
+    }
+
+    return WD_ERRNO;
+}
diff --git a/inference-engine/thirdparty/movidius/watchdog/watchdog.h b/inference-engine/thirdparty/movidius/watchdog/watchdog.h
new file mode 100644 (file)
index 0000000..026f409
--- /dev/null
@@ -0,0 +1,51 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifndef MVNC_WATCHDOG_H
+#define MVNC_WATCHDOG_H
+
+#include <mvnc.h>
+#ifdef __cplusplus
+# define WD_API  extern "C"
+# else
+# define WD_API
+#endif
+
+/**
+* @brief default ping interval is 1 second
+*/
+#define WATCHDOG_PING_INTERVAL_MS 1000
+
+typedef struct wd_context_tag {
+    void * opaque;
+} wd_context;
+
+typedef enum {
+    WD_ERRNO = 0,
+    WD_NOTINITIALIZED,
+    WD_DUPLICATE,
+    WD_FAIL
+} wd_error_t;
+
+/**
+ * @brief initializes the watchdog context; must be called before any other WD API call
+ * @return WD_ERRNO on success, or an error code on failure
+ */
+WD_API wd_error_t watchdog_init_context(wd_context *ctx);
+
+/**
+ * @brief creates the watchdog thread if it does not exist yet, registers a new watched device, and initializes an opaque handle to it
+ * @param d - newly connected device descriptor
+ * @return WD_ERRNO on success, or an error code on failure
+ */
+WD_API wd_error_t watchdog_register_device(wd_context *ctx, devicePrivate_t *d);
+
+/**
+ * @brief removes the device from the watchdog list and may stop the watchdog worker thread
+ * @return result of operation
+ */
+WD_API wd_error_t watchdog_unregister_device(wd_context *ctx);
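+
+/*
+ * Typical usage (illustrative sketch; `device` is a connected devicePrivate_t*
+ * with a positive wd_interval):
+ *
+ *     wd_context ctx;
+ *     if (watchdog_init_context(&ctx) == WD_ERRNO &&
+ *         watchdog_register_device(&ctx, device) == WD_ERRNO) {
+ *         // ... the watchdog thread now pings the device periodically ...
+ *         watchdog_unregister_device(&ctx);
+ *     }
+ */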
+
+
+#endif  // MVNC_WATCHDOG_H
diff --git a/inference-engine/thirdparty/movidius/watchdog/watchdogPrivate.hpp b/inference-engine/thirdparty/movidius/watchdog/watchdogPrivate.hpp
new file mode 100644 (file)
index 0000000..5c6acb2
--- /dev/null
@@ -0,0 +1,42 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <chrono>
+
+namespace Watchdog {
+
+/**
+ * @brief represents watchdog device interface to be registered within watchdog worker
+ */
+class IDevice {
+ public:
+    using time_point = std::chrono::high_resolution_clock::time_point;
+
+    virtual ~IDevice() = default;
+
+    /**
+     * @brief sets the ping interval; depending on the implementation, the device shouldn't use a longer interval than this
+     */
+    virtual void setInterval(const std::chrono::milliseconds msInterval) noexcept = 0;
+    /**
+     * @brief requests the device to keep alive, using the given timestamp
+     */
+    virtual void keepAlive(const time_point &current_time) noexcept = 0;
+    /**
+     * @brief time remaining until the device needs to be pinged again
+     */
+    virtual std::chrono::milliseconds dueIn(const time_point &current_time) const noexcept = 0;
+    /**
+     * @brief whether the device is hanging
+     */
+    virtual bool isTimeout() const noexcept = 0;
+    /**
+     * @brief gets an opaque handle that uniquely identifies the watchdog device, e.g. a USB connection identifier
+     */
+    virtual void *getHandle() const noexcept = 0;
+};
+
+}  // namespace Watchdog
diff --git a/inference-engine/tools/CMakeLists.txt b/inference-engine/tools/CMakeLists.txt
new file mode 100644 (file)
index 0000000..13ab365
--- /dev/null
@@ -0,0 +1,30 @@
+# Copyright (C) 2018-2019 Intel Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#      http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+####################################
+## to use C++11
+set (CMAKE_CXX_STANDARD 11)
+set (CMAKE_CXX_STANDARD_REQUIRED ON)
+if (NOT("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel"))
+    set (CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}")
+endif()
+####################################
+
+if (WIN32)
+    # add_compile_options("/WX")
+else()
+    add_compile_options("-Werror")
+endif()
+
+add_subdirectory(vpu)
index 6402705..e6cea53 100644 (file)
@@ -1,11 +1,12 @@
-# OpenVINOâ„¢ Calibration Tool
-Inference Engine Calibration Tool calibrates a given FP32 model so that you can run calibrated model in low-precision 8-bit integer mode while keeping the input data of this model in the original precision.
-Inference Engine Calibration Tool is a Python\* command-line tool, which imports Python types from the `openvino.tools.calibration` package.
+# Python* Calibration Tool
 
-Please, refer to https://docs.openvinotoolkit.org for details.
+The Python* Calibration Tool calibrates a given FP32 model so that you can run calibrated model in low-precision 8-bit integer mode while keeping the input data of this model in the original precision.
+The Calibration Tool is a Python\* command-line tool, which imports Python types from the `openvino.tools.calibration` package.
+
+> **NOTE**: INT8 models are currently supported only by the CPU plugin. For the full list of supported configurations, see the [Supported Devices](./docs/IE_DG/supported_plugins/Supported_Devices.md) topic.
 
 ## Hardware requirements
-Hardware requirements depend on a model. Typically for public models RAM memory size has to be not less then 16Gb, drive has to have not less then 30 Gb free space independently on operation system. Temporary directory is used to cache layers output during calibration.
+Hardware requirements depend on a model. Typically, for public models, at least 16 GB of RAM and at least 30 GB of free drive space are required, regardless of the operating system. A temporary directory is used to cache layer outputs during calibration.
 
 ## Usage
 The Calibration Tool is configured in the same way as the Accuracy Checker. You can also use additional command-line arguments to define calibration-specific parameters.
@@ -17,13 +18,13 @@ The Calibration Tool is configured in the same way as the Accuracy Checker. You
 | -d, --definitions                            | string | Optional. Path to the YML file with definitions         |
 | -m, --models                                 | string | Optional. Prefix path to the models and weights         |
 | -s, --source                                 | string | Optional. Prefix path to the data source                |
-| -a, --annotations                            | string | Optional. Pefix path to the converted annotations and datasets meta data |
+| -a, --annotations                            | string | Optional. Prefix path to the converted annotations and datasets meta data |
 | -e, --extensions                             | string | Optional. Prefix path to extensions folder              |
-| --cpu_extensions_mode, --cpu-extensions-mode | string | Optional. specified preferable set of processor instruction for automatic searching cpu extension lib: `avx2` or `sse4` |
+| --cpu_extensions_mode, --cpu-extensions-mode | string | Optional. Preferred set of processor instructions for automatically searching the CPU extension library: `avx2` or `sse4` |
 | -C, --converted_models, --converted-models   | string | Optional. Directory to store Model Optimizer converted models. Used for DLSDK launcher only |
-| -M, --model_optimizer, --model-optimizer     | string | Optional. Path to model optimizer caffe directory       |
-| --tf_custom_op_config_dir, --tf-custom-op-config-dir | string | Optional. Path to directory with tensorflow custom operation configuration files for model optimizer |
-| --tf_obj_detection_api_pipeline_config_path, --tf-obj-detection-api-pipeline-config-path | string | Optional. Path to directory with tensorflow object detection api pipeline configuration files for model optimizer |
+| -M, --model_optimizer, --model-optimizer     | string | Optional. Path to model optimizer Caffe* directory       |
+| --tf_custom_op_config_dir, --tf-custom-op-config-dir | string | Optional. Path to directory with TensorFlow* custom operation configuration files for model optimizer |
+| --tf_obj_detection_api_pipeline_config_path, --tf-obj-detection-api-pipeline-config-path | string | Optional. Path to directory with TensorFlow object detection API pipeline configuration files for the Model Optimizer |
 | --progress                                   | string | Optional. Progress reporter: `bar`, `print` or `None`   |
 | -td, --target_devices, --target-devices      | string | Optional. Space-separated list of devices for infer     |
 | -tt, --target_tags, --target-tags | string   | Optional. Space-separated list of launcher tags for infer        |
@@ -38,15 +39,15 @@ The Calibration Tool is configured in the same way as the Accuracy Checker. You
 | --ignore_layer_names_path, --ignore-layer-names-path | string | Optional. Ignore layer names file path |
 | --batch_size, --batch-size        | integer| Optional. Batch size value. If not specified, the batch size value is determined from IR |
 | -th, --threshold                  | float | Optional. Accuracy drop of quantized model should not exceed this threshold. Should be specified in percent without the percent sign. (1% is default) |
-| -ic, --benchmark_iterations_count, --benchmark-iterations-count | integer | Optional. Benchmark itertations count. (1000 is default) |
+| -ic, --benchmark_iterations_count, --benchmark-iterations-count | integer | Optional. Benchmark iterations count (1000 is default). |
 | -mn, --metric_name, --metric-name | string | Optional. Metric name used during calibration |
 | -mt, --metric_type, --metric-type | string | Optional. Metric type used during calibration |
 | -o, --output_dir, --output-dir    | string | Optional. Directory to store converted models. Original model directory is used if not defined |
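 
 For example, a calibration run might be launched as follows (an illustrative sketch; the `calibrate.py` entry point and all paths are placeholders for your setup):
 ```sh
 python calibrate.py --config ~/inception_v1.yml -d ~/defenitions.yml -M /home/user/intel/openvino/deployment_tools/model_optimizer --models ~/models --source /media/user/calibration/datasets --annotations ~/annotations
 ```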
 
-## Model calibration flow
+## Model Calibration Flow
 
 ### Introduction
-Calibration tool read original FP32 model, calibration dataset and create low precision model. Low precision model has two differences from original model:
+The Calibration Tool reads the original FP32 model and a calibration dataset and creates a low-precision model. The low-precision model differs from the original model in two ways:
 1. Per-channel statistics are defined. Statistics have minimum and maximum values for each layer and each channel. Model statistics are stored in the Inference Engine intermediate representation (IR) file in XML format.
 2. `quantization_level` layer attribute is defined. The attribute defines the precision that is used during inference.
 
@@ -70,7 +71,7 @@ There are steps to calibrate and evaluate result model:
 
 An additional optional step before calibration allows you to roughly estimate the possible INT8 performance.
 
-### Step #1. Convert data annotation files
+### Step #1. Convert Data Annotation Files
 Calibration dataset is a subset of the training dataset. Use the Convert Annotation Tool to convert the ImageNet\* dataset to data annotation files readable by the Calibration Tool. Data annotation files describe the subset of images that are used during calibration. Command line:
 ```sh
 python convert_annotation.py imagenet --annotation_file /datasets/ImageNet/val.txt --labels_file /datasets/ImageNet/synset_words.txt -ss 2000 -o ~/annotations -a imagenet.pickle -m imagenet.json
@@ -90,8 +91,9 @@ python convert_annotation.py imagenet --annotation_file /datasets/ImageNet/val.t
 | --converted_models | string | Directory to store Model Optimizer converted models. Used for DLSDK launcher only |
 
 
-### Optional step for low precision model performance estimation.
-Before calibration you can roughly estimate low presition performance with [Collect Statistics Tool](./inference-engine/tools/collect_statistics_tool/README.md).
+### Optional Step for Low Precision Model Performance Estimation
+
+Before calibration, you can roughly estimate low precision performance with [Collect Statistics Tool](./inference-engine/tools/collect_statistics_tool/README.md).
 
 [Collect Statistics Tool](./inference-engine/tools/collect_statistics_tool/README.md) ignores the metric section of the YML configuration file, but you can use the same command-line arguments.
 
@@ -101,7 +103,7 @@ Command line:
 python collect_statistics.py --config ~/inception_v1.yml -d ~/defenitions.yml -M /home/user/intel/openvino/deployment_tools/model_optimizer --models ~/models --source /media/user/calibration/datasets --annotations ~/annotations --converted_models ~/models
 ```
 
-Result model has statistics which allow to infer this model in INT8 precision. To measure performance you can use [Benchmark Tool](./inference-engine/tools/benchmark_tool/README.md).
+The resulting model has statistics which allow you to infer this model in INT8 precision. To measure performance, you can use the [Benchmark Tool](./inference-engine/tools/benchmark_tool/README.md).
 
 ### Step #2. Calibration
 During the calibration process, the model is adjusted for efficient quantization and minimal accuracy drop on the calibration dataset. The Calibration Tool produces a calibrated model which will be executed in low-precision 8-bit quantized mode after loading into the CPU plugin.
diff --git a/inference-engine/tools/vpu/CMakeLists.txt b/inference-engine/tools/vpu/CMakeLists.txt
new file mode 100644 (file)
index 0000000..a3b3333
--- /dev/null
@@ -0,0 +1,21 @@
+# Copyright (c) 2018-2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(ENABLE_MYRIAD)
+    add_subdirectory(vpu_profile)
+    add_subdirectory(vpu_compile)
+endif()
diff --git a/inference-engine/tools/vpu/common/vpu_tools_common.cpp b/inference-engine/tools/vpu/common/vpu_tools_common.cpp
new file mode 100644 (file)
index 0000000..d845183
--- /dev/null
@@ -0,0 +1,319 @@
+//
+// Copyright (C) 2018-2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+/* On Windows, min and max are already defined as macros, which makes using numeric_limits impossible */
+#if defined(WIN32)
+#define NOMINMAX
+#endif
+
+#include <sys/stat.h>
+#include <w_dirent.h>
+
+#include <algorithm>
+#include <map>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include <string>
+#include <limits>
+
+#include "vpu_tools_common.hpp"
+#include "vpu/utils/string.hpp"
+#include "samples/common.hpp"
+
+#include "precision_utils.h"
+
+InferenceEngine::CNNNetwork readNetwork(const std::string &xmlFileName) {
+    std::string binFileName = fileNameNoExt(xmlFileName) + ".bin";
+
+    InferenceEngine::CNNNetReader reader;
+    reader.ReadNetwork(xmlFileName);
+    reader.ReadWeights(binFileName);
+
+    return reader.getNetwork();
+}
+
+InferenceEngine::InferencePlugin loadPlugin(const std::string &plugin, const std::string &plugin_path) {
+    /* Unfortunately, there is no check on invalid device inside IE API */
+    return InferenceEngine::PluginDispatcher({plugin_path}).getPluginByDevice(plugin);
+}
+
+void setPrecisions(const InferenceEngine::CNNNetwork &network) {
+    for (auto &&layer : network.getInputsInfo()) {
+        layer.second->setPrecision(InferenceEngine::Precision::FP16);
+    }
+
+    for (auto &&layer : network.getOutputsInfo()) {
+        layer.second->setPrecision(InferenceEngine::Precision::FP16);
+    }
+}
+
+std::map<std::string, std::string> parseConfig(const std::string &configName, char comment) {
+    std::map<std::string, std::string> config = {};
+
+    std::ifstream file(configName);
+    if (!file.is_open()) {
+        return config;
+    }
+
+    std::string key, value;
+    while (file >> key >> value) {
+        if (key.empty() || key[0] == comment) {
+            continue;
+        }
+        config[key] = value;
+    }
+
+    return config;
+}
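+
+/*
+ * Example of a file accepted by parseConfig (illustrative; keys are placeholders,
+ * assuming '#' is passed as the comment character). Each entry is a
+ * whitespace-separated key/value pair:
+ *
+ *     # comment
+ *     SOME_KEY    some_value
+ *     OTHER_KEY   42
+ */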
+
+BitMap::BitMap(const std::string &filename) {
+    BmpHeader header;
+    BmpInfoHeader infoHeader;
+
+    std::ifstream input(filename, std::ios::binary);
+    if (!input) {
+        return;
+    }
+
+    input.read(reinterpret_cast<char *>(&header.type), 2);
+
+    if (header.type != 'M'*256+'B') {
+        std::cerr << "[BMP] file is not bmp type\n";
+        return;
+    }
+
+    input.read(reinterpret_cast<char *>(&header.size), 4);
+    input.read(reinterpret_cast<char *>(&header.reserved), 4);
+    input.read(reinterpret_cast<char *>(&header.offset), 4);
+
+    input.read(reinterpret_cast<char *>(&infoHeader), sizeof(BmpInfoHeader));
+
+    bool rowsReversed = infoHeader.height < 0;
+    _width  = static_cast<std::size_t>(infoHeader.width);
+    _height = static_cast<std::size_t>(std::abs(infoHeader.height));
+
+    if (infoHeader.bits != 24) {
+        std::cerr << "[BMP] 24bpp only supported. But input has:" << infoHeader.bits << "\n";
+        return;
+    }
+
+    if (infoHeader.compression != 0) {
+        std::cerr << "[BMP] compression not supported\n";
+    }
+
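+    // BMP rows are padded to 4-byte boundaries; with 3 bytes per pixel the pad size works out to width % 4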
+    auto padSize = _width & 3;
+    char pad[3];
+    size_t size = _width * _height * 3;
+
+    _data.reset(new unsigned char[size], std::default_delete<unsigned char[]>());
+
+    input.seekg(header.offset, std::ios::beg);
+
+    // read row by row, inverting vertically unless rows are already stored top-down
+    for (uint32_t i = 0; i < _height; i++) {
+        uint32_t storeAt = rowsReversed ? i : (uint32_t)_height - 1 - i;
+        input.read(reinterpret_cast<char *>(_data.get()) + _width * 3 * storeAt, _width * 3);
+        input.read(pad, padSize);
+    }
+}
+
+void loadImage(const std::string &imageFilename, InferenceEngine::Blob::Ptr &blob) {
+    InferenceEngine::TensorDesc tensDesc = blob->getTensorDesc();
+    if (tensDesc.getPrecision() != InferenceEngine::Precision::FP16) {
+        throw std::invalid_argument("Input must have FP16 precision");
+    }
+
+    BitMap reader(imageFilename);
+
+    size_t batch = blob->dims()[3];
+    size_t w = blob->dims()[0];
+    size_t h = blob->dims()[1];
+    size_t img_w = reader.width();
+    size_t img_h = reader.height();
+
+    auto numBlobChannels = blob->dims()[2];
+    size_t numImageChannels = reader.size() / (reader.width() * reader.height());
+    if (numBlobChannels != numImageChannels && numBlobChannels != 1) {
+        throw std::invalid_argument("Input channels mismatch: image channels " + std::to_string(numImageChannels) +
+                                    ", network channels " + std::to_string(numBlobChannels) +
+                                    ", expecting count of image channels are equal to count if network channels"
+                                    "or count of network channels are equal to 1");
+    }
+
+    int16_t *blobDataPtr = std::dynamic_pointer_cast<InferenceEngine::TBlob<int16_t>>(blob)->data();
+    auto nPixels = w * h;
+    unsigned char *RGB8 = reader.getData().get();
+    float xscale = 1.0f * img_w / w;
+    float yscale = 1.0f * img_h / h;
+
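+    // Nearest-neighbor resample from the image resolution to the network input size, converting each value to FP16.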
+    for (std::size_t n = 0; n != batch; n++) {
+        for (std::size_t i = 0; i < h; ++i) {
+            int y = static_cast<int>(std::floor((i + 0.5f) * yscale));
+            for (std::size_t j = 0; j < w; ++j) {
+                int x = static_cast<int>(std::floor((j + 0.5f) * xscale));
+                for (std::size_t k = 0; k < numBlobChannels; k++) {
+                    float src = 1.0f * RGB8[(y * img_w + x) * numImageChannels + k];
+                    if (tensDesc.getLayout() == InferenceEngine::NHWC) {
+                        blobDataPtr[n * h * w * numBlobChannels + (i * w + j) * numBlobChannels + k] =
+                            InferenceEngine::PrecisionUtils::f32tof16(src);
+                    } else {
+                        blobDataPtr[n * h * w * numBlobChannels + (i * w + j) + k * nPixels] =
+                            InferenceEngine::PrecisionUtils::f32tof16(src);
+                    }
+                }
+            }
+        }
+    }
+}
+
+void printPerformanceCounts(const std::map<std::string, InferenceEngine::InferenceEngineProfileInfo>& perfMap) {
+    std::vector<std::pair<std::string, InferenceEngine::InferenceEngineProfileInfo>> perfVec(perfMap.begin(),
+                                                                                             perfMap.end());
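+    // Report stages in the order they were executed.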
+    std::sort(perfVec.begin(), perfVec.end(),
+        [=](const std::pair<std::string, InferenceEngine::InferenceEngineProfileInfo> &pair1,
+          const std::pair<std::string, InferenceEngine::InferenceEngineProfileInfo> &pair2) -> bool {
+          return pair1.second.execution_index < pair2.second.execution_index;
+        });
+
+    size_t maxLayerName = 0u, maxExecType = 0u;
+    for (auto &&entry : perfVec) {
+        maxLayerName = std::max(maxLayerName, entry.first.length());
+        maxExecType = std::max(maxExecType, std::strlen(entry.second.exec_type));
+    }
+
+    size_t indexWidth = 7, nameWidth = maxLayerName + 5, typeWidth = maxExecType + 5, timeWidth = 10;
+    size_t totalWidth = indexWidth + nameWidth + typeWidth + timeWidth;
+
+    std::cout << std::endl << "Detailed Per Stage Profile" << std::endl;
+    for (size_t i = 0; i < totalWidth; i++)
+        std::cout << "=";
+    std::cout << std::endl;
+    std::cout << std::setw(static_cast<int>(indexWidth)) << std::left << "Index"
+              << std::setw(static_cast<int>(nameWidth)) << std::left << "Name"
+              << std::setw(static_cast<int>(typeWidth)) << std::left << "Type"
+              << std::setw(static_cast<int>(timeWidth)) << std::right << "Time (ms)"
+              << std::endl;
+
+    for (size_t i = 0; i < totalWidth; i++)
+        std::cout << "-";
+    std::cout << std::endl;
+
+    long long totalTime = 0;
+    for (const auto& p : perfVec) {
+        const auto& stageName = p.first;
+        const auto& info = p.second;
+        if (info.status == InferenceEngine::InferenceEngineProfileInfo::EXECUTED) {
+            std::cout << std::setw(static_cast<int>(indexWidth)) << std::left << info.execution_index
+                      << std::setw(static_cast<int>(nameWidth))  << std::left << stageName
+                      << std::setw(static_cast<int>(typeWidth))  << std::left << info.exec_type
+                      << std::setw(static_cast<int>(timeWidth))  << std::right << info.realTime_uSec / 1000.0
+                      << std::endl;
+
+            totalTime += info.realTime_uSec;
+        }
+    }
+
+    for (std::size_t i = 0; i < totalWidth; i++)
+        std::cout << "-";
+    std::cout << std::endl;
+    std::cout << std::setw(static_cast<int>(totalWidth / 2)) << std::right << "Total inference time:"
+              << std::setw(static_cast<int>(totalWidth / 2 + 1)) << std::right << totalTime / 1000.0
+              << std::endl;
+    for (std::size_t i = 0; i < totalWidth; i++)
+        std::cout << "-";
+    std::cout << std::endl;
+}
+
+std::vector<std::string> extractFilesByExtension(const std::string& directory, const std::string& extension) {
+    return extractFilesByExtension(directory, extension, std::numeric_limits<std::size_t>::max());
+}
+
+std::vector<std::string> extractFilesByExtension(const std::string& directory, const std::string& extension,
+                                                 std::size_t max_size) {
+    if (max_size == 0) {
+        return {};
+    }
+
+    std::vector<std::string> files;
+
+    DIR* dir = opendir(directory.c_str());
+    if (!dir) {
+        throw std::invalid_argument("Can not open " + directory);
+    }
+
+    auto getExtension = [](const std::string& name) {
+        auto extensionPosition = name.rfind('.', name.size());
+        return extensionPosition == std::string::npos ? "" : name.substr(extensionPosition + 1, name.size() - 1);
+    };
+
+    dirent* ent = nullptr;
+    while ((ent = readdir(dir)) && files.size() < max_size) {
+        std::string file_name = ent->d_name;
+        if (getExtension(file_name) != extension) {
+            continue;
+        }
+
+        std::string full_file_name = directory + "/" + file_name;
+
+        struct stat st = {};
+        if (stat(full_file_name.c_str(), &st) != 0) {
+            continue;
+        }
+
+        bool is_directory = (st.st_mode & S_IFDIR) != 0;
+        if (is_directory) {
+            continue;
+        }
+
+        files.emplace_back(full_file_name);
+    }
+
+    closedir(dir);
+
+    return files;
+}
+
+void loadBinaryTensor(const std::string &binaryFileName, InferenceEngine::Blob::Ptr& blob) {
+    InferenceEngine::TensorDesc tensDesc = blob->getTensorDesc();
+    if (tensDesc.getPrecision() != InferenceEngine::Precision::FP16) {
+        throw std::invalid_argument("Input must have FP16 precision");
+    }
+
+    std::ifstream binaryFile(binaryFileName, std::ios_base::binary | std::ios_base::ate);
+    if (!binaryFile) {
+        throw std::invalid_argument("Can not open \"" + binaryFileName + "\"");
+    }
+
+    auto fileSize = static_cast<std::size_t>(binaryFile.tellg());
+    binaryFile.seekg(0, std::ios_base::beg);
+    if (!binaryFile.good()) {
+        throw std::invalid_argument("Can not read \"" + binaryFileName + "\"");
+    }
+
+    auto expected_size = blob->size();
+    if (fileSize != 4 * expected_size) {
+        throw std::invalid_argument("File \"" + binaryFileName + "\" contains " + std::to_string(fileSize) + " bytes "
+                                    "but network expects " + std::to_string(expected_size));
+    }
+    /* Read the file as 32-bit floats and convert each value to FP16 */
+    std::int16_t *blobDataPtr = std::dynamic_pointer_cast<InferenceEngine::TBlob<std::int16_t>>(blob)->data();
+    for (std::size_t i = 0; i < blob->size(); i++) {
+        float tmp = 0.f;
+        binaryFile.read(reinterpret_cast<char *>(&tmp), sizeof(float));
+        blobDataPtr[i] = InferenceEngine::PrecisionUtils::f32tof16(tmp);
+    }
+}
diff --git a/inference-engine/tools/vpu/common/vpu_tools_common.hpp b/inference-engine/tools/vpu/common/vpu_tools_common.hpp
new file mode 100644 (file)
index 0000000..b9befe8
--- /dev/null
@@ -0,0 +1,83 @@
+//
+// Copyright (C) 2018-2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#pragma once
+
+#include <string>
+#include <map>
+#include <memory>
+#include <vector>
+
+#include "inference_engine.hpp"
+
+InferenceEngine::CNNNetwork readNetwork(const std::string &xmlFileName);
+
+InferenceEngine::InferencePlugin loadPlugin(const std::string &plugin, const std::string &plugin_path);
+
+/* Set all precisions to FP16 */
+void setPrecisions(const InferenceEngine::CNNNetwork &network);
+
+std::map<std::string, std::string> parseConfig(const std::string &configName, char comment = '#');
+
+class BitMap {
+private:
+    typedef struct {
+        unsigned short type;                /* Magic identifier            */
+        unsigned int size;                  /* File size in bytes          */
+        unsigned int reserved;
+        unsigned int offset;                /* Offset to image data, bytes */
+    } BmpHeader;
+
+    typedef struct {
+        unsigned int size;                  /* Header size in bytes      */
+        int width, height;                  /* Width and height of image */
+        unsigned short planes;              /* Number of colour planes   */
+        unsigned short bits;                /* Bits per pixel            */
+        unsigned int compression;           /* Compression type          */
+        unsigned int imagesize;             /* Image size in bytes       */
+        int xresolution, yresolution;       /* Pixels per meter          */
+        unsigned int ncolours;              /* Number of colours         */
+        unsigned int importantcolours;      /* Important colours         */
+    } BmpInfoHeader;
+
+public:
+    explicit BitMap(const std::string &filename);
+
+    ~BitMap() = default;
+
+    size_t _height = 0;
+    size_t _width = 0;
+    std::shared_ptr<unsigned char> _data;
+
+    size_t size() const { return _width * _height * 3; }
+    size_t width() const { return _width; }
+    size_t height() const { return _height; }
+
+    std::shared_ptr<unsigned char> getData() {
+        return _data;
+    }
+};
+
+void loadImage(const std::string &imageFilename, InferenceEngine::Blob::Ptr &blob);
+
+void printPerformanceCounts(const std::map<std::string, InferenceEngine::InferenceEngineProfileInfo>& perfMap);
+
+std::vector<std::string> extractFilesByExtension(const std::string& directory, const std::string& extension);
+std::vector<std::string> extractFilesByExtension(const std::string& directory, const std::string& extension,
+                                                 std::size_t max_size);
+
+void loadBinaryTensor(const std::string &binaryFileName, InferenceEngine::Blob::Ptr& blob);
diff --git a/inference-engine/tools/vpu/vpu_compile/CMakeLists.txt b/inference-engine/tools/vpu/vpu_compile/CMakeLists.txt
new file mode 100644 (file)
index 0000000..94830ba
--- /dev/null
@@ -0,0 +1,53 @@
+# Copyright (C) 2018-2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+find_package(Threads REQUIRED)
+
+set(TARGET_NAME myriad_compile)
+
+file(GLOB SRCS
+    ${CMAKE_SOURCE_DIR}/tools/vpu/common/*.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+)
+
+add_executable(${TARGET_NAME} ${SRCS})
+
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    target_compile_options(${TARGET_NAME} PRIVATE
+        "-Wall"
+    )
+endif()
+
+target_include_directories(${TARGET_NAME} SYSTEM PRIVATE
+    ${CMAKE_SOURCE_DIR}/samples/common
+    ${CMAKE_SOURCE_DIR}/include
+    ${CMAKE_SOURCE_DIR}/tools/vpu/common
+    ${CMAKE_SOURCE_DIR}/src/inference_engine
+)
+
+target_link_libraries(${TARGET_NAME} PRIVATE
+    ${CMAKE_DL_LIBS}
+    Threads::Threads
+    inference_engine vpu_graph_transformer
+    gflags
+)
+
+add_dependencies(${TARGET_NAME} myriadPlugin vpu_copy_firmware)
+
+set_target_properties(${TARGET_NAME} PROPERTIES
+    COMPILE_PDB_NAME
+    ${TARGET_NAME}
+)
+
+add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
diff --git a/inference-engine/tools/vpu/vpu_compile/README.md b/inference-engine/tools/vpu/vpu_compile/README.md
new file mode 100644 (file)
index 0000000..4964430
--- /dev/null
@@ -0,0 +1,79 @@
+# myriad_compile tool
+
+This topic demonstrates how to run the `myriad_compile` tool, which compiles a network and dumps the resulting blob for the `vpu` plugins of the Inference Engine according to the given configuration options.
+
+## How It Works
+
+Upon start-up, the tool reads the command-line parameters and loads a network into the Inference Engine plugin.
+The application then exports the compiled blob and writes it to the output file.
+
+## Running
+
+Running the application with the <code>-h</code> option yields the following usage message:
+
+```sh
+./myriad_compile -h
+Inference Engine:
+        API version ............ <version>
+        Build .................. <build>
+
+myriad_compile [OPTIONS]
+[OPTIONS]:
+    -h                                       Optional. Print a usage message.
+    -m                           <value>     Required. Path to xml model.
+    -pp                          <value>     Optional. Path to a plugin folder.
+    -o                           <value>     Optional. Path to the output file. Default value: "<model_xml_file>.blob".
+    -c                           <value>     Optional. Path to the configuration file. Default value: "config".
+    -ip                          <value>     Optional. Specifies precision for all input layers of network. Supported values: FP32, FP16, U8. Default value: FP16.
+    -op                          <value>     Optional. Specifies precision for all output layers of network. Supported values: FP32, FP16, U8. Default value: FP16.
+    -iop                        "<value>"    Optional. Specifies precision for input/output layers by name.
+                                             By default all inputs and outputs have FP16 precision.
+                                             Available precisions: FP32, FP16, U8.
+                                             Example: -iop "input:FP16, output:FP16".
+                                             Notice that quotes are required.
+    -VPU_PLATFORM                <value>     Optional. Specifies movidius platform. Supported values: VPU_2450, VPU_2480. Overwrites value from config.
+    -VPU_NUMBER_OF_SHAVES        <value>     Optional. Specifies number of shaves. Should be set with "VPU_NUMBER_OF_CMX_SLICES". Overwrites value from config.
+    -VPU_NUMBER_OF_CMX_SLICES    <value>     Optional. Specifies number of CMX slices. Should be set with "VPU_NUMBER_OF_SHAVES". Overwrites value from config.
+```
+
+Running the application with an empty list of options yields an error message.
+
+You can use the following command to dump a blob using a trained Faster R-CNN network:
+
+```sh
+./myriad_compile -m <path_to_model>/model_name.xml
+```
+## Platform option
+You can dump a blob without a connected Myriad device.
+To do that, specify the Movidius platform type using the `-VPU_PLATFORM` parameter.
+Supported values: VPU_2450, VPU_2480.
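+
+For example, the following command compiles a blob for the VPU_2480 platform without a device attached (a usage sketch based on the options listed above; the model path is a placeholder):
+
+```sh
+./myriad_compile -m <path_to_model>/model_name.xml -VPU_PLATFORM VPU_2480
+```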
+
+## Import and Export functionality
+#### Export
+You can save a blob file from your application.
+To do this, call the `Export()` method on the `ExecutableNetwork` object.
+`Export()` has the following argument:
+* Name of output blob [IN]
+
+Example:
+```cpp
+InferenceEngine::ExecutableNetwork executableNetwork = plugin.LoadNetwork(network, {});
+executableNetwork.Export("model_name.blob");
+```
+
+#### Import
+You can import a blob with a network into your application.
+To do this, call the `ImportNetwork()` method on the `InferencePlugin` object.
+`ImportNetwork()` has the following arguments:
+* ExecutableNetwork [OUT]
+* Path to blob [IN]
+* Config options [IN]
+
+Example:
+```cpp
+std::string modelFilename("model_name.blob");
+InferenceEngine::IExecutableNetwork::Ptr importedNetworkPtr;
+pluginPtr->ImportNetwork(importedNetworkPtr, modelFilename, {});
+```
+
+> **NOTE**: Models should be first converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](https://software.intel.com/en-us/articles/OpenVINO-ModelOptimizer).
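+
+For reference, a conversion command typically looks like the following (a sketch only: the exact `mo.py` location and applicable flags depend on your OpenVINO installation and source framework; the model path is a placeholder):
+
+```sh
+python3 mo.py --input_model <path_to_model>/model_name.onnx --data_type FP16 --output_dir <output_dir>
+```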
diff --git a/inference-engine/tools/vpu/vpu_compile/main.cpp b/inference-engine/tools/vpu/vpu_compile/main.cpp
new file mode 100644 (file)
index 0000000..aadd3cf
--- /dev/null
@@ -0,0 +1,260 @@
+//
+// Copyright (C) 2018-2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include <cstdlib>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <algorithm>
+#include <unordered_map>
+#include <map>
+#include <vector>
+#include <string>
+
+#include <gflags/gflags.h>
+
+#include "inference_engine.hpp"
+#include <vpu/private_plugin_config.hpp>
+#include "samples/common.hpp"
+#include "vpu/utils/string.hpp"
+
+#include "vpu_tools_common.hpp"
+
+static constexpr char help_message[] = "Optional. Print a usage message.";
+static constexpr char model_message[] = "Required. Path to xml model.";
+static constexpr char plugin_path_message[] = "Optional. Path to a plugin folder.";
+static constexpr char output_message[] = "Optional. Path to the output file. Default value: \"<model_xml_file>.blob\".";
+static constexpr char config_message[] = "Optional. Path to the configuration file. Default value: \"config\".";
+static constexpr char platform_message[] = "Optional. Specifies movidius platform."
+                                           " Supported values: VPU_2450, VPU_2480."
+                                           " Overwrites value from config.";
+static constexpr char number_of_shaves_message[] = "Optional. Specifies number of shaves."
+                                                   " Should be set with \"VPU_NUMBER_OF_CMX_SLICES\"."
+                                                   " Overwrites value from config.";
+static constexpr char number_of_cmx_slices_message[] = "Optional. Specifies number of CMX slices."
+                                                       " Should be set with \"VPU_NUMBER_OF_SHAVES\"."
+                                                       " Overwrites value from config.";
+static constexpr char inputs_precision_message[] = "Optional. Specifies precision for all input layers of network."
+                                                   " Supported values: FP32, FP16, U8. Default value: FP16.";
+static constexpr char outputs_precision_message[] = "Optional. Specifies precision for all output layers of network."
+                                                    " Supported values: FP32, FP16, U8. Default value: FP16.";
+static constexpr char iop_message[] = "Optional. Specifies precision for input/output layers by name.\n"
+"                                             By default all inputs and outputs have FP16 precision.\n"
+"                                             Available precisions: FP32, FP16, U8.\n"
+"                                             Example: -iop \"input:FP16, output:FP16\".\n"
+"                                             Notice that quotes are required.\n"
+"                                             Overwrites precision from ip and op options for specified layers.";
+
+DEFINE_bool(h, false, help_message);
+DEFINE_string(m, "", help_message);
+DEFINE_string(pp, "", plugin_path_message);
+DEFINE_string(o, "", output_message);
+DEFINE_string(c, "config", config_message);
+DEFINE_string(ip, "", inputs_precision_message);
+DEFINE_string(op, "", outputs_precision_message);
+DEFINE_string(iop, "", iop_message);
+DEFINE_string(VPU_PLATFORM, "", platform_message);
+DEFINE_string(VPU_NUMBER_OF_SHAVES, "", number_of_shaves_message);
+DEFINE_string(VPU_NUMBER_OF_CMX_SLICES, "", number_of_cmx_slices_message);
+
+static void showUsage() {
+    std::cout << std::endl;
+    std::cout << "myriad_compile [OPTIONS]" << std::endl;
+    std::cout << "[OPTIONS]:" << std::endl;
+    std::cout << "    -h                                       " << help_message << std::endl;
+    std::cout << "    -m                           <value>     " << model_message << std::endl;
+    std::cout << "    -pp                          <value>     " << plugin_path_message << std::endl;
+    std::cout << "    -o                           <value>     " << output_message << std::endl;
+    std::cout << "    -c                           <value>     " << config_message << std::endl;
+    std::cout << "    -ip                          <value>     " << inputs_precision_message << std::endl;
+    std::cout << "    -op                          <value>     " << outputs_precision_message << std::endl;
+    std::cout << "    -iop                        \"<value>\"    " << iop_message << std::endl;
+    std::cout << "    -VPU_PLATFORM                <value>     " << platform_message << std::endl;
+    std::cout << "    -VPU_NUMBER_OF_SHAVES        <value>     " << number_of_shaves_message << std::endl;
+    std::cout << "    -VPU_NUMBER_OF_CMX_SLICES    <value>     " << number_of_cmx_slices_message << std::endl;
+    std::cout << std::endl;
+}
+
+static bool parseCommandLine(int *argc, char ***argv) {
+    gflags::ParseCommandLineNonHelpFlags(argc, argv, true);
+
+    if (FLAGS_h) {
+        showUsage();
+        return false;
+    }
+
+    if (FLAGS_m.empty()) {
+        throw std::invalid_argument("Path to model xml file is required");
+    }
+
+    if (1 < *argc) {
+        std::stringstream message;
+        message << "Unknown arguments: ";
+        for (auto arg = 1; arg < *argc; arg++) {
+            message << (*argv)[arg];
+            if (arg + 1 < *argc) {
+                message << " ";
+            }
+        }
+        throw std::invalid_argument(message.str());
+    }
+
+    return true;
+}
+
+static std::map<std::string, std::string> configure(const std::string &configFile, const std::string &xmlFileName) {
+    auto config = parseConfig(configFile);
+
+    if (!FLAGS_VPU_PLATFORM.empty()) {
+        config[VPU_CONFIG_KEY(PLATFORM)] = FLAGS_VPU_PLATFORM;
+    }
+
+    if (!FLAGS_VPU_NUMBER_OF_SHAVES.empty()) {
+        config[VPU_CONFIG_KEY(NUMBER_OF_SHAVES)] = FLAGS_VPU_NUMBER_OF_SHAVES;
+    }
+
+    if (!FLAGS_VPU_NUMBER_OF_CMX_SLICES.empty()) {
+        config[VPU_CONFIG_KEY(NUMBER_OF_CMX_SLICES)] = FLAGS_VPU_NUMBER_OF_CMX_SLICES;
+    }
+
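+    // If a per-model configuration file (<model>.conf.xml) exists next to the IR, forward it to the plugin.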
+    auto modelConfigFile = fileNameNoExt(xmlFileName) + ".conf.xml";
+    {
+        std::ifstream file(modelConfigFile);
+        if (!file.is_open()) {
+            modelConfigFile.clear();
+        }
+    }
+
+    if (!modelConfigFile.empty()) {
+        config[VPU_CONFIG_KEY(NETWORK_CONFIG)] = "file=" + modelConfigFile;
+    }
+
+    return config;
+}
+
+static std::map<std::string, std::string> parsePrecisions(const std::string &iop) {
+    std::string user_input = iop;
+    user_input.erase(std::remove_if(user_input.begin(), user_input.end(), ::isspace), user_input.end());
+
+    std::vector<std::string> inputs;
+    vpu::splitStringList(user_input, inputs, ',');
+
+    std::map<std::string, std::string> precisions;
+    for (auto &&input : inputs) {
+        std::vector<std::string> precision;
+        vpu::splitStringList(input, precision, ':');
+        if (precision.size() != 2) {
+            throw std::invalid_argument("Invalid precision " + input + ". Expected layer_name : precision_value");
+        }
+
+        precisions[precision[0]] = precision[1];
+    }
+
+    return precisions;
+}
+
+static InferenceEngine::Precision getPrecision(const std::string &value) {
+    static const std::unordered_map<std::string, InferenceEngine::Precision> supported_precisions = {
+         { "FP32", InferenceEngine::Precision::FP32 },
+         { "FP16", InferenceEngine::Precision::FP16 },
+         { "U8", InferenceEngine::Precision::U8 }
+    };
+
+    std::string upper_value = value;
+    std::transform(value.begin(), value.end(), upper_value.begin(), ::toupper);
+    auto precision = supported_precisions.find(upper_value);
+    if (precision == supported_precisions.end()) {
+        throw std::logic_error(value + " is not a valid precision");
+    }
+
+    return precision->second;
+}
+
+void setPrecisions(const InferenceEngine::CNNNetwork &network, const std::string &iop) {
+    auto precisions = parsePrecisions(iop);
+    auto inputs = network.getInputsInfo();
+    auto outputs = network.getOutputsInfo();
+
+    for (auto &&layer : precisions) {
+        auto name = layer.first;
+        auto precision = getPrecision(layer.second);
+
+        auto input_precision = inputs.find(name);
+        auto output_precision = outputs.find(name);
+
+        if (input_precision != inputs.end()) {
+            input_precision->second->setPrecision(precision);
+        } else if (output_precision != outputs.end()) {
+            output_precision->second->setPrecision(precision);
+        } else {
+            throw std::logic_error(name + " is neither an input nor an output");
+        }
+    }
+}
+
+static void processPrecisions(InferenceEngine::CNNNetwork &network,
+                              const std::string &inputs_precision, const std::string &outputs_precision,
+                              const std::string &iop) {
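+    // Start from the FP16 defaults for all layers; -ip/-op then override globally, and -iop overrides per layer.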
+    setPrecisions(network);
+
+    if (!inputs_precision.empty()) {
+        auto precision = getPrecision(inputs_precision);
+        for (auto &&layer : network.getInputsInfo()) {
+            layer.second->setPrecision(precision);
+        }
+    }
+
+    if (!outputs_precision.empty()) {
+        auto precision = getPrecision(outputs_precision);
+        for (auto &&layer : network.getOutputsInfo()) {
+            layer.second->setPrecision(precision);
+        }
+    }
+
+    if (!iop.empty()) {
+        setPrecisions(network, iop);
+    }
+}
+
+int main(int argc, char *argv[]) {
+    try {
+        std::cout << "Inference Engine: " << InferenceEngine::GetInferenceEngineVersion() << std::endl;
+
+        if (!parseCommandLine(&argc, &argv)) {
+            return EXIT_SUCCESS;
+        }
+
+        auto network = readNetwork(FLAGS_m);
+
+        processPrecisions(network, FLAGS_ip, FLAGS_op, FLAGS_iop);
+
+        auto plugin = loadPlugin("MYRIAD", FLAGS_pp);
+        InferenceEngine::ExecutableNetwork executableNetwork = plugin.LoadNetwork(network, configure(FLAGS_c, FLAGS_m));
+
+        std::string outputName = FLAGS_o;
+        if (outputName.empty()) {
+            outputName = fileNameNoExt(FLAGS_m) + ".blob";
+        }
+        executableNetwork.Export(outputName);
+    } catch (const std::exception &error) {
+        std::cerr << error.what() << std::endl;
+        return EXIT_FAILURE;
+    } catch (...) {
+        std::cerr << "Unknown/internal exception happened." << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
diff --git a/inference-engine/tools/vpu/vpu_profile/CMakeLists.txt b/inference-engine/tools/vpu/vpu_profile/CMakeLists.txt
new file mode 100644 (file)
index 0000000..d954974
--- /dev/null
@@ -0,0 +1,52 @@
+#
+# Copyright (C) 2018-2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set(TARGET_NAME vpu_profile)
+
+find_package(Threads REQUIRED)
+
+file(GLOB SOURCES
+    ${CMAKE_SOURCE_DIR}/tools/vpu/common/*.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+)
+
+add_executable(${TARGET_NAME} ${SOURCES})
+
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    target_compile_options(${TARGET_NAME} PRIVATE
+        "-Wall"
+    )
+endif()
+
+target_include_directories(${TARGET_NAME} SYSTEM PRIVATE
+    "${IE_MAIN_SOURCE_DIR}/include"
+    "${IE_MAIN_SOURCE_DIR}/src/inference_engine"
+    "${IE_MAIN_SOURCE_DIR}/samples/common"
+    "${IE_MAIN_SOURCE_DIR}/samples/common/format_reader"
+    "${CMAKE_SOURCE_DIR}/tools/vpu/common"
+)
+
+target_link_libraries(${TARGET_NAME} PRIVATE
+    inference_engine format_reader vpu_graph_transformer
+    ${CMAKE_DL_LIBS}
+    Threads::Threads
+    gflags
+)
+
+set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME})
+
+add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
+
diff --git a/inference-engine/tools/vpu/vpu_profile/README.md b/inference-engine/tools/vpu/vpu_profile/README.md
new file mode 100644 (file)
index 0000000..3de1f89
--- /dev/null
@@ -0,0 +1,43 @@
+# vpu_profile tool
+
+This topic demonstrates how to run the `vpu_profile` tool, which estimates performance by calculating the average execution time of each stage in a model.
+
+## How It Works
+
+Upon start-up, the tool reads the command-line parameters and loads a network and its inputs from the given directory into the Inference Engine plugin.
+The application then runs inference requests in asynchronous mode until the specified number of iterations is finished.
+After the inference stage, the tool computes the average time that each stage took.
+
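+The sketch below illustrates this asynchronous scheme in a condensed form (a minimal sketch, assuming the 2019 R1 Inference Engine C++ API; the model paths, device name, and iteration count are placeholders, and input blobs are left at their defaults):
+
+```cpp
+#include <atomic>
+#include <functional>
+#include <future>
+#include <iostream>
+
+#include "inference_engine.hpp"
+
+int main() {
+    // Read the IR (paths are placeholders).
+    InferenceEngine::CNNNetReader reader;
+    reader.ReadNetwork("model.xml");
+    reader.ReadWeights("model.bin");
+
+    // Load the network onto the target plugin.
+    auto plugin = InferenceEngine::PluginDispatcher().getPluginByDevice("MYRIAD");
+    auto executableNetwork = plugin.LoadNetwork(reader.getNetwork(), {});
+    auto request = executableNetwork.CreateInferRequest();
+
+    std::atomic<std::size_t> iteration{0};
+    std::promise<void> done;
+    const std::size_t iterations = 16;
+
+    using callback_t = std::function<void(InferenceEngine::InferRequest, InferenceEngine::StatusCode)>;
+
+    // The completion callback re-submits the request until the iteration
+    // budget is spent, then fetches the per-stage performance counters.
+    request.SetCompletionCallback<callback_t>(
+        [&](InferenceEngine::InferRequest r, InferenceEngine::StatusCode) {
+            if (++iteration >= iterations) {
+                auto performance = r.GetPerformanceCounts();
+                std::cout << "Profiled " << performance.size() << " stages" << std::endl;
+                done.set_value();
+                return;
+            }
+            r.StartAsync();
+        });
+
+    request.StartAsync();
+    done.get_future().wait();
+    return 0;
+}
+```
+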
+## Running
+
+Running the application with the <code>-h</code> option yields the following usage message:
+
+```sh
+Inference Engine:
+       API version ............ <version>
+       Build .................. <number>
+
+vpu_profile [OPTIONS]
+[OPTIONS]:
+       -help                           Optional. Print a usage message.
+       -model           <value>        Required. Path to xml model.
+       -inputs_dir      <value>        Optional. Path to folder with images. Default: ".".
+       -plugin_path     <value>        Optional. Path to a plugin folder.
+       -config          <value>        Optional. Path to the configuration file. Default value: "config".
+       -platform        <value>        Optional. Specifies movidius platform.
+       -iterations      <value>        Optional. Specifies number of iterations. Default value: 16.
+       -plugin          <value>        Optional. Specifies plugin. Supported values: myriad.
+                                       Default value: "myriad".
+```
+
+Running the application with an empty list of options yields an error.
+
+To run the sample, you can use public or pre-trained models. To download the pre-trained models, use the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or go to [https://download.01.org/opencv/](https://download.01.org/opencv/).
+
+> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/mo_dg/deep_learning_model_optimizer_devguide.md).
+
+You can use the following command to do inference on images from a folder using a trained Faster R-CNN network:
+
+```sh
+./vpu_profile -model <path_to_model>/faster_rcnn.xml -inputs_dir <path_to_inputs>
+```
diff --git a/inference-engine/tools/vpu/vpu_profile/main.cpp b/inference-engine/tools/vpu/vpu_profile/main.cpp
new file mode 100644 (file)
index 0000000..9d03234
--- /dev/null
@@ -0,0 +1,261 @@
+/*
+// Copyright (c) 2018-2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+#include <memory>
+#include <map>
+#include <cmath>
+#include <future>
+#include <atomic>
+#include <algorithm>
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#include <gflags/gflags.h>
+
+#include "inference_engine.hpp"
+#include "precision_utils.h"
+
+#include "vpu_tools_common.hpp"
+#include "vpu/vpu_plugin_config.hpp"
+#include "samples/common.hpp"
+
+static constexpr char help_message[]        = "Print a usage message.";
+static constexpr char model_message[]       = "Path to xml model.";
+static constexpr char inputs_dir_message[]  = "Path to folder with images. Default: \".\".";
+static constexpr char plugin_path_message[] = "Path to a plugin folder.";
+static constexpr char config_message[]      = "Path to the configuration file. Default value: \"config\".";
+static constexpr char platform_message[]    = "Specifies movidius platform.";
+static constexpr char iterations_message[]  = "Specifies number of iterations. Default value: 16.";
+static constexpr char plugin_message[]      = "Specifies plugin. Supported values: myriad.\n"
+    "\t            \t         \tDefault value: \"myriad\".";
+
+DEFINE_bool(h,                false, help_message);
+DEFINE_string(model,             "", model_message);
+DEFINE_string(inputs_dir,       ".", inputs_dir_message);
+DEFINE_string(plugin_path,       "", plugin_path_message);
+DEFINE_string(config,            "", config_message);
+DEFINE_string(platform,          "", platform_message);
+DEFINE_int32(iterations,         16, iterations_message);
+DEFINE_string(plugin,      "myriad", plugin_message);
+
+static void showUsage() {
+    std::cout << std::endl;
+    std::cout << "vpu_profile [OPTIONS]" << std::endl;
+    std::cout << "[OPTIONS]:" << std::endl;
+    std::cout << "\t-help       \t         \t"   << help_message        << std::endl;
+    std::cout << "\t-model      \t <value> \t"   << model_message       << std::endl;
+    std::cout << "\t-inputs_dir \t <value> \t"   << inputs_dir_message  << std::endl;
+    std::cout << "\t-plugin_path\t <value> \t"   << plugin_path_message << std::endl;
+    std::cout << "\t-config     \t <value> \t"   << config_message      << std::endl;
+    std::cout << "\t-platform   \t <value> \t"   << platform_message    << std::endl;
+    std::cout << "\t-iterations \t <value> \t"   << iterations_message  << std::endl;
+    std::cout << "\t-plugin     \t <value> \t"   << plugin_message      << std::endl;
+    std::cout << std::endl;
+}
+
+static bool parseCommandLine(int *argc, char ***argv) {
+    gflags::ParseCommandLineNonHelpFlags(argc, argv, true);
+
+    if (FLAGS_h) {
+        showUsage();
+        return false;
+    }
+
+    if (FLAGS_model.empty()) {
+        throw std::invalid_argument("Path to model xml file is required");
+    }
+
+    if (1 < *argc) {
+        std::stringstream message;
+        message << "Unknown arguments: ";
+        for (auto arg = 1; arg < *argc; arg++) {
+            message << (*argv)[arg];
+            if (arg + 1 < *argc) {
+                message << " ";
+            }
+        }
+        throw std::invalid_argument(message.str());
+    }
+
+    return true;
+}
+
+static std::map<std::string, std::string> configure(const std::string& confFileName) {
+    auto config = parseConfig(confFileName);
+
+    /* These defaults overwrite any values read from the config file; since the user can also set them there, this could be avoided */
+    config[VPU_CONFIG_KEY(LOG_LEVEL)] = CONFIG_VALUE(LOG_WARNING);
+    config[CONFIG_KEY(LOG_LEVEL)] = CONFIG_VALUE(LOG_WARNING);
+    config[VPU_CONFIG_KEY(PRINT_RECEIVE_TENSOR_TIME)] = CONFIG_VALUE(YES);
+
+    return config;
+}
+
+template<typename T>
+static bool isImage(const T& blob) {
+    auto descriptor = blob->getTensorDesc();
+    if (descriptor.getLayout() != InferenceEngine::NCHW) {
+        return false;
+    }
+
+    auto channels = descriptor.getDims()[1];
+    return channels == 3;
+}
+
+static void loadInputs(std::size_t requestIdx, const std::vector<std::string>& images,
+                       const std::vector<std::string>& binaries, InferenceEngine::InferRequest& request,
+                       InferenceEngine::CNNNetwork& network) {
+    for (auto &&input : network.getInputsInfo()) {
+        auto blob = request.GetBlob(input.first);
+
+        if (isImage(blob)) {
+            loadImage(images[requestIdx % images.size()], blob);
+        } else {
+            loadBinaryTensor(binaries[requestIdx % binaries.size()], blob);
+        }
+    }
+}
+
+static std::string process_user_input(const std::string &src) {
+    std::string name = src;
+    std::transform(name.begin(), name.end(), name.begin(), ::toupper);
+    name.erase(std::remove_if(name.begin(), name.end(), ::isspace), name.end());
+
+    return name;
+}
+
+static std::size_t getNumberRequests(const std::string &plugin) {
+    static const std::unordered_map<std::string, std::size_t> supported_plugins = {
+        { "MYRIAD", 4 },
+    };
+
+    auto num_requests = supported_plugins.find(plugin);
+    if (num_requests == supported_plugins.end()) {
+        throw std::invalid_argument("Unknown plugin " + plugin);
+    }
+
+    return num_requests->second;
+}
+
+int main(int argc, char* argv[]) {
+    try {
+        std::cout << "Inference Engine: " << InferenceEngine::GetInferenceEngineVersion() << std::endl;
+
+        if (!parseCommandLine(&argc, &argv)) {
+            return EXIT_SUCCESS;
+        }
+
+        auto network = readNetwork(FLAGS_model);
+        setPrecisions(network);
+
+        auto user_plugin = process_user_input(FLAGS_plugin);
+
+        auto plugin = loadPlugin(user_plugin, FLAGS_plugin_path);
+        InferenceEngine::ExecutableNetwork executableNetwork = plugin.LoadNetwork(network, configure(FLAGS_config));
+
+        auto num_requests = getNumberRequests(user_plugin);
+
+        auto images = extractFilesByExtension(FLAGS_inputs_dir, "bmp", 1);
+        auto hasImageInput = [](const InferenceEngine::CNNNetwork &network) {
+            auto inputs = network.getInputsInfo();
+            auto isImageInput = [](const InferenceEngine::InputsDataMap::value_type &input) {
+                return isImage(input.second);
+            };
+            return std::any_of(inputs.begin(), inputs.end(), isImageInput);
+        };
+
+        if (hasImageInput(network) && images.empty()) {
+            throw std::invalid_argument(FLAGS_inputs_dir + " does not contain images for network");
+        }
+
+        auto binaries = extractFilesByExtension(FLAGS_inputs_dir, "bin", 1);
+        auto hasBinaryInput = [](const InferenceEngine::CNNNetwork &network) {
+            auto inputs = network.getInputsInfo();
+            auto isBinaryInput = [](const InferenceEngine::InputsDataMap::value_type &input) {
+                return !isImage(input.second);
+            };
+            return std::any_of(inputs.begin(), inputs.end(), isBinaryInput);
+        };
+
+        if (hasBinaryInput(network) && binaries.empty()) {
+            throw std::invalid_argument(FLAGS_inputs_dir + " does not contain binaries for network");
+        }
+
+        std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> performance;
+
+        std::atomic<std::size_t> iteration{0};
+        std::promise<void> done;
+        bool needStartAsync{true};
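+        // Take the performance counters from an iteration late enough for the pipeline to be warmed up.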
+        std::size_t profiledIteration = 2 * num_requests + FLAGS_iterations;
+
+        std::vector<InferenceEngine::InferRequest> requests(num_requests);
+        std::vector<std::size_t> current_iterations(num_requests);
+
+        using callback_t = std::function<void(InferenceEngine::InferRequest, InferenceEngine::StatusCode)>;
+
+        for (std::size_t request = 0; request < num_requests; ++request) {
+            requests[request] = executableNetwork.CreateInferRequest();
+            current_iterations[request] = 0;
+
+            loadInputs(request, images, binaries, requests[request], network);
+
+            callback_t callback =
+                [request, profiledIteration, &done, &needStartAsync, &performance, &iteration, &current_iterations]
+                (InferenceEngine::InferRequest inferRequest, InferenceEngine::StatusCode code) {
+                if (code != InferenceEngine::StatusCode::OK) {
+                    THROW_IE_EXCEPTION << "Infer request failed with code " << code;
+                }
+
+                auto current_iteration = current_iterations[request];
+                if (current_iteration == profiledIteration) {
+                    performance = inferRequest.GetPerformanceCounts();
+                    needStartAsync = false;
+                    done.set_value();
+                }
+
+                if (needStartAsync) {
+                    current_iterations[request] = iteration++;
+                    inferRequest.StartAsync();
+                }
+            };
+
+            requests[request].SetCompletionCallback<callback_t>(callback);
+        }
+
+        auto doneFuture = done.get_future();
+
+        for (std::size_t request = 0; request < num_requests; ++request) {
+            current_iterations[request] = iteration++;
+            requests[request].StartAsync();
+        }
+
+        doneFuture.wait();
+        printPerformanceCounts(performance);
+    } catch (const std::exception &error) {
+        std::cerr << error.what() << std::endl;
+        return EXIT_FAILURE;
+    } catch (...) {
+        std::cerr << "Unknown/internal exception happened." << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
index 0bfdc65..736c351 100644 (file)
@@ -64,10 +64,7 @@ class MulQuantizeFuse(MiddleReplacementPattern):
 
         # Provisional limitation that related to binary quantization
         # TODO: Relax it beyond binarization case
-        # Provisional limitation that related to binary quantization
-        # TODO: Relax it beyond binarization case
-        if len(quantize.in_node(1).out_nodes()) != 1 or \
-                len(quantize.in_node(2).out_nodes()) != 1 or \
+        if len(quantize.in_node(1).out_nodes()) != 1 or len(quantize.in_node(2).out_nodes()) != 1 or \
                 len(quantize.in_node(3).out_nodes()) != 1 or len(quantize.in_node(4).out_nodes()) != 1 or \
                 quantize.levels != 2:
             log.debug('MulQuantizeFuse: cannot fuse because Quantize op has '
@@ -76,14 +73,38 @@ class MulQuantizeFuse(MiddleReplacementPattern):
 
         tensor_port, value_port = get_tensor_in_port(preop), get_value_in_port(preop)
 
+        # TODO: need some special processing for values that exactly equal to threshold
+
+        mul_val = value_port.data.get_value()
 
         # Need to flip output_low and output_high for those elements that have multiplier < 0
-        # TODO: need some special processing for values that exactly equal to threshold
-        if np.all(value_port.data.get_value() <= 0):
-            log.debug('MulQuantizeFuse: cannot fuse because Mul op has non-positive multipliers.')
+        if np.all(mul_val < 0):
+            mi_o_node = quantize.in_port(3).get_source()
+            ma_o_node = quantize.in_port(4).get_source()
+
+            quantize.in_port(3).disconnect()
+            quantize.in_port(4).disconnect()
+
+            mi_o_node.connect(quantize.in_port(4))
+            ma_o_node.connect(quantize.in_port(3))
+
+        elif np.any(mul_val < 0):
+            # Successful flipping will be done on broadcasted arrays
+
+            mi_o_val = quantize.in_port(3).data.get_value()
+            ma_o_val = quantize.in_port(4).data.get_value()
+            mul_val, mi_o_val, ma_o_val = [np.array(a) for a in np.broadcast_arrays(mul_val, mi_o_val, ma_o_val)]
+
+            neg_idx = np.where(mul_val < 0)
+            mi_o_val[neg_idx], ma_o_val[neg_idx] = ma_o_val[neg_idx], mi_o_val[neg_idx]
+
+            # TODO: revert broadcasting where unnecessary
+            quantize.in_port(3).data.set_value(mi_o_val)
+            quantize.in_port(4).data.set_value(ma_o_val)
 
-        quantize.in_port(1).data.set_value(quantize.in_port(1).data.get_value() / value_port.data.get_value())
-        quantize.in_port(2).data.set_value(quantize.in_port(2).data.get_value() / value_port.data.get_value())
+        quantize.in_port(1).data.set_value(quantize.in_port(1).data.get_value() / mul_val)
+        if quantize.in_node(1).id != quantize.in_node(2).id:
+            quantize.in_port(2).data.set_value(quantize.in_port(2).data.get_value() / mul_val)
 
         # Remove Mul as it no longer needed
         quantize.in_port(0).disconnect()
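
The rewrite above relies on the identity behind the fusion: for an element-wise
multiplier a feeding a binary Quantize with input thresholds (il, ih) and output
values (ol, oh),

    Quantize(a * x; il, ih, ol, oh) = Quantize(x; il / a, ih / a, ol, oh)   for a > 0
    Quantize(a * x; il, ih, ol, oh) = Quantize(x; il / a, ih / a, oh, ol)   for a < 0

Dividing by a negative multiplier reverses the comparisons against il and ih,
which is why the code swaps output_low and output_high (ports 3 and 4) for the
negative entries before dividing the input thresholds by mul_val.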
diff --git a/model-optimizer/extensions/middle/MulQuantizeFuse_test.py b/model-optimizer/extensions/middle/MulQuantizeFuse_test.py
new file mode 100644 (file)
index 0000000..e2077c9
--- /dev/null
@@ -0,0 +1,158 @@
+"""
+ Copyright (c) 2018-2019 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+from extensions.middle.MulQuantizeFuse import MulQuantizeFuse
+from mo.middle.passes.eliminate_test import build_graph
+from mo.middle.passes.fusing.fuse_linear_ops_test import compare_graphs
+
+# The dictionary with nodes attributes used to build various graphs. A key is the name of the node and the value is the
+# dictionary with node attributes.
+nodes = {
+    'x': {'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'},
+    'x_data': {'value': None, 'shape': np.array([1, 64, 56, 56]), 'kind': 'data'},
+
+    'mul_const': {'op': 'Const', 'type': 'Const', 'kind': 'op'},
+    'mul_const_data': {'value': np.array([]), 'shape': np.array([]), 'kind': 'data'},
+
+    'mul': {'op': 'Mul', 'kind': 'op'},
+    'mul_data': {'value': np.array([]), 'shape': np.array([]), 'kind': 'data'},
+
+    'mi_i': {'op': 'Const', 'type': 'Const', 'kind': 'op'},
+    'mi_i_data': {'value': np.array([-10]), 'shape': np.array([]), 'kind': 'data'},
+
+    'ma_i': {'op': 'Const', 'type': 'Const', 'kind': 'op'},
+    'ma_i_data': {'value': np.array([10]), 'shape': np.array([]), 'kind': 'data'},
+
+    'mi_o': {'op': 'Const', 'type': 'Const', 'kind': 'op'},
+    'mi_o_data': {'value': np.array([]), 'shape': np.array([]), 'kind': 'data'},
+
+    'ma_o': {'op': 'Const', 'type': 'Const', 'kind': 'op'},
+    'ma_o_data': {'value': np.array([]), 'shape': np.array([]), 'kind': 'data'},
+
+    'quantize': {'type': 'Quantize', 'value': None, 'kind': 'op', 'op': 'Quantize', 'levels': 2},
+    'quantize_data': {'value': None, 'shape': np.array([1, 64, 56, 56]), 'kind': 'data'},
+
+    'output': {'op': 'OpOutput', 'kind': 'op'},
+}
+
+edges = [
+    ('x', 'x_data'),
+    ('mul_const', 'mul_const_data'),
+    ('mul', 'mul_data'),
+    ('mi_i', 'mi_i_data'),
+    ('ma_i', 'ma_i_data'),
+    ('mi_o', 'mi_o_data'),
+    ('ma_o', 'ma_o_data'),
+    ('quantize', 'quantize_data'),
+    ('quantize_data', 'output'),
+
+    ('x_data', 'mul', {'in': 0}),
+    ('mul_const_data', 'mul', {'in': 1}),
+
+    ('mul_data', 'quantize', {'in': 0}),
+    ('mi_i_data', 'quantize', {'in': 1}),
+    ('ma_i_data', 'quantize', {'in': 2}),
+    ('mi_o_data', 'quantize', {'in': 3}),
+    ('ma_o_data', 'quantize', {'in': 4}),
+]
+
+edges_ref = [
+    ('x', 'x_data'),
+    ('mi_i', 'mi_i_data'),
+    ('ma_i', 'ma_i_data'),
+    ('mi_o', 'mi_o_data'),
+    ('ma_o', 'ma_o_data'),
+    ('quantize', 'quantize_data'),
+    ('quantize_data', 'output'),
+
+    ('x_data', 'quantize', {'in': 0}),
+    ('mi_i_data', 'quantize', {'in': 1}),
+    ('ma_i_data', 'quantize', {'in': 2}),
+    ('mi_o_data', 'quantize', {'in': 3}),
+    ('ma_o_data', 'quantize', {'in': 4}),
+]
+
+
+class MulQuantizeFuseTest(unittest.TestCase):
+    def test_1(self):
+        graph = build_graph(nodes, edges, {
+            'mul_const_data': {'shape': np.array([3, 1, 1]), 'value': np.broadcast_to(np.array([1]), (3, 1, 1))},
+            'quantize_data': {'shape': np.array([2, 3, 4, 4])},
+            'mi_o_data': {'shape': np.array([1, 1, 1, 1]), 'value': np.broadcast_to(np.array([0]), (1, 1, 1, 1))},
+            'ma_o_data': {'shape': np.array([1, 3, 1, 1]), 'value': np.broadcast_to(np.array([1]), (1, 3, 1, 1))},
+        }, nodes_with_edges_only=True)
+        graph.stage = 'middle'
+        graph_ref = build_graph(nodes, edges_ref, {
+            'quantize_data': {'shape': np.array([2, 3, 4, 4])},
+            'mi_o_data': {'shape': np.array([1, 1, 1, 1]), 'value': np.broadcast_to(np.array([0]), (1, 1, 1, 1))},
+            'ma_o_data': {'shape': np.array([1, 3, 1, 1]), 'value': np.broadcast_to(np.array([1]), (1, 3, 1, 1))},
+            'mi_i_data': {'shape': np.array([3, 1, 1]), 'value': np.broadcast_to(np.array([-10]), (3, 1, 1))},
+            'ma_i_data': {'shape': np.array([3, 1, 1]), 'value': np.broadcast_to(np.array([10]), (3, 1, 1))},
+        }, nodes_with_edges_only=True)
+
+        MulQuantizeFuse().find_and_replace_pattern(graph)
+
+        (flag, resp) = compare_graphs(graph, graph_ref, 'output', check_op_attrs=True)
+
+        self.assertTrue(flag, resp)
+
+    def test_2(self):
+        graph = build_graph(nodes, edges, {
+            'mul_const_data': {'shape': np.array([1]), 'value': np.array([-1])},
+            'quantize_data': {'shape': np.array([2, 3, 4, 4])},
+            'mi_o_data': {'shape': np.array([1]), 'value': np.array([0])},
+            'ma_o_data': {'shape': np.array([1]), 'value': np.array([1])},
+        }, nodes_with_edges_only=True)
+        graph.stage = 'middle'
+        graph_ref = build_graph(nodes, edges_ref, {
+            'quantize_data': {'shape': np.array([2, 3, 4, 4])},
+            'mi_o_data': {'shape': np.array([1]), 'value': np.array([1])},
+            'ma_o_data': {'shape': np.array([1]), 'value': np.array([0])},
+            'mi_i_data': {'shape': np.array([1]), 'value': np.array([10])},
+            'ma_i_data': {'shape': np.array([1]), 'value': np.array([-10])},
+        }, nodes_with_edges_only=True)
+
+        MulQuantizeFuse().find_and_replace_pattern(graph)
+
+        (flag, resp) = compare_graphs(graph, graph_ref, 'output', check_op_attrs=True)
+
+        self.assertTrue(flag, resp)
+
+    def test_3(self):
+        graph = build_graph(nodes, edges, {
+            'mul_const_data': {'shape': np.array([3, 1, 1]), 'value': np.array([[[-1]], [[1]], [[-1]]])},
+            'quantize_data': {'shape': np.array([2, 3, 4, 4])},
+            'mi_o_data': {'shape': np.array([1, 1, 1, 1]), 'value': np.broadcast_to(np.array([0]), (1, 1, 1, 1))},
+            'ma_o_data': {'shape': np.array([1, 1, 1, 1]), 'value': np.broadcast_to(np.array([1]), (1, 1, 1, 1))},
+        }, nodes_with_edges_only=True)
+        graph.stage = 'middle'
+        graph_ref = build_graph(nodes, edges_ref, {
+            'quantize_data': {'shape': np.array([2, 3, 4, 4])},
+            'mi_o_data': {'shape': np.array([1, 3, 1, 1]), 'value': np.array([[[1]], [[0]], [[1]]])},
+            'ma_o_data': {'shape': np.array([1, 3, 1, 1]), 'value': np.array([[[0]], [[1]], [[0]]])},
+            'mi_i_data': {'shape': np.array([1, 3, 1, 1]), 'value': np.array([[[10]], [[-10]], [[10]]])},
+            'ma_i_data': {'shape': np.array([1, 3, 1, 1]), 'value': np.array([[[-10]], [[10]], [[-10]]])},
+        }, nodes_with_edges_only=True)
+
+        MulQuantizeFuse().find_and_replace_pattern(graph)
+
+        (flag, resp) = compare_graphs(graph, graph_ref, 'output', check_op_attrs=True)
+
+        self.assertTrue(flag, resp)